diff options
Diffstat (limited to 'kernel')
140 files changed, 10942 insertions, 4152 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 54e581072617..15632358bcf7 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -94,30 +94,6 @@ config KEXEC_JUMP Jump between original kernel and kexeced kernel and invoke code in physical address mode via KEXEC -config KEXEC_HANDOVER - bool "kexec handover" - depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE - depends on !DEFERRED_STRUCT_PAGE_INIT - select MEMBLOCK_KHO_SCRATCH - select KEXEC_FILE - select DEBUG_FS - select LIBFDT - select CMA - help - Allow kexec to hand over state across kernels by generating and - passing additional metadata to the target kernel. This is useful - to keep data or state alive across the kexec. For this to work, - both source and target kernels need to have this option enabled. - -config KEXEC_HANDOVER_DEBUG - bool "Enable Kexec Handover debug checks" - depends on KEXEC_HANDOVER - help - This option enables extra sanity checks for the Kexec Handover - subsystem. Since, KHO performance is crucial in live update - scenarios and the extra code might be adding overhead it is - only optionally enabled. 
- config CRASH_DUMP bool "kernel crash dumps" default ARCH_DEFAULT_CRASH_DUMP diff --git a/kernel/Makefile b/kernel/Makefile index 9fe722305c9b..e83669841b8c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -52,6 +52,7 @@ obj-y += printk/ obj-y += irq/ obj-y += rcu/ obj-y += livepatch/ +obj-y += liveupdate/ obj-y += dma/ obj-y += entry/ obj-y += unwind/ @@ -82,8 +83,6 @@ obj-$(CONFIG_CRASH_DUMP_KUNIT_TEST) += crash_core_test.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o -obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o -obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup/ diff --git a/kernel/bounds.c b/kernel/bounds.c index 29b2cd00df2c..02b619eb6106 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -6,6 +6,7 @@ */ #define __GENERATING_BOUNDS_H +#define COMPILE_OFFSETS /* Include headers that define the enum constants of interest */ #include <linux/page-flags.h> #include <linux/mmzone.h> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 7fd0badfacb1..232cbc97434d 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -9,7 +9,7 @@ CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_insn_array.o obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o diff --git a/kernel/bpf/arena.c 
b/kernel/bpf/arena.c index 1074ac4459f2..872dc0e41c65 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -334,7 +334,7 @@ static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long ad return -EINVAL; } - ret = mm_get_unmapped_area(current->mm, filp, addr, len * 2, 0, flags); + ret = mm_get_unmapped_area(filp, addr, len * 2, 0, flags); if (IS_ERR_VALUE(ret)) return ret; if ((ret >> 32) == ((ret + len - 1) >> 32)) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 80b1765a3159..1eeb31c5b317 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -335,18 +335,17 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) } /* Called from syscall */ -static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +int bpf_array_get_next_key(struct bpf_map *map, void *key, void *next_key) { - struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = key ? *(u32 *)key : U32_MAX; u32 *next = (u32 *)next_key; - if (index >= array->map.max_entries) { + if (index >= map->max_entries) { *next = 0; return 0; } - if (index == array->map.max_entries - 1) + if (index == map->max_entries - 1) return -ENOENT; *next = index + 1; @@ -448,19 +447,12 @@ static void array_map_free_internal_structs(struct bpf_map *map) struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - /* We don't reset or free fields other than timer and workqueue - * on uref dropping to zero. 
- */ - if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { - for (i = 0; i < array->map.max_entries; i++) { - if (btf_record_has_field(map->record, BPF_TIMER)) - bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); - if (btf_record_has_field(map->record, BPF_WORKQUEUE)) - bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i)); - if (btf_record_has_field(map->record, BPF_TASK_WORK)) - bpf_obj_free_task_work(map->record, array_map_elem_ptr(array, i)); - } - } + /* We only free internal structs on uref dropping to zero */ + if (!bpf_map_has_internal_structs(map)) + return; + + for (i = 0; i < array->map.max_entries; i++) + bpf_map_free_internal_structs(map, array_map_elem_ptr(array, i)); } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ @@ -796,7 +788,7 @@ const struct bpf_map_ops array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_release_uref = array_map_free_internal_structs, .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, @@ -822,7 +814,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = percpu_array_map_lookup_elem, .map_gen_lookup = percpu_array_map_gen_lookup, .map_update_elem = array_map_update_elem, @@ -1211,7 +1203,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_poke_track = prog_array_map_poke_track, .map_poke_untrack = prog_array_map_poke_untrack, .map_poke_run = prog_array_map_poke_run, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = 
fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, @@ -1315,7 +1307,7 @@ const struct bpf_map_ops perf_event_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = perf_event_fd_array_map_free, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = perf_event_fd_array_get_ptr, @@ -1351,7 +1343,7 @@ const struct bpf_map_ops cgroup_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = cgroup_fd_array_free, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = cgroup_fd_array_get_ptr, @@ -1436,7 +1428,7 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_of_map_alloc, .map_free = array_of_map_free, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = array_of_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = bpf_map_fd_get_ptr, diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c new file mode 100644 index 000000000000..c96630cb75bf --- /dev/null +++ b/kernel/bpf/bpf_insn_array.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Isovalent */ + +#include <linux/bpf.h> + +struct bpf_insn_array { + struct bpf_map map; + atomic_t used; + long *ips; + DECLARE_FLEX_ARRAY(struct bpf_insn_array_value, values); +}; + +#define cast_insn_array(MAP_PTR) \ + container_of((MAP_PTR), struct bpf_insn_array, map) + +#define INSN_DELETED ((u32)-1) + +static inline u64 insn_array_alloc_size(u32 max_entries) +{ + const u64 base_size = sizeof(struct bpf_insn_array); + const u64 
entry_size = sizeof(struct bpf_insn_array_value); + + return base_size + max_entries * (entry_size + sizeof(long)); +} + +static int insn_array_alloc_check(union bpf_attr *attr) +{ + u32 value_size = sizeof(struct bpf_insn_array_value); + + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != value_size || attr->map_flags != 0) + return -EINVAL; + + return 0; +} + +static void insn_array_free(struct bpf_map *map) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + + bpf_map_area_free(insn_array); +} + +static struct bpf_map *insn_array_alloc(union bpf_attr *attr) +{ + u64 size = insn_array_alloc_size(attr->max_entries); + struct bpf_insn_array *insn_array; + + insn_array = bpf_map_area_alloc(size, NUMA_NO_NODE); + if (!insn_array) + return ERR_PTR(-ENOMEM); + + /* ips are allocated right after the insn_array->values[] array */ + insn_array->ips = (void *)&insn_array->values[attr->max_entries]; + + bpf_map_init_from_attr(&insn_array->map, attr); + + /* BPF programs aren't allowed to write to the map */ + insn_array->map.map_flags |= BPF_F_RDONLY_PROG; + + return &insn_array->map; +} + +static void *insn_array_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + u32 index = *(u32 *)key; + + if (unlikely(index >= insn_array->map.max_entries)) + return NULL; + + return &insn_array->values[index]; +} + +static long insn_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + u32 index = *(u32 *)key; + struct bpf_insn_array_value val = {}; + + if (unlikely(index >= insn_array->map.max_entries)) + return -E2BIG; + + if (unlikely(map_flags & BPF_NOEXIST)) + return -EEXIST; + + copy_map_value(map, &val, value); + if (val.jitted_off || val.xlated_off) + return -EINVAL; + + insn_array->values[index].orig_off = val.orig_off; + + return 0; +} + +static long insn_array_delete_elem(struct bpf_map *map, 
void *key) +{ + return -EINVAL; +} + +static int insn_array_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + if (!btf_type_is_i32(key_type)) + return -EINVAL; + + if (!btf_type_is_i64(value_type)) + return -EINVAL; + + return 0; +} + +static u64 insn_array_mem_usage(const struct bpf_map *map) +{ + return insn_array_alloc_size(map->max_entries); +} + +static int insn_array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + + if ((off % sizeof(long)) != 0 || + (off / sizeof(long)) >= map->max_entries) + return -EINVAL; + + /* from BPF's point of view, this map is a jump table */ + *imm = (unsigned long)insn_array->ips + off; + + return 0; +} + +BTF_ID_LIST_SINGLE(insn_array_btf_ids, struct, bpf_insn_array) + +const struct bpf_map_ops insn_array_map_ops = { + .map_alloc_check = insn_array_alloc_check, + .map_alloc = insn_array_alloc, + .map_free = insn_array_free, + .map_get_next_key = bpf_array_get_next_key, + .map_lookup_elem = insn_array_lookup_elem, + .map_update_elem = insn_array_update_elem, + .map_delete_elem = insn_array_delete_elem, + .map_check_btf = insn_array_check_btf, + .map_mem_usage = insn_array_mem_usage, + .map_direct_value_addr = insn_array_map_direct_value_addr, + .map_btf_id = &insn_array_btf_ids[0], +}; + +static inline bool is_frozen(struct bpf_map *map) +{ + guard(mutex)(&map->freeze_mutex); + + return map->frozen; +} + +static bool is_insn_array(const struct bpf_map *map) +{ + return map->map_type == BPF_MAP_TYPE_INSN_ARRAY; +} + +static inline bool valid_offsets(const struct bpf_insn_array *insn_array, + const struct bpf_prog *prog) +{ + u32 off; + int i; + + for (i = 0; i < insn_array->map.max_entries; i++) { + off = insn_array->values[i].orig_off; + + if (off >= prog->len) + return false; + + if (off > 0) { + if (prog->insnsi[off-1].code == (BPF_LD | BPF_DW | BPF_IMM)) + 
return false; + } + } + + return true; +} + +int bpf_insn_array_init(struct bpf_map *map, const struct bpf_prog *prog) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + struct bpf_insn_array_value *values = insn_array->values; + int i; + + if (!is_frozen(map)) + return -EINVAL; + + if (!valid_offsets(insn_array, prog)) + return -EINVAL; + + /* + * There can be only one program using the map + */ + if (atomic_xchg(&insn_array->used, 1)) + return -EBUSY; + + /* + * Reset all the map indexes to the original values. This is needed, + * e.g., when a replay of verification with different log level should + * be performed. + */ + for (i = 0; i < map->max_entries; i++) + values[i].xlated_off = values[i].orig_off; + + return 0; +} + +int bpf_insn_array_ready(struct bpf_map *map) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + int i; + + for (i = 0; i < map->max_entries; i++) { + if (insn_array->values[i].xlated_off == INSN_DELETED) + continue; + if (!insn_array->ips[i]) + return -EFAULT; + } + + return 0; +} + +void bpf_insn_array_release(struct bpf_map *map) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + + atomic_set(&insn_array->used, 0); +} + +void bpf_insn_array_adjust(struct bpf_map *map, u32 off, u32 len) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + int i; + + if (len <= 1) + return; + + for (i = 0; i < map->max_entries; i++) { + if (insn_array->values[i].xlated_off <= off) + continue; + if (insn_array->values[i].xlated_off == INSN_DELETED) + continue; + insn_array->values[i].xlated_off += len - 1; + } +} + +void bpf_insn_array_adjust_after_remove(struct bpf_map *map, u32 off, u32 len) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + int i; + + for (i = 0; i < map->max_entries; i++) { + if (insn_array->values[i].xlated_off < off) + continue; + if (insn_array->values[i].xlated_off == INSN_DELETED) + continue; + if (insn_array->values[i].xlated_off < off + len) + 
insn_array->values[i].xlated_off = INSN_DELETED; + else + insn_array->values[i].xlated_off -= len; + } +} + +/* + * This function is called by JITs. The image is the real program + * image, the offsets array set up the xlated -> jitted mapping. + * The offsets[xlated] offset should point to the beginning of + * the jitted instruction. + */ +void bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image) +{ + struct bpf_insn_array *insn_array; + struct bpf_map *map; + u32 xlated_off; + int i, j; + + if (!offsets || !image) + return; + + for (i = 0; i < prog->aux->used_map_cnt; i++) { + map = prog->aux->used_maps[i]; + if (!is_insn_array(map)) + continue; + + insn_array = cast_insn_array(map); + for (j = 0; j < map->max_entries; j++) { + xlated_off = insn_array->values[j].xlated_off; + if (xlated_off == INSN_DELETED) + continue; + if (xlated_off < prog->aux->subprog_start) + continue; + xlated_off -= prog->aux->subprog_start; + if (xlated_off >= prog->len) + continue; + + insn_array->values[j].jitted_off = offsets[xlated_off]; + insn_array->ips[j] = (long)(image + offsets[xlated_off]); + } + } +} diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index b931fbceb54d..e2fe6c32822b 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -73,30 +73,24 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, - void *value, bool charge_mem, bool swap_uptrs, gfp_t gfp_flags) + void *value, bool swap_uptrs, gfp_t gfp_flags) { struct bpf_local_storage_elem *selem; - if (charge_mem && mem_charge(smap, owner, smap->elem_size)) + if (mem_charge(smap, owner, smap->elem_size)) return NULL; - if (smap->bpf_ma) { - selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags); - if (selem) - /* Keep the original bpf_map_kzalloc behavior - * before started using the bpf_mem_cache_alloc. 
- * - * No need to use zero_map_value. The bpf_selem_free() - * only does bpf_mem_cache_free when there is - * no other bpf prog is using the selem. - */ - memset(SDATA(selem)->data, 0, smap->map.value_size); + if (smap->use_kmalloc_nolock) { + selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size, + __GFP_ZERO, NUMA_NO_NODE); } else { selem = bpf_map_kzalloc(&smap->map, smap->elem_size, gfp_flags | __GFP_NOWARN); } if (selem) { + RCU_INIT_POINTER(SDATA(selem)->smap, smap); + if (value) { /* No need to call check_and_init_map_value as memory is zero init */ copy_map_value(&smap->map, SDATA(selem)->data, value); @@ -106,13 +100,12 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, return selem; } - if (charge_mem) - mem_uncharge(smap, owner, smap->elem_size); + mem_uncharge(smap, owner, smap->elem_size); return NULL; } -/* rcu tasks trace callback for bpf_ma == false */ +/* rcu tasks trace callback for use_kmalloc_nolock == false */ static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) { struct bpf_local_storage *local_storage; @@ -127,12 +120,23 @@ static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) kfree_rcu(local_storage, rcu); } +/* Handle use_kmalloc_nolock == false */ +static void __bpf_local_storage_free(struct bpf_local_storage *local_storage, + bool vanilla_rcu) +{ + if (vanilla_rcu) + kfree_rcu(local_storage, rcu); + else + call_rcu_tasks_trace(&local_storage->rcu, + __bpf_local_storage_free_trace_rcu); +} + static void bpf_local_storage_free_rcu(struct rcu_head *rcu) { struct bpf_local_storage *local_storage; local_storage = container_of(rcu, struct bpf_local_storage, rcu); - bpf_mem_cache_raw_free(local_storage); + kfree_nolock(local_storage); } static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) @@ -143,46 +147,27 @@ static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) call_rcu(rcu, bpf_local_storage_free_rcu); } -/* Handle bpf_ma == false */ -static void 
__bpf_local_storage_free(struct bpf_local_storage *local_storage, - bool vanilla_rcu) -{ - if (vanilla_rcu) - kfree_rcu(local_storage, rcu); - else - call_rcu_tasks_trace(&local_storage->rcu, - __bpf_local_storage_free_trace_rcu); -} - static void bpf_local_storage_free(struct bpf_local_storage *local_storage, - struct bpf_local_storage_map *smap, - bool bpf_ma, bool reuse_now) + bool reuse_now) { if (!local_storage) return; - if (!bpf_ma) { + if (!local_storage->use_kmalloc_nolock) { __bpf_local_storage_free(local_storage, reuse_now); return; } - if (!reuse_now) { - call_rcu_tasks_trace(&local_storage->rcu, - bpf_local_storage_free_trace_rcu); + if (reuse_now) { + call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu); return; } - if (smap) - bpf_mem_cache_free(&smap->storage_ma, local_storage); - else - /* smap could be NULL if the selem that triggered - * this 'local_storage' creation had been long gone. - * In this case, directly do call_rcu(). - */ - call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu); + call_rcu_tasks_trace(&local_storage->rcu, + bpf_local_storage_free_trace_rcu); } -/* rcu tasks trace callback for bpf_ma == false */ +/* rcu tasks trace callback for use_kmalloc_nolock == false */ static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu) { struct bpf_local_storage_elem *selem; @@ -194,7 +179,7 @@ static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu) kfree_rcu(selem, rcu); } -/* Handle bpf_ma == false */ +/* Handle use_kmalloc_nolock == false */ static void __bpf_selem_free(struct bpf_local_storage_elem *selem, bool vanilla_rcu) { @@ -216,7 +201,7 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) migrate_disable(); bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); migrate_enable(); - bpf_mem_cache_raw_free(selem); + kfree_nolock(selem); } static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) @@ -228,14 +213,17 @@ static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) } void bpf_selem_free(struct 
bpf_local_storage_elem *selem, - struct bpf_local_storage_map *smap, bool reuse_now) { - if (!smap->bpf_ma) { - /* Only task storage has uptrs and task storage - * has moved to bpf_mem_alloc. Meaning smap->bpf_ma == true - * for task storage, so this bpf_obj_free_fields() won't unpin - * any uptr. + struct bpf_local_storage_map *smap; + + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); + + if (!smap->use_kmalloc_nolock) { + /* + * No uptr will be unpin even when reuse_now == false since uptr + * is only supported in task local storage, where + * smap->use_kmalloc_nolock == true. */ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); __bpf_selem_free(selem, reuse_now); @@ -243,18 +231,11 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, } if (reuse_now) { - /* reuse_now == true only happens when the storage owner - * (e.g. task_struct) is being destructed or the map itself - * is being destructed (ie map_free). In both cases, - * no bpf prog can have a hold on the selem. It is - * safe to unpin the uptrs and free the selem now. - */ - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); - /* Instead of using the vanilla call_rcu(), - * bpf_mem_cache_free will be able to reuse selem - * immediately. + /* + * While it is okay to call bpf_obj_free_fields() that unpins uptr when + * reuse_now == true, keep it in bpf_selem_free_rcu() for simplicity. */ - bpf_mem_cache_free(&smap->selem_ma, selem); + call_rcu(&selem->rcu, bpf_selem_free_rcu); return; } @@ -264,7 +245,6 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) { struct bpf_local_storage_elem *selem; - struct bpf_local_storage_map *smap; struct hlist_node *n; /* The "_safe" iteration is needed. @@ -272,10 +252,8 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) * but bpf_selem_free will use the selem->rcu_head * which is union-ized with the selem->free_node. 
*/ - hlist_for_each_entry_safe(selem, n, list, free_node) { - smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - bpf_selem_free(selem, smap, reuse_now); - } + hlist_for_each_entry_safe(selem, n, list, free_node) + bpf_selem_free(selem, reuse_now); } /* local_storage->lock must be held and selem->local_storage == local_storage. @@ -284,7 +262,7 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) */ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem, - bool uncharge_mem, struct hlist_head *free_selem_list) + struct hlist_head *free_selem_list) { struct bpf_local_storage_map *smap; bool free_local_storage; @@ -297,8 +275,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor * The owner may be freed once the last selem is unlinked * from local_storage. */ - if (uncharge_mem) - mem_uncharge(smap, owner, smap->elem_size); + mem_uncharge(smap, owner, smap->elem_size); free_local_storage = hlist_is_singular_node(&selem->snode, &local_storage->list); @@ -336,47 +313,11 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor return free_local_storage; } -static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage, - struct bpf_local_storage_map *storage_smap, - struct bpf_local_storage_elem *selem) -{ - - struct bpf_local_storage_map *selem_smap; - - /* local_storage->smap may be NULL. If it is, get the bpf_ma - * from any selem in the local_storage->list. The bpf_ma of all - * local_storage and selem should have the same value - * for the same map type. - * - * If the local_storage->list is already empty, the caller will not - * care about the bpf_ma value also because the caller is not - * responsible to free the local_storage. 
- */ - - if (storage_smap) - return storage_smap->bpf_ma; - - if (!selem) { - struct hlist_node *n; - - n = rcu_dereference_check(hlist_first_rcu(&local_storage->list), - bpf_rcu_lock_held()); - if (!n) - return false; - - selem = hlist_entry(n, struct bpf_local_storage_elem, snode); - } - selem_smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - - return selem_smap->bpf_ma; -} - static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, bool reuse_now) { - struct bpf_local_storage_map *storage_smap; struct bpf_local_storage *local_storage; - bool bpf_ma, free_local_storage = false; + bool free_local_storage = false; HLIST_HEAD(selem_free_list); unsigned long flags; @@ -386,20 +327,17 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); - storage_smap = rcu_dereference_check(local_storage->smap, - bpf_rcu_lock_held()); - bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, selem); raw_spin_lock_irqsave(&local_storage->lock, flags); if (likely(selem_linked_to_storage(selem))) free_local_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, true, &selem_free_list); + local_storage, selem, &selem_free_list); raw_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&selem_free_list, reuse_now); if (free_local_storage) - bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now); + bpf_local_storage_free(local_storage, reuse_now); } void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, @@ -434,7 +372,6 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap, unsigned long flags; raw_spin_lock_irqsave(&b->lock, flags); - RCU_INIT_POINTER(SDATA(selem)->smap, smap); hlist_add_head_rcu(&selem->map_node, &b->list); raw_spin_unlock_irqrestore(&b->lock, flags); } @@ -493,8 +430,9 @@ int bpf_local_storage_alloc(void *owner, if (err) return err; - if 
(smap->bpf_ma) - storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags); + if (smap->use_kmalloc_nolock) + storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage), + __GFP_ZERO, NUMA_NO_NODE); else storage = bpf_map_kzalloc(&smap->map, sizeof(*storage), gfp_flags | __GFP_NOWARN); @@ -507,6 +445,7 @@ int bpf_local_storage_alloc(void *owner, INIT_HLIST_HEAD(&storage->list); raw_spin_lock_init(&storage->lock); storage->owner = owner; + storage->use_kmalloc_nolock = smap->use_kmalloc_nolock; bpf_selem_link_storage_nolock(storage, first_selem); bpf_selem_link_map(smap, first_selem); @@ -528,22 +467,12 @@ int bpf_local_storage_alloc(void *owner, bpf_selem_unlink_map(first_selem); err = -EAGAIN; goto uncharge; - - /* Note that even first_selem was linked to smap's - * bucket->list, first_selem can be freed immediately - * (instead of kfree_rcu) because - * bpf_local_storage_map_free() does a - * synchronize_rcu_mult (waiting for both sleepable and - * normal programs) before walking the bucket->list. - * Hence, no one is accessing selem from the - * bucket->list under rcu_read_lock(). - */ } return 0; uncharge: - bpf_local_storage_free(storage, smap, smap->bpf_ma, true); + bpf_local_storage_free(storage, true); mem_uncharge(smap, owner, sizeof(*storage)); return err; } @@ -582,13 +511,13 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (err) return ERR_PTR(err); - selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags); + selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags); if (!selem) return ERR_PTR(-ENOMEM); err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags); if (err) { - bpf_selem_free(selem, smap, true); + bpf_selem_free(selem, true); mem_uncharge(smap, owner, smap->elem_size); return ERR_PTR(err); } @@ -616,7 +545,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, /* A lookup has just been done before and concluded a new selem is * needed. 
The chance of an unnecessary alloc is unlikely. */ - alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags); + alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags); if (!alloc_selem) return ERR_PTR(-ENOMEM); @@ -656,7 +585,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (old_sdata) { bpf_selem_unlink_map(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), - true, &old_selem_free_list); + &old_selem_free_list); } unlock: @@ -664,7 +593,7 @@ unlock: bpf_selem_free_list(&old_selem_free_list, false); if (alloc_selem) { mem_uncharge(smap, owner, smap->elem_size); - bpf_selem_free(alloc_selem, smap, true); + bpf_selem_free(alloc_selem, true); } return err ? ERR_PTR(err) : SDATA(selem); } @@ -730,16 +659,12 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) { - struct bpf_local_storage_map *storage_smap; struct bpf_local_storage_elem *selem; - bool bpf_ma, free_storage = false; + bool free_storage = false; HLIST_HEAD(free_selem_list); struct hlist_node *n; unsigned long flags; - storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held()); - bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, NULL); - /* Neither the bpf_prog nor the bpf_map's syscall * could be modifying the local_storage->list now. * Thus, no elem can be added to or deleted from the @@ -762,14 +687,14 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) * of the loop will set the free_cgroup_storage to true. 
*/ free_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, true, &free_selem_list); + local_storage, selem, &free_selem_list); } raw_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&free_selem_list, true); if (free_storage) - bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true); + bpf_local_storage_free(local_storage, true); } u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) @@ -782,20 +707,10 @@ u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) return usage; } -/* When bpf_ma == true, the bpf_mem_alloc is used to allocate and free memory. - * A deadlock free allocator is useful for storage that the bpf prog can easily - * get a hold of the owner PTR_TO_BTF_ID in any context. eg. bpf_get_current_task_btf. - * The task and cgroup storage fall into this case. The bpf_mem_alloc reuses - * memory immediately. To be reuse-immediate safe, the owner destruction - * code path needs to go through a rcu grace period before calling - * bpf_local_storage_destroy(). - * - * When bpf_ma == false, the kmalloc and kfree are used. - */ struct bpf_map * bpf_local_storage_map_alloc(union bpf_attr *attr, struct bpf_local_storage_cache *cache, - bool bpf_ma) + bool use_kmalloc_nolock) { struct bpf_local_storage_map *smap; unsigned int i; @@ -829,20 +744,9 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, /* In PREEMPT_RT, kmalloc(GFP_ATOMIC) is still not safe in non * preemptible context. Thus, enforce all storages to use - * bpf_mem_alloc when CONFIG_PREEMPT_RT is enabled. + * kmalloc_nolock() when CONFIG_PREEMPT_RT is enabled. */ - smap->bpf_ma = IS_ENABLED(CONFIG_PREEMPT_RT) ? 
true : bpf_ma; - if (smap->bpf_ma) { - err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false); - if (err) - goto free_smap; - - err = bpf_mem_alloc_init(&smap->storage_ma, sizeof(struct bpf_local_storage), false); - if (err) { - bpf_mem_alloc_destroy(&smap->selem_ma); - goto free_smap; - } - } + smap->use_kmalloc_nolock = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : use_kmalloc_nolock; smap->cache_idx = bpf_local_storage_cache_idx_get(cache); return &smap->map; @@ -912,12 +816,9 @@ void bpf_local_storage_map_free(struct bpf_map *map, */ synchronize_rcu(); - if (smap->bpf_ma) { + if (smap->use_kmalloc_nolock) { rcu_barrier_tasks_trace(); - if (!rcu_trace_implies_rcu_gp()) - rcu_barrier(); - bpf_mem_alloc_destroy(&smap->selem_ma); - bpf_mem_alloc_destroy(&smap->storage_ma); + rcu_barrier(); } kvfree(smap->buckets); bpf_map_area_free(smap); diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 0a59df1c550a..7cb6e8d4282c 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -51,6 +51,7 @@ BTF_ID(func, bpf_lsm_key_getsecurity) BTF_ID(func, bpf_lsm_audit_rule_match) #endif BTF_ID(func, bpf_lsm_ismaclabel) +BTF_ID(func, bpf_lsm_file_alloc_security) BTF_SET_END(bpf_lsm_disabled_hooks) /* List of LSM hooks that should operate on 'current' cgroup regardless diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index a41e6730edcf..278490683d28 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -1162,6 +1162,7 @@ bool bpf_struct_ops_get(const void *kdata) map = __bpf_map_inc_not_zero(&st_map->map, false); return !IS_ERR(map); } +EXPORT_SYMBOL_GPL(bpf_struct_ops_get); void bpf_struct_ops_put(const void *kdata) { @@ -1173,6 +1174,7 @@ void bpf_struct_ops_put(const void *kdata) bpf_map_put(&st_map->map); } +EXPORT_SYMBOL_GPL(bpf_struct_ops_put); u32 bpf_struct_ops_id(const void *kdata) { diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 248f517d66d0..69988af44b37 100644 --- a/kernel/bpf/cgroup.c +++ 
b/kernel/bpf/cgroup.c @@ -1665,7 +1665,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); * returned value != 1 during execution. In all other cases, 0 is returned. */ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, - struct sockaddr *uaddr, + struct sockaddr_unsized *uaddr, int *uaddrlen, enum cgroup_bpf_attach_type atype, void *t_ctx, @@ -1676,7 +1676,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, .uaddr = uaddr, .t_ctx = t_ctx, }; - struct sockaddr_storage unspec; + struct sockaddr_storage storage; struct cgroup *cgrp; int ret; @@ -1688,8 +1688,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, return 0; if (!ctx.uaddr) { - memset(&unspec, 0, sizeof(unspec)); - ctx.uaddr = (struct sockaddr *)&unspec; + memset(&storage, 0, sizeof(storage)); + ctx.uaddr = (struct sockaddr_unsized *)&storage; ctx.uaddrlen = 0; } else { ctx.uaddrlen = *uaddrlen; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d595fe512498..c8ae6ab31651 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1450,6 +1450,23 @@ void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other) bpf_prog_clone_free(fp_other); } +static void adjust_insn_arrays(struct bpf_prog *prog, u32 off, u32 len) +{ +#ifdef CONFIG_BPF_SYSCALL + struct bpf_map *map; + int i; + + if (len <= 1) + return; + + for (i = 0; i < prog->aux->used_map_cnt; i++) { + map = prog->aux->used_maps[i]; + if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) + bpf_insn_array_adjust(map, off, len); + } +#endif +} + struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) { struct bpf_insn insn_buff[16], aux[2]; @@ -1505,6 +1522,9 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) clone = tmp; insn_delta = rewritten - 1; + /* Instructions arrays must be updated using absolute xlated offsets */ + adjust_insn_arrays(clone, prog->aux->subprog_start + i, rewritten); + /* Walk new program and skip insns we just inserted. 
*/ insn = clone->insnsi + i + insn_delta; insn_cnt += insn_delta; @@ -1688,6 +1708,7 @@ bool bpf_opcode_in_insntable(u8 code) [BPF_LD | BPF_IND | BPF_B] = true, [BPF_LD | BPF_IND | BPF_H] = true, [BPF_LD | BPF_IND | BPF_W] = true, + [BPF_JMP | BPF_JA | BPF_X] = true, [BPF_JMP | BPF_JCOND] = true, }; #undef BPF_INSN_3_TBL @@ -3129,8 +3150,9 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, return -EFAULT; } -int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, - void *addr1, void *addr2) +int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t, + enum bpf_text_poke_type new_t, void *old_addr, + void *new_addr) { return -ENOTSUPP; } diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 20883c6b1546..f8a3c7eb451e 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -358,6 +358,9 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, } else if (insn->code == (BPF_JMP | BPF_JA)) { verbose(cbs->private_data, "(%02x) goto pc%+d\n", insn->code, insn->off); + } else if (insn->code == (BPF_JMP | BPF_JA | BPF_X)) { + verbose(cbs->private_data, "(%02x) gotox r%d\n", + insn->code, insn->dst_reg); } else if (insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO) { verbose(cbs->private_data, "(%02x) may_goto pc%+d\n", diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c2fcd0cd51e5..c8a9b27f8663 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -215,19 +215,6 @@ static bool htab_has_extra_elems(struct bpf_htab *htab) return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab); } -static void htab_free_internal_structs(struct bpf_htab *htab, struct htab_elem *elem) -{ - if (btf_record_has_field(htab->map.record, BPF_TIMER)) - bpf_obj_free_timer(htab->map.record, - htab_elem_value(elem, htab->map.key_size)); - if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) - bpf_obj_free_workqueue(htab->map.record, - htab_elem_value(elem, htab->map.key_size)); - if 
(btf_record_has_field(htab->map.record, BPF_TASK_WORK)) - bpf_obj_free_task_work(htab->map.record, - htab_elem_value(elem, htab->map.key_size)); -} - static void htab_free_prealloced_internal_structs(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; @@ -240,7 +227,8 @@ static void htab_free_prealloced_internal_structs(struct bpf_htab *htab) struct htab_elem *elem; elem = get_htab_elem(htab, i); - htab_free_internal_structs(htab, elem); + bpf_map_free_internal_structs(&htab->map, + htab_elem_value(elem, htab->map.key_size)); cond_resched(); } } @@ -669,8 +657,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) struct htab_elem *l; u32 hash, key_size; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -947,15 +934,21 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, void *value, bool onallcpus) { + void *ptr; + if (!onallcpus) { /* copy true value_size bytes */ - copy_map_value(&htab->map, this_cpu_ptr(pptr), value); + ptr = this_cpu_ptr(pptr); + copy_map_value(&htab->map, ptr, value); + bpf_obj_free_fields(htab->map.record, ptr); } else { u32 size = round_up(htab->map.value_size, 8); int off = 0, cpu; for_each_possible_cpu(cpu) { - copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off); + ptr = per_cpu_ptr(pptr, cpu); + copy_map_value_long(&htab->map, ptr, value + off); + bpf_obj_free_fields(htab->map.record, ptr); off += size; } } @@ -1098,8 +1091,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1206,8 +1198,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, 
void *key, void *value /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1275,8 +1266,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1338,8 +1328,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1416,8 +1405,7 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key) u32 hash, key_size; int ret; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1452,8 +1440,7 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key) u32 hash, key_size; int ret; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1509,8 +1496,9 @@ static void htab_free_malloced_internal_structs(struct bpf_htab *htab) struct htab_elem *l; hlist_nulls_for_each_entry(l, n, head, hash_node) { - /* We only free timer on uref dropping to zero */ - htab_free_internal_structs(htab, l); + /* We only free internal structs on uref dropping to zero */ + bpf_map_free_internal_structs(&htab->map, + htab_elem_value(l, htab->map.key_size)); } cond_resched_rcu(); } @@ -1521,13 +1509,14 @@ static void htab_map_free_internal_structs(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - /* We only free timer and 
workqueue on uref dropping to zero */ - if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { - if (!htab_is_prealloc(htab)) - htab_free_malloced_internal_structs(htab); - else - htab_free_prealloced_internal_structs(htab); - } + /* We only free internal structs on uref dropping to zero */ + if (!bpf_map_has_internal_structs(map)) + return; + + if (htab_is_prealloc(htab)) + htab_free_prealloced_internal_structs(htab); + else + htab_free_malloced_internal_structs(htab); } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index e4007fea4909..db72b96f9c8c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -28,6 +28,7 @@ #include <linux/verification.h> #include <linux/task_work.h> #include <linux/irq_work.h> +#include <linux/buildid.h> #include "../../lib/kstrtox.h" @@ -42,8 +43,7 @@ */ BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); return (unsigned long) map->ops->map_lookup_elem(map, key); } @@ -59,8 +59,7 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = { BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, void *, value, u64, flags) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); return map->ops->map_update_elem(map, key, value, flags); } @@ -77,8 +76,7 @@ const struct bpf_func_proto bpf_map_update_elem_proto = { BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); return map->ops->map_delete_elem(map, key); } @@ -134,8 +132,7 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = { 
BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu); } @@ -777,9 +774,11 @@ int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs) { int nest_level; + preempt_disable(); nest_level = this_cpu_inc_return(bpf_bprintf_nest_level); if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) { this_cpu_dec(bpf_bprintf_nest_level); + preempt_enable(); return -EBUSY; } *bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]); @@ -792,6 +791,7 @@ void bpf_put_buffers(void) if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0)) return; this_cpu_dec(bpf_bprintf_nest_level); + preempt_enable(); } void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) @@ -1660,6 +1660,13 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = { .arg2_btf_id = BPF_PTR_POISON, }; +struct bpf_dynptr_file_impl { + struct freader freader; + /* 64 bit offset and size overriding 32 bit ones in bpf_dynptr_kern */ + u64 offset; + u64 size; +}; + /* Since the upper 8 bits of dynptr->size is reserved, the * maximum supported size is 2^24 - 1. 
*/ @@ -1688,23 +1695,65 @@ static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *pt return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT; } -u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr) +u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr) { + if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) { + struct bpf_dynptr_file_impl *df = ptr->data; + + return df->size; + } + return ptr->size & DYNPTR_SIZE_MASK; } -static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u32 new_size) +static void bpf_dynptr_advance_offset(struct bpf_dynptr_kern *ptr, u64 off) +{ + if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) { + struct bpf_dynptr_file_impl *df = ptr->data; + + df->offset += off; + return; + } + ptr->offset += off; +} + +static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u64 new_size) { u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK; - ptr->size = new_size | metadata; + if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) { + struct bpf_dynptr_file_impl *df = ptr->data; + + df->size = new_size; + return; + } + ptr->size = (u32)new_size | metadata; } -int bpf_dynptr_check_size(u32 size) +int bpf_dynptr_check_size(u64 size) { return size > DYNPTR_MAX_SIZE ? 
-E2BIG : 0; } +static int bpf_file_fetch_bytes(struct bpf_dynptr_file_impl *df, u64 offset, void *buf, u64 len) +{ + const void *ptr; + + if (!buf) + return -EINVAL; + + df->freader.buf = buf; + df->freader.buf_sz = len; + ptr = freader_fetch(&df->freader, offset + df->offset, len); + if (!ptr) + return df->freader.err; + + if (ptr != buf) /* Force copying into the buffer */ + memcpy(buf, ptr, len); + + return 0; +} + void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size) { @@ -1719,7 +1768,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) memset(ptr, 0, sizeof(*ptr)); } -BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr) +BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u64, size, u64, flags, struct bpf_dynptr_kern *, ptr) { int err; @@ -1754,8 +1803,8 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE, }; -static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *src, - u32 offset, u64 flags) +static int __bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr_kern *src, + u64 offset, u64 flags) { enum bpf_dynptr_type type; int err; @@ -1785,14 +1834,16 @@ static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *s case BPF_DYNPTR_TYPE_SKB_META: memmove(dst, bpf_skb_meta_pointer(src->data, src->offset + offset), len); return 0; + case BPF_DYNPTR_TYPE_FILE: + return bpf_file_fetch_bytes(src->data, offset, dst, len); default: WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type); return -EFAULT; } } -BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, - u32, offset, u64, flags) +BPF_CALL_5(bpf_dynptr_read, void *, dst, u64, len, const struct bpf_dynptr_kern *, src, + u64, offset, u64, flags) { return __bpf_dynptr_read(dst, len, src, offset, flags); } @@ -1808,8 +1859,8 
@@ static const struct bpf_func_proto bpf_dynptr_read_proto = { .arg5_type = ARG_ANYTHING, }; -int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, - u32 len, u64 flags) +int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, void *src, + u64 len, u64 flags) { enum bpf_dynptr_type type; int err; @@ -1842,18 +1893,16 @@ int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, return -EINVAL; return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len); case BPF_DYNPTR_TYPE_SKB_META: - if (flags) - return -EINVAL; - memmove(bpf_skb_meta_pointer(dst->data, dst->offset + offset), src, len); - return 0; + return __bpf_skb_meta_store_bytes(dst->data, dst->offset + offset, src, + len, flags); default: WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type); return -EFAULT; } } -BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, - u32, len, u64, flags) +BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u64, offset, void *, src, + u64, len, u64, flags) { return __bpf_dynptr_write(dst, offset, src, len, flags); } @@ -1869,7 +1918,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = { .arg5_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) +BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u64, offset, u64, len) { enum bpf_dynptr_type type; int err; @@ -2684,12 +2733,12 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid) * provided buffer, with its contents containing the data, if unable to obtain * direct pointer) */ -__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, - void *buffer__opt, u32 buffer__szk) +__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, + void *buffer__opt, u64 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; enum bpf_dynptr_type 
type; - u32 len = buffer__szk; + u64 len = buffer__szk; int err; if (!ptr->data) @@ -2723,6 +2772,9 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, } case BPF_DYNPTR_TYPE_SKB_META: return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset); + case BPF_DYNPTR_TYPE_FILE: + err = bpf_file_fetch_bytes(ptr->data, offset, buffer__opt, buffer__szk); + return err ? NULL : buffer__opt; default: WARN_ONCE(true, "unknown dynptr type %d\n", type); return NULL; @@ -2771,8 +2823,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, * provided buffer, with its contents containing the data, if unable to obtain * direct pointer) */ -__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset, - void *buffer__opt, u32 buffer__szk) +__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, + void *buffer__opt, u64 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; @@ -2804,10 +2856,10 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset, return bpf_dynptr_slice(p, offset, buffer__opt, buffer__szk); } -__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u32 start, u32 end) +__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; - u32 size; + u64 size; if (!ptr->data || start > end) return -EINVAL; @@ -2817,7 +2869,7 @@ __bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u32 start, u32 end if (start > size || end > size) return -ERANGE; - ptr->offset += start; + bpf_dynptr_advance_offset(ptr, start); bpf_dynptr_set_size(ptr, end - start); return 0; @@ -2840,7 +2892,7 @@ __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p) return __bpf_dynptr_is_rdonly(ptr); } -__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr *p) +__bpf_kfunc u64 bpf_dynptr_size(const struct bpf_dynptr *p) { struct bpf_dynptr_kern *ptr = 
(struct bpf_dynptr_kern *)p; @@ -2877,14 +2929,14 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p, * Copies data from source dynptr to destination dynptr. * Returns 0 on success; negative error, otherwise. */ -__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, - struct bpf_dynptr *src_ptr, u32 src_off, u32 size) +__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off, + struct bpf_dynptr *src_ptr, u64 src_off, u64 size) { struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr; struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr; void *src_slice, *dst_slice; char buf[256]; - u32 off; + u64 off; src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size); dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size); @@ -2906,7 +2958,7 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, off = 0; while (off < size) { - u32 chunk_sz = min_t(u32, sizeof(buf), size - off); + u64 chunk_sz = min_t(u64, sizeof(buf), size - off); int err; err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0); @@ -2932,10 +2984,10 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, * at @offset with the constant byte @val. * Returns 0 on success; negative error, otherwise. 
*/ - __bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u32 offset, u32 size, u8 val) - { +__bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u64 offset, u64 size, u8 val) +{ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; - u32 chunk_sz, write_off; + u64 chunk_sz, write_off; char buf[256]; void* slice; int err; @@ -2954,11 +3006,11 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, return err; /* Non-linear data under the dynptr, write from a local buffer */ - chunk_sz = min_t(u32, sizeof(buf), size); + chunk_sz = min_t(u64, sizeof(buf), size); memset(buf, val, chunk_sz); for (write_off = 0; write_off < size; write_off += chunk_sz) { - chunk_sz = min_t(u32, sizeof(buf), size - write_off); + chunk_sz = min_t(u64, sizeof(buf), size - write_off); err = __bpf_dynptr_write(ptr, offset + write_off, buf, chunk_sz, 0); if (err) return err; @@ -3678,34 +3730,21 @@ err_out: return -EFAULT; } -/** - * bpf_strnstr - Find the first substring in a length-limited string - * @s1__ign: The string to be searched - * @s2__ign: The string to search for - * @len: the maximum number of characters to search - * - * Return: - * * >=0 - Index of the first character of the first occurrence of @s2__ign - * within the first @len characters of @s1__ign - * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign - * * %-EFAULT - Cannot read one of the strings - * * %-E2BIG - One of the strings is too large - * * %-ERANGE - One of the strings is outside of kernel address space - */ -__bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len) +static int __bpf_strnstr(const char *s1, const char *s2, size_t len, + bool ignore_case) { char c1, c2; int i, j; - if (!copy_from_kernel_nofault_allowed(s1__ign, 1) || - !copy_from_kernel_nofault_allowed(s2__ign, 1)) { + if (!copy_from_kernel_nofault_allowed(s1, 1) || + !copy_from_kernel_nofault_allowed(s2, 1)) { return -ERANGE; } guard(pagefault)(); for (i = 0; i < 
XATTR_SIZE_MAX; i++) { for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) { - __get_kernel_nofault(&c2, s2__ign + j, char, err_out); + __get_kernel_nofault(&c2, s2 + j, char, err_out); if (c2 == '\0') return i; /* @@ -3715,7 +3754,13 @@ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len */ if (i + j == len) break; - __get_kernel_nofault(&c1, s1__ign + j, char, err_out); + __get_kernel_nofault(&c1, s1 + j, char, err_out); + + if (ignore_case) { + c1 = tolower(c1); + c2 = tolower(c2); + } + if (c1 == '\0') return -ENOENT; if (c1 != c2) @@ -3725,7 +3770,7 @@ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len return -E2BIG; if (i + j == len) return -ENOENT; - s1__ign++; + s1++; } return -E2BIG; err_out: @@ -3747,8 +3792,69 @@ err_out: */ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign) { - return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX); + return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, false); +} + +/** + * bpf_strcasestr - Find the first substring in a string, ignoring the case of + * the characters + * @s1__ign: The string to be searched + * @s2__ign: The string to search for + * + * Return: + * * >=0 - Index of the first character of the first occurrence of @s2__ign + * within @s1__ign + * * %-ENOENT - @s2__ign is not a substring of @s1__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strcasestr(const char *s1__ign, const char *s2__ign) +{ + return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, true); } + +/** + * bpf_strnstr - Find the first substring in a length-limited string + * @s1__ign: The string to be searched + * @s2__ign: The string to search for + * @len: the maximum number of characters to search + * + * Return: + * * >=0 - Index of the first character of the first occurrence of @s2__ign + * within the first @len 
characters of @s1__ign + * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, + size_t len) +{ + return __bpf_strnstr(s1__ign, s2__ign, len, false); +} + +/** + * bpf_strncasestr - Find the first substring in a length-limited string, + * ignoring the case of the characters + * @s1__ign: The string to be searched + * @s2__ign: The string to search for + * @len: the maximum number of characters to search + * + * Return: + * * >=0 - Index of the first character of the first occurrence of @s2__ign + * within the first @len characters of @s1__ign + * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strncasestr(const char *s1__ign, const char *s2__ign, + size_t len) +{ + return __bpf_strnstr(s1__ign, s2__ign, len, true); +} + #ifdef CONFIG_KEYS /** * bpf_lookup_user_key - lookup a key by its serial @@ -4206,6 +4312,54 @@ __bpf_kfunc int bpf_task_work_schedule_resume_impl(struct task_struct *task, return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME); } +static int make_file_dynptr(struct file *file, u32 flags, bool may_sleep, + struct bpf_dynptr_kern *ptr) +{ + struct bpf_dynptr_file_impl *state; + + /* flags is currently unsupported */ + if (flags) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + state = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_dynptr_file_impl)); + if (!state) { + bpf_dynptr_set_null(ptr); + return -ENOMEM; + } + state->offset = 0; + state->size = U64_MAX; /* Don't restrict size, as file may change anyways */ + 
freader_init_from_file(&state->freader, NULL, 0, file, may_sleep); + bpf_dynptr_init(ptr, state, BPF_DYNPTR_TYPE_FILE, 0, 0); + bpf_dynptr_set_rdonly(ptr); + return 0; +} + +__bpf_kfunc int bpf_dynptr_from_file(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit) +{ + return make_file_dynptr(file, flags, false, (struct bpf_dynptr_kern *)ptr__uninit); +} + +int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit) +{ + return make_file_dynptr(file, flags, true, (struct bpf_dynptr_kern *)ptr__uninit); +} + +__bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr) +{ + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)dynptr; + struct bpf_dynptr_file_impl *df = ptr->data; + + if (!df) + return 0; + + freader_cleanup(&df->freader); + bpf_mem_free(&bpf_global_ma, df); + bpf_dynptr_set_null(ptr); + return 0; +} + __bpf_kfunc_end_defs(); static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work) @@ -4376,13 +4530,17 @@ BTF_ID_FLAGS(func, bpf_strnlen); BTF_ID_FLAGS(func, bpf_strspn); BTF_ID_FLAGS(func, bpf_strcspn); BTF_ID_FLAGS(func, bpf_strstr); +BTF_ID_FLAGS(func, bpf_strcasestr); BTF_ID_FLAGS(func, bpf_strnstr); +BTF_ID_FLAGS(func, bpf_strncasestr); #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif BTF_ID_FLAGS(func, bpf_stream_vprintk_impl, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_from_file, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_file_discard) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { @@ -4423,7 +4581,7 @@ late_initcall(kfunc_init); /* Get a pointer to dynptr data up to len bytes for read only access. If * the dynptr doesn't have continuous data up to len bytes, return NULL. 
*/ -const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len) +const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len) { const struct bpf_dynptr *p = (struct bpf_dynptr *)ptr; @@ -4434,9 +4592,19 @@ const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len) * the dynptr doesn't have continuous data up to len bytes, or the dynptr * is read only, return NULL. */ -void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len) +void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len) { if (__bpf_dynptr_is_rdonly(ptr)) return NULL; return (void *)__bpf_dynptr_data(ptr, len); } + +void bpf_map_free_internal_structs(struct bpf_map *map, void *val) +{ + if (btf_record_has_field(map->record, BPF_TIMER)) + bpf_obj_free_timer(map->record, val); + if (btf_record_has_field(map->record, BPF_WORKQUEUE)) + bpf_obj_free_workqueue(map->record, val); + if (btf_record_has_field(map->record, BPF_TASK_WORK)) + bpf_obj_free_task_work(map->record, val); +} diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 81780bcf8d25..9f866a010dad 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -144,8 +144,7 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, struct inode *dir) { - d_instantiate(dentry, inode); - dget(dentry); + d_make_persistent(dentry, inode); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } @@ -420,16 +419,12 @@ static int bpf_iter_link_pin_kernel(struct dentry *parent, struct dentry *dentry; int ret; - inode_lock(parent->d_inode); - dentry = lookup_noperm(&QSTR(name), parent); - if (IS_ERR(dentry)) { - inode_unlock(parent->d_inode); + dentry = simple_start_creating(parent, name); + if (IS_ERR(dentry)) return PTR_ERR(dentry); - } ret = bpf_mkobj_ops(dentry, mode, link, &bpf_link_iops, &bpf_iter_fops); - dput(dentry); - inode_unlock(parent->d_inode); + simple_done_creating(dentry); 
return ret; } @@ -1080,7 +1075,7 @@ static void bpf_kill_super(struct super_block *sb) { struct bpf_mount_opts *opts = sb->s_fs_info; - kill_litter_super(sb); + kill_anon_super(sb); kfree(opts); } diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c index 1e6538f59a78..60db5d655495 100644 --- a/kernel/bpf/liveness.c +++ b/kernel/bpf/liveness.c @@ -34,7 +34,7 @@ * - read and write marks propagation. * - The propagation phase is a textbook live variable data flow analysis: * - * state[cc, i].live_after = U [state[cc, s].live_before for s in insn_successors(i)] + * state[cc, i].live_after = U [state[cc, s].live_before for s in bpf_insn_successors(i)] * state[cc, i].live_before = * (state[cc, i].live_after / state[cc, i].must_write) U state[i].may_read * @@ -54,7 +54,7 @@ * The equation for "must_write_acc" propagation looks as follows: * * state[cc, i].must_write_acc = - * ∩ [state[cc, s].must_write_acc for s in insn_successors(i)] + * ∩ [state[cc, s].must_write_acc for s in bpf_insn_successors(i)] * U state[cc, i].must_write * * (An intersection of all "must_write_acc" for instruction successors @@ -447,7 +447,12 @@ int bpf_jmp_offset(struct bpf_insn *insn) __diag_push(); __diag_ignore_all("-Woverride-init", "Allow field initialization overrides for opcode_info_tbl"); -inline int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) +/* + * Returns an array of instructions succ, with succ->items[0], ..., + * succ->items[n-1] with successor instructions, where n=succ->cnt + */ +inline struct bpf_iarray * +bpf_insn_successors(struct bpf_verifier_env *env, u32 idx) { static const struct opcode_info { bool can_jump; @@ -474,19 +479,29 @@ inline int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) _J(BPF_JSET, {.can_jump = true, .can_fallthrough = true}), #undef _J }; + struct bpf_prog *prog = env->prog; struct bpf_insn *insn = &prog->insnsi[idx]; const struct opcode_info *opcode_info; - int i = 0, insn_sz; + struct bpf_iarray *succ, *jt; 
+ int insn_sz; + + jt = env->insn_aux_data[idx].jt; + if (unlikely(jt)) + return jt; + + /* pre-allocated array of size up to 2; reset cnt, as it may have been used already */ + succ = env->succ; + succ->cnt = 0; opcode_info = &opcode_info_tbl[BPF_CLASS(insn->code) | BPF_OP(insn->code)]; insn_sz = bpf_is_ldimm64(insn) ? 2 : 1; if (opcode_info->can_fallthrough) - succ[i++] = idx + insn_sz; + succ->items[succ->cnt++] = idx + insn_sz; if (opcode_info->can_jump) - succ[i++] = idx + bpf_jmp_offset(insn) + 1; + succ->items[succ->cnt++] = idx + bpf_jmp_offset(insn) + 1; - return i; + return succ; } __diag_pop(); @@ -524,6 +539,8 @@ static int propagate_to_outer_instance(struct bpf_verifier_env *env, this_subprog_start = callchain_subprog_start(callchain); outer_instance = get_outer_instance(env, instance); + if (IS_ERR(outer_instance)) + return PTR_ERR(outer_instance); callsite = callchain->callsites[callchain->curframe - 1]; reset_stack_write_marks(env, outer_instance, callsite); @@ -546,11 +563,12 @@ static inline bool update_insn(struct bpf_verifier_env *env, struct bpf_insn_aux_data *aux = env->insn_aux_data; u64 new_before, new_after, must_write_acc; struct per_frame_masks *insn, *succ_insn; - u32 succ_num, s, succ[2]; + struct bpf_iarray *succ; + u32 s; bool changed; - succ_num = bpf_insn_successors(env->prog, insn_idx, succ); - if (unlikely(succ_num == 0)) + succ = bpf_insn_successors(env, insn_idx); + if (succ->cnt == 0) return false; changed = false; @@ -562,8 +580,8 @@ static inline bool update_insn(struct bpf_verifier_env *env, * of successors plus all "must_write" slots of instruction itself. 
*/ must_write_acc = U64_MAX; - for (s = 0; s < succ_num; ++s) { - succ_insn = get_frame_masks(instance, frame, succ[s]); + for (s = 0; s < succ->cnt; ++s) { + succ_insn = get_frame_masks(instance, frame, succ->items[s]); new_after |= succ_insn->live_before; must_write_acc &= succ_insn->must_write_acc; } diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index f50533169cc3..a0c3b35de2ce 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -461,6 +461,7 @@ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) [PTR_TO_ARENA] = "arena", [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", + [PTR_TO_INSN] = "insn", [PTR_TO_MAP_KEY] = "map_key", [CONST_PTR_TO_DYNPTR] = "dynptr_ptr", }; @@ -500,6 +501,8 @@ const char *dynptr_type_str(enum bpf_dynptr_type type) return "xdp"; case BPF_DYNPTR_TYPE_SKB_META: return "skb_meta"; + case BPF_DYNPTR_TYPE_FILE: + return "file"; case BPF_DYNPTR_TYPE_INVALID: return "<invalid>"; default: diff --git a/kernel/bpf/range_tree.c b/kernel/bpf/range_tree.c index 37b80a23ae1a..99c63d982c5d 100644 --- a/kernel/bpf/range_tree.c +++ b/kernel/bpf/range_tree.c @@ -2,7 +2,6 @@ /* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ #include <linux/interval_tree_generic.h> #include <linux/slab.h> -#include <linux/bpf_mem_alloc.h> #include <linux/bpf.h> #include "range_tree.h" @@ -21,7 +20,7 @@ * in commit 6772fcc8890a ("xfs: convert xbitmap to interval tree"). * * The implementation relies on external lock to protect rbtree-s. - * The alloc/free of range_node-s is done via bpf_mem_alloc. + * The alloc/free of range_node-s is done via kmalloc_nolock(). * * bpf arena is using range_tree to represent unallocated slots. 
* At init time: @@ -150,9 +149,7 @@ int range_tree_clear(struct range_tree *rt, u32 start, u32 len) range_it_insert(rn, rt); /* Add a range */ - migrate_disable(); - new_rn = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node)); - migrate_enable(); + new_rn = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE); if (!new_rn) return -ENOMEM; new_rn->rn_start = last + 1; @@ -172,9 +169,7 @@ int range_tree_clear(struct range_tree *rt, u32 start, u32 len) } else { /* in the middle of the clearing range */ range_it_remove(rn, rt); - migrate_disable(); - bpf_mem_free(&bpf_global_ma, rn); - migrate_enable(); + kfree_nolock(rn); } } return 0; @@ -227,9 +222,7 @@ int range_tree_set(struct range_tree *rt, u32 start, u32 len) range_it_remove(right, rt); left->rn_last = right->rn_last; range_it_insert(left, rt); - migrate_disable(); - bpf_mem_free(&bpf_global_ma, right); - migrate_enable(); + kfree_nolock(right); } else if (left) { /* Combine with the left range */ range_it_remove(left, rt); @@ -241,9 +234,7 @@ int range_tree_set(struct range_tree *rt, u32 start, u32 len) right->rn_start = start; range_it_insert(right, rt); } else { - migrate_disable(); - left = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node)); - migrate_enable(); + left = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE); if (!left) return -ENOMEM; left->rn_start = start; @@ -259,7 +250,7 @@ void range_tree_destroy(struct range_tree *rt) while ((rn = range_it_iter_first(rt, 0, -1U))) { range_it_remove(rn, rt); - bpf_mem_free(&bpf_global_ma, rn); + kfree_nolock(rn); } } diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index d706c4b7f532..f6a075ffac63 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -13,7 +13,7 @@ #include <linux/btf_ids.h> #include <asm/rqspinlock.h> -#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) +#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE | BPF_F_RB_OVERWRITE) /* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) 
*/ #define RINGBUF_PGOFF \ @@ -30,6 +30,7 @@ struct bpf_ringbuf { u64 mask; struct page **pages; int nr_pages; + bool overwrite_mode; rqspinlock_t spinlock ____cacheline_aligned_in_smp; /* For user-space producer ring buffers, an atomic_t busy bit is used * to synchronize access to the ring buffers in the kernel, rather than @@ -73,6 +74,7 @@ struct bpf_ringbuf { unsigned long consumer_pos __aligned(PAGE_SIZE); unsigned long producer_pos __aligned(PAGE_SIZE); unsigned long pending_pos; + unsigned long overwrite_pos; /* position after the last overwritten record */ char data[] __aligned(PAGE_SIZE); }; @@ -166,7 +168,7 @@ static void bpf_ringbuf_notify(struct irq_work *work) * considering that the maximum value of data_sz is (4GB - 1), there * will be no overflow, so just note the size limit in the comments. */ -static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) +static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node, bool overwrite_mode) { struct bpf_ringbuf *rb; @@ -183,17 +185,25 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) rb->consumer_pos = 0; rb->producer_pos = 0; rb->pending_pos = 0; + rb->overwrite_mode = overwrite_mode; return rb; } static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) { + bool overwrite_mode = false; struct bpf_ringbuf_map *rb_map; if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); + if (attr->map_flags & BPF_F_RB_OVERWRITE) { + if (attr->map_type != BPF_MAP_TYPE_RINGBUF) + return ERR_PTR(-EINVAL); + overwrite_mode = true; + } + if (attr->key_size || attr->value_size || !is_power_of_2(attr->max_entries) || !PAGE_ALIGNED(attr->max_entries)) @@ -205,7 +215,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) bpf_map_init_from_attr(&rb_map->map, attr); - rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); + rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node, overwrite_mode); if 
(!rb_map->rb) { bpf_map_area_free(rb_map); return ERR_PTR(-ENOMEM); @@ -295,13 +305,26 @@ static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); } +/* + * Return an estimate of the available data in the ring buffer. + * Note: the returned value can exceed the actual ring buffer size because the + * function is not synchronized with the producer. The producer acquires the + * ring buffer's spinlock, but this function does not. + */ static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) { - unsigned long cons_pos, prod_pos; + unsigned long cons_pos, prod_pos, over_pos; cons_pos = smp_load_acquire(&rb->consumer_pos); - prod_pos = smp_load_acquire(&rb->producer_pos); - return prod_pos - cons_pos; + + if (unlikely(rb->overwrite_mode)) { + over_pos = smp_load_acquire(&rb->overwrite_pos); + prod_pos = smp_load_acquire(&rb->producer_pos); + return prod_pos - max(cons_pos, over_pos); + } else { + prod_pos = smp_load_acquire(&rb->producer_pos); + return prod_pos - cons_pos; + } } static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb) @@ -404,11 +427,43 @@ bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr) return (void*)((addr & PAGE_MASK) - off); } +static bool bpf_ringbuf_has_space(const struct bpf_ringbuf *rb, + unsigned long new_prod_pos, + unsigned long cons_pos, + unsigned long pend_pos) +{ + /* + * No space if the range from the oldest not-yet-committed record to the + * newest record spans more than (ringbuf_size - 1). + */ + if (new_prod_pos - pend_pos > rb->mask) + return false; + + /* Ok, we have space in overwrite mode */ + if (unlikely(rb->overwrite_mode)) + return true; + + /* + * No space if producer position advances more than (ringbuf_size - 1) + * ahead of consumer position when not in overwrite mode.
+ */ + if (new_prod_pos - cons_pos > rb->mask) + return false; + + return true; +} + +static u32 bpf_ringbuf_round_up_hdr_len(u32 hdr_len) +{ + hdr_len &= ~BPF_RINGBUF_DISCARD_BIT; + return round_up(hdr_len + BPF_RINGBUF_HDR_SZ, 8); +} + static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) { - unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, flags; + unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, over_pos, flags; struct bpf_ringbuf_hdr *hdr; - u32 len, pg_off, tmp_size, hdr_len; + u32 len, pg_off, hdr_len; if (unlikely(size > RINGBUF_MAX_RECORD_SZ)) return NULL; @@ -431,24 +486,43 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) hdr_len = READ_ONCE(hdr->len); if (hdr_len & BPF_RINGBUF_BUSY_BIT) break; - tmp_size = hdr_len & ~BPF_RINGBUF_DISCARD_BIT; - tmp_size = round_up(tmp_size + BPF_RINGBUF_HDR_SZ, 8); - pend_pos += tmp_size; + pend_pos += bpf_ringbuf_round_up_hdr_len(hdr_len); } rb->pending_pos = pend_pos; - /* check for out of ringbuf space: - * - by ensuring producer position doesn't advance more than - * (ringbuf_size - 1) ahead - * - by ensuring oldest not yet committed record until newest - * record does not span more than (ringbuf_size - 1) - */ - if (new_prod_pos - cons_pos > rb->mask || - new_prod_pos - pend_pos > rb->mask) { + if (!bpf_ringbuf_has_space(rb, new_prod_pos, cons_pos, pend_pos)) { raw_res_spin_unlock_irqrestore(&rb->spinlock, flags); return NULL; } + /* + * In overwrite mode, advance overwrite_pos when the ring buffer is full. + * The key points are to stay on record boundaries and consume enough records + * to fit the new one. + */ + if (unlikely(rb->overwrite_mode)) { + over_pos = rb->overwrite_pos; + while (new_prod_pos - over_pos > rb->mask) { + hdr = (void *)rb->data + (over_pos & rb->mask); + hdr_len = READ_ONCE(hdr->len); + /* + * The bpf_ringbuf_has_space() check above ensures we won’t + * step over a record currently being worked on by another + * producer. 
+ */ over_pos += bpf_ringbuf_round_up_hdr_len(hdr_len); + } + /* + * smp_store_release(&rb->producer_pos, new_prod_pos) at + * the end of the function ensures that when consumer sees + * the updated rb->producer_pos, it always sees the updated + * rb->overwrite_pos, so when consumer reads overwrite_pos + * after smp_load_acquire(&rb->producer_pos), the overwrite_pos + * will always be valid. + */ + WRITE_ONCE(rb->overwrite_pos, over_pos); + } + hdr = (void *)rb->data + (prod_pos & rb->mask); pg_off = bpf_ringbuf_rec_pg_off(rb, hdr); hdr->len = size | BPF_RINGBUF_BUSY_BIT; @@ -578,6 +652,8 @@ BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) return smp_load_acquire(&rb->consumer_pos); case BPF_RB_PROD_POS: return smp_load_acquire(&rb->producer_pos); + case BPF_RB_OVERWRITE_POS: + return smp_load_acquire(&rb->overwrite_pos); default: return 0; } diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c index a00561b1d3e5..f7d0c8d4644e 100644 --- a/kernel/bpf/rqspinlock.c +++ b/kernel/bpf/rqspinlock.c @@ -89,15 +89,14 @@ struct rqspinlock_timeout { DEFINE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks); EXPORT_SYMBOL_GPL(rqspinlock_held_locks); -static bool is_lock_released(rqspinlock_t *lock, u32 mask, struct rqspinlock_timeout *ts) +static bool is_lock_released(rqspinlock_t *lock, u32 mask) { if (!(atomic_read_acquire(&lock->val) & (mask))) return true; return false; } -static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask, - struct rqspinlock_timeout *ts) +static noinline int check_deadlock_AA(rqspinlock_t *lock) { struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); int cnt = min(RES_NR_HELD, rqh->cnt); @@ -118,8 +117,7 @@ static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask, * more locks, which reduce to ABBA). This is not exhaustive, and we rely on * timeouts as the final line of defense.
*/ -static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask, - struct rqspinlock_timeout *ts) +static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask) { struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); int rqh_cnt = min(RES_NR_HELD, rqh->cnt); @@ -142,7 +140,7 @@ static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask, * Let's ensure to break out of this loop if the lock is available for * us to potentially acquire. */ - if (is_lock_released(lock, mask, ts)) + if (is_lock_released(lock, mask)) return 0; /* @@ -198,33 +196,21 @@ static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask, return 0; } -static noinline int check_deadlock(rqspinlock_t *lock, u32 mask, - struct rqspinlock_timeout *ts) -{ - int ret; - - ret = check_deadlock_AA(lock, mask, ts); - if (ret) - return ret; - ret = check_deadlock_ABBA(lock, mask, ts); - if (ret) - return ret; - - return 0; -} - static noinline int check_timeout(rqspinlock_t *lock, u32 mask, struct rqspinlock_timeout *ts) { - u64 time = ktime_get_mono_fast_ns(); u64 prev = ts->cur; + u64 time; if (!ts->timeout_end) { - ts->cur = time; - ts->timeout_end = time + ts->duration; + if (check_deadlock_AA(lock)) + return -EDEADLK; + ts->cur = ktime_get_mono_fast_ns(); + ts->timeout_end = ts->cur + ts->duration; return 0; } + time = ktime_get_mono_fast_ns(); if (time > ts->timeout_end) return -ETIMEDOUT; @@ -234,7 +220,7 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask, */ if (prev + NSEC_PER_MSEC < time) { ts->cur = time; - return check_deadlock(lock, mask, ts); + return check_deadlock_ABBA(lock, mask); } return 0; @@ -278,6 +264,10 @@ int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock) int val, ret = 0; RES_INIT_TIMEOUT(ts); + /* + * The fast path is not invoked for the TAS fallback, so we must grab + * the deadlock detection entry here. 
+ */ grab_held_lock_entry(lock); /* @@ -400,10 +390,7 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val) goto queue; } - /* - * Grab an entry in the held locks array, to enable deadlock detection. - */ - grab_held_lock_entry(lock); + /* Deadlock detection entry already held after failing fast path. */ /* * We're pending, wait for the owner to go away. @@ -450,12 +437,21 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val) * queuing. */ queue: - lockevent_inc(lock_slowpath); /* - * Grab deadlock detection entry for the queue path. + * Do not queue if we're a waiter and someone is attempting this lock on + * the same CPU. In case of NMIs, this prevents long timeouts where we + * interrupt the pending waiter, and the owner, that will eventually + * signal the head of our queue, both of which are logically but not + * physically part of the queue, hence outside the scope of the idx > 0 + * check above for the trylock fallback. */ - grab_held_lock_entry(lock); + if (check_deadlock_AA(lock)) { + ret = -EDEADLK; + goto err_release_entry; + } + lockevent_inc(lock_slowpath); + /* Deadlock detection entry already held after failing fast path. */ node = this_cpu_ptr(&rqnodes[0].mcs); idx = node->count++; tail = encode_tail(smp_processor_id(), idx); @@ -467,19 +463,17 @@ queue: * not be nested NMIs taking spinlocks. That may not be true in * some architectures even though the chance of needing more than * 4 nodes will still be extremely unlikely. When that happens, - * we fall back to spinning on the lock directly without using - * any MCS node. This is not the most elegant solution, but is - * simple enough. + * we fall back to attempting a trylock operation without using + * any MCS node. Unlike qspinlock which cannot fail, we have the + * option of failing the slow path, and under contention, such a + * trylock spinning will likely be treated unfairly due to lack of + * queueing, hence do not spin. 
*/ - if (unlikely(idx >= _Q_MAX_NODES || in_nmi())) { + if (unlikely(idx >= _Q_MAX_NODES || (in_nmi() && idx > 0))) { lockevent_inc(lock_no_node); - RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT); - while (!queued_spin_trylock(lock)) { - if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) { - lockevent_inc(rqspinlock_lock_timeout); - goto err_release_node; - } - cpu_relax(); + if (!queued_spin_trylock(lock)) { + ret = -EDEADLK; + goto err_release_node; } goto release; } @@ -540,7 +534,7 @@ queue: val = arch_mcs_spin_lock_contended(&node->locked); if (val == RES_TIMEOUT_VAL) { - ret = -EDEADLK; + ret = -ETIMEDOUT; goto waitq_timeout; } @@ -575,6 +569,14 @@ queue: val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK)); + /* Disable queue destruction when we detect deadlocks. */ + if (ret == -EDEADLK) { + if (!next) + next = smp_cond_load_relaxed(&node->next, (VAL)); + arch_mcs_spin_unlock_contended(&next->locked); + goto err_release_node; + } + waitq_timeout: if (ret) { /* diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 8f1dacaf01fe..da3d328f5c15 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -42,6 +42,28 @@ static inline int stack_map_data_size(struct bpf_map *map) sizeof(struct bpf_stack_build_id) : sizeof(u64); } +/** + * stack_map_calculate_max_depth - Calculate maximum allowed stack trace depth + * @size: Size of the buffer/map value in bytes + * @elem_size: Size of each stack trace element + * @flags: BPF stack trace flags (BPF_F_USER_STACK, BPF_F_USER_BUILD_ID, ...) 
+ * + * Return: Maximum number of stack trace entries that can be safely stored + */ +static u32 stack_map_calculate_max_depth(u32 size, u32 elem_size, u64 flags) +{ + u32 skip = flags & BPF_F_SKIP_FIELD_MASK; + u32 max_depth; + u32 curr_sysctl_max_stack = READ_ONCE(sysctl_perf_event_max_stack); + + max_depth = size / elem_size; + max_depth += skip; + if (max_depth > curr_sysctl_max_stack) + return curr_sysctl_max_stack; + + return max_depth; +} + static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) { u64 elem_size = sizeof(struct stack_map_bucket) + @@ -229,8 +251,8 @@ static long __bpf_get_stackid(struct bpf_map *map, { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *bucket, *new_bucket, *old_bucket; + u32 hash, id, trace_nr, trace_len, i, max_depth; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; - u32 hash, id, trace_nr, trace_len, i; bool user = flags & BPF_F_USER_STACK; u64 *ips; bool hash_matches; @@ -239,7 +261,8 @@ static long __bpf_get_stackid(struct bpf_map *map, /* skipping more than usable stack trace */ return -EFAULT; - trace_nr = trace->nr - skip; + max_depth = stack_map_calculate_max_depth(map->value_size, stack_map_data_size(map), flags); + trace_nr = min_t(u32, trace->nr - skip, max_depth - skip); trace_len = trace_nr * sizeof(u64); ips = trace->ip + skip; hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); @@ -300,20 +323,17 @@ static long __bpf_get_stackid(struct bpf_map *map, BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { - u32 max_depth = map->value_size / stack_map_data_size(map); - u32 skip = flags & BPF_F_SKIP_FIELD_MASK; + u32 elem_size = stack_map_data_size(map); bool user = flags & BPF_F_USER_STACK; struct perf_callchain_entry *trace; bool kernel = !user; + u32 max_depth; if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) return -EINVAL; - max_depth += skip; - if (max_depth 
> sysctl_perf_event_max_stack) - max_depth = sysctl_perf_event_max_stack; - + max_depth = stack_map_calculate_max_depth(map->value_size, elem_size, flags); trace = get_perf_callchain(regs, kernel, user, max_depth, false, false, 0); @@ -371,15 +391,11 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, return -EFAULT; nr_kernel = count_kernel_ip(trace); + __u64 nr = trace->nr; /* save original */ if (kernel) { - __u64 nr = trace->nr; - trace->nr = nr_kernel; ret = __bpf_get_stackid(map, trace, flags); - - /* restore nr */ - trace->nr = nr; } else { /* user */ u64 skip = flags & BPF_F_SKIP_FIELD_MASK; @@ -390,6 +406,10 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; ret = __bpf_get_stackid(map, trace, flags); } + + /* restore nr */ + trace->nr = nr; + return ret; } @@ -406,7 +426,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, struct perf_callchain_entry *trace_in, void *buf, u32 size, u64 flags, bool may_fault) { - u32 trace_nr, copy_len, elem_size, num_elem, max_depth; + u32 trace_nr, copy_len, elem_size, max_depth; bool user_build_id = flags & BPF_F_USER_BUILD_ID; bool crosstask = task && task != current; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; @@ -438,21 +458,20 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, goto clear; } - num_elem = size / elem_size; - max_depth = num_elem + skip; - if (sysctl_perf_event_max_stack < max_depth) - max_depth = sysctl_perf_event_max_stack; + max_depth = stack_map_calculate_max_depth(size, elem_size, flags); if (may_fault) rcu_read_lock(); /* need RCU for perf's callchain below */ - if (trace_in) + if (trace_in) { trace = trace_in; - else if (kernel && task) + trace->nr = min_t(u32, trace->nr, max_depth); + } else if (kernel && task) { trace = get_callchain_entry_for_task(task, max_depth); - else + } else { trace = get_perf_callchain(regs, kernel, user, max_depth, 
crosstask, false, 0); + } if (unlikely(!trace) || trace->nr < skip) { if (may_fault) @@ -461,7 +480,6 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, } trace_nr = trace->nr - skip; - trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem; copy_len = trace_nr * elem_size; ips = trace->ip + skip; diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index ff16c631951b..0b6bc3f30335 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -4,111 +4,10 @@ #include <linux/bpf.h> #include <linux/filter.h> #include <linux/bpf_mem_alloc.h> -#include <linux/percpu.h> -#include <linux/refcount.h> #include <linux/gfp.h> #include <linux/memory.h> -#include <linux/local_lock.h> #include <linux/mutex.h> -/* - * Simple per-CPU NMI-safe bump allocation mechanism, backed by the NMI-safe - * try_alloc_pages()/free_pages_nolock() primitives. We allocate a page and - * stash it in a local per-CPU variable, and bump allocate from the page - * whenever items need to be printed to a stream. Each page holds a global - * atomic refcount in its first 4 bytes, and then records of variable length - * that describe the printed messages. Once the global refcount has dropped to - * zero, it is a signal to free the page back to the kernel's page allocator, - * given all the individual records in it have been consumed. - * - * It is possible the same page is used to serve allocations across different - * programs, which may be consumed at different times individually, hence - * maintaining a reference count per-page is critical for correct lifetime - * tracking. - * - * The bpf_stream_page code will be replaced to use kmalloc_nolock() once it - * lands. - */ -struct bpf_stream_page { - refcount_t ref; - u32 consumed; - char buf[]; -}; - -/* Available room to add data to a refcounted page. 
*/ -#define BPF_STREAM_PAGE_SZ (PAGE_SIZE - offsetofend(struct bpf_stream_page, consumed)) - -static DEFINE_PER_CPU(local_trylock_t, stream_local_lock) = INIT_LOCAL_TRYLOCK(stream_local_lock); -static DEFINE_PER_CPU(struct bpf_stream_page *, stream_pcpu_page); - -static bool bpf_stream_page_local_lock(unsigned long *flags) -{ - return local_trylock_irqsave(&stream_local_lock, *flags); -} - -static void bpf_stream_page_local_unlock(unsigned long *flags) -{ - local_unlock_irqrestore(&stream_local_lock, *flags); -} - -static void bpf_stream_page_free(struct bpf_stream_page *stream_page) -{ - struct page *p; - - if (!stream_page) - return; - p = virt_to_page(stream_page); - free_pages_nolock(p, 0); -} - -static void bpf_stream_page_get(struct bpf_stream_page *stream_page) -{ - refcount_inc(&stream_page->ref); -} - -static void bpf_stream_page_put(struct bpf_stream_page *stream_page) -{ - if (refcount_dec_and_test(&stream_page->ref)) - bpf_stream_page_free(stream_page); -} - -static void bpf_stream_page_init(struct bpf_stream_page *stream_page) -{ - refcount_set(&stream_page->ref, 1); - stream_page->consumed = 0; -} - -static struct bpf_stream_page *bpf_stream_page_replace(void) -{ - struct bpf_stream_page *stream_page, *old_stream_page; - struct page *page; - - page = alloc_pages_nolock(/* Don't account */ 0, NUMA_NO_NODE, 0); - if (!page) - return NULL; - stream_page = page_address(page); - bpf_stream_page_init(stream_page); - - old_stream_page = this_cpu_read(stream_pcpu_page); - if (old_stream_page) - bpf_stream_page_put(old_stream_page); - this_cpu_write(stream_pcpu_page, stream_page); - return stream_page; -} - -static int bpf_stream_page_check_room(struct bpf_stream_page *stream_page, int len) -{ - int min = offsetof(struct bpf_stream_elem, str[0]); - int consumed = stream_page->consumed; - int total = BPF_STREAM_PAGE_SZ; - int rem = max(0, total - consumed - min); - - /* Let's give room of at least 8 bytes. */ - WARN_ON_ONCE(rem % 8 != 0); - rem = rem < 8 ? 
0 : rem; - return min(len, rem); -} - static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len) { init_llist_node(&elem->node); @@ -116,54 +15,12 @@ static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len) elem->consumed_len = 0; } -static struct bpf_stream_page *bpf_stream_page_from_elem(struct bpf_stream_elem *elem) -{ - unsigned long addr = (unsigned long)elem; - - return (struct bpf_stream_page *)PAGE_ALIGN_DOWN(addr); -} - -static struct bpf_stream_elem *bpf_stream_page_push_elem(struct bpf_stream_page *stream_page, int len) -{ - u32 consumed = stream_page->consumed; - - stream_page->consumed += round_up(offsetof(struct bpf_stream_elem, str[len]), 8); - return (struct bpf_stream_elem *)&stream_page->buf[consumed]; -} - -static struct bpf_stream_elem *bpf_stream_page_reserve_elem(int len) -{ - struct bpf_stream_elem *elem = NULL; - struct bpf_stream_page *page; - int room = 0; - - page = this_cpu_read(stream_pcpu_page); - if (!page) - page = bpf_stream_page_replace(); - if (!page) - return NULL; - - room = bpf_stream_page_check_room(page, len); - if (room != len) - page = bpf_stream_page_replace(); - if (!page) - return NULL; - bpf_stream_page_get(page); - room = bpf_stream_page_check_room(page, len); - WARN_ON_ONCE(room != len); - - elem = bpf_stream_page_push_elem(page, room); - bpf_stream_elem_init(elem, room); - return elem; -} - static struct bpf_stream_elem *bpf_stream_elem_alloc(int len) { const int max_len = ARRAY_SIZE((struct bpf_bprintf_buffers){}.buf); struct bpf_stream_elem *elem; - unsigned long flags; + size_t alloc_size; - BUILD_BUG_ON(max_len > BPF_STREAM_PAGE_SZ); /* * Length denotes the amount of data to be written as part of stream element, * thus includes '\0' byte. 
We're capped by how much bpf_bprintf_buffers can @@ -172,10 +29,13 @@ static struct bpf_stream_elem *bpf_stream_elem_alloc(int len) if (len < 0 || len > max_len) return NULL; - if (!bpf_stream_page_local_lock(&flags)) + alloc_size = offsetof(struct bpf_stream_elem, str[len]); + elem = kmalloc_nolock(alloc_size, __GFP_ZERO, -1); + if (!elem) return NULL; - elem = bpf_stream_page_reserve_elem(len); - bpf_stream_page_local_unlock(&flags); + + bpf_stream_elem_init(elem, len); + return elem; } @@ -231,10 +91,7 @@ static struct bpf_stream *bpf_stream_get(enum bpf_stream_id stream_id, struct bp static void bpf_stream_free_elem(struct bpf_stream_elem *elem) { - struct bpf_stream_page *p; - - p = bpf_stream_page_from_elem(elem); - bpf_stream_page_put(p); + kfree_nolock(elem); } static void bpf_stream_free_list(struct llist_node *list) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6cde6a46babf..4ff82144f885 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -158,7 +158,7 @@ static void maybe_wait_bpf_programs(struct bpf_map *map) */ if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) - synchronize_rcu(); + synchronize_rcu_expedited(); } static void unpin_uptr_kaddr(void *kaddr) @@ -1162,7 +1162,7 @@ static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr if (map->ops->map_get_unmapped_area) return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags); #ifdef CONFIG_MMU - return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(filp, addr, len, pgoff, flags); #else return addr; #endif @@ -1234,6 +1234,7 @@ int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size) return src - orig_src; } +EXPORT_SYMBOL_GPL(bpf_obj_name_cpy); int map_check_no_btf(const struct bpf_map *map, const struct btf *btf, @@ -1493,6 +1494,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_MAP_TYPE_STRUCT_OPS: case 
BPF_MAP_TYPE_CPUMAP: case BPF_MAP_TYPE_ARENA: + case BPF_MAP_TYPE_INSN_ARRAY: if (!bpf_token_capable(token, CAP_BPF)) goto put_token; break; @@ -1585,7 +1587,8 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) goto free_map; } } else if (attr->excl_prog_hash_size) { - return -EINVAL; + err = -EINVAL; + goto free_map; } err = security_bpf_map_create(map, attr, token, uattr.is_kernel); @@ -1724,9 +1727,6 @@ static int map_lookup_elem(union bpf_attr *attr) if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) return -EINVAL; - if (attr->flags & ~BPF_F_LOCK) - return -EINVAL; - CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) @@ -1734,9 +1734,9 @@ static int map_lookup_elem(union bpf_attr *attr) if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) return -EPERM; - if ((attr->flags & BPF_F_LOCK) && - !btf_record_has_field(map->record, BPF_SPIN_LOCK)) - return -EINVAL; + err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK); + if (err) + return err; key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) @@ -1799,11 +1799,9 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) goto err_put; } - if ((attr->flags & BPF_F_LOCK) && - !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { - err = -EINVAL; + err = bpf_map_check_op_flags(map, attr->flags, ~0); + if (err) goto err_put; - } key = ___bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { @@ -2007,13 +2005,9 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file, void *key, *value; int err = 0; - if (attr->batch.elem_flags & ~BPF_F_LOCK) - return -EINVAL; - - if ((attr->batch.elem_flags & BPF_F_LOCK) && - !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { - return -EINVAL; - } + err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK); + if (err) + return err; value_size = bpf_map_value_size(map); @@ -2070,12 +2064,9 @@ int generic_map_lookup_batch(struct bpf_map *map, u32 value_size, cp, max_count; int err; - if (attr->batch.elem_flags & 
~BPF_F_LOCK) - return -EINVAL; - - if ((attr->batch.elem_flags & BPF_F_LOCK) && - !btf_record_has_field(map->record, BPF_SPIN_LOCK)) - return -EINVAL; + err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK); + if (err) + return err; value_size = bpf_map_value_size(map); @@ -2462,6 +2453,9 @@ void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog) struct bpf_prog_stats *stats; unsigned int flags; + if (unlikely(!prog->stats)) + return; + stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->misses); @@ -2853,6 +2847,23 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr return err; } +static int bpf_prog_mark_insn_arrays_ready(struct bpf_prog *prog) +{ + int err; + int i; + + for (i = 0; i < prog->aux->used_map_cnt; i++) { + if (prog->aux->used_maps[i]->map_type != BPF_MAP_TYPE_INSN_ARRAY) + continue; + + err = bpf_insn_array_ready(prog->aux->used_maps[i]); + if (err) + return err; + } + + return 0; +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD keyring_id @@ -3082,6 +3093,10 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (err < 0) goto free_used_maps; + err = bpf_prog_mark_insn_arrays_ready(prog); + if (err < 0) + goto free_used_maps; + err = bpf_prog_alloc_id(prog); if (err) goto free_used_maps; @@ -5034,19 +5049,19 @@ static int bpf_prog_get_info_by_fd(struct file *file, struct bpf_insn *insns_sanitized; bool fault; - if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) { + if (!prog->blinded || bpf_dump_raw_ok(file->f_cred)) { + insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred); + if (!insns_sanitized) + return -ENOMEM; + uinsns = u64_to_user_ptr(info.xlated_prog_insns); + ulen = min_t(u32, info.xlated_prog_len, ulen); + fault = copy_to_user(uinsns, insns_sanitized, ulen); + kfree(insns_sanitized); + if (fault) + return -EFAULT; + } else { 
info.xlated_prog_insns = 0; - goto done; } - insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred); - if (!insns_sanitized) - return -ENOMEM; - uinsns = u64_to_user_ptr(info.xlated_prog_insns); - ulen = min_t(u32, info.xlated_prog_len, ulen); - fault = copy_to_user(uinsns, insns_sanitized, ulen); - kfree(insns_sanitized); - if (fault) - return -EFAULT; } if (bpf_prog_is_offloaded(prog->aux)) { diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index f2cb0b097093..976d89011b15 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -175,23 +175,42 @@ out: return tr; } -static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) +static int bpf_trampoline_update_fentry(struct bpf_trampoline *tr, u32 orig_flags, + void *old_addr, void *new_addr) { + enum bpf_text_poke_type new_t = BPF_MOD_CALL, old_t = BPF_MOD_CALL; void *ip = tr->func.addr; + + if (!new_addr) + new_t = BPF_MOD_NOP; + else if (bpf_trampoline_use_jmp(tr->flags)) + new_t = BPF_MOD_JUMP; + + if (!old_addr) + old_t = BPF_MOD_NOP; + else if (bpf_trampoline_use_jmp(orig_flags)) + old_t = BPF_MOD_JUMP; + + return bpf_arch_text_poke(ip, old_t, new_t, old_addr, new_addr); +} + +static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags, + void *old_addr) +{ int ret; if (tr->func.ftrace_managed) ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false); else - ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL); + ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, NULL); return ret; } -static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr, +static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, + void *old_addr, void *new_addr, bool lock_direct_mutex) { - void *ip = tr->func.addr; int ret; if (tr->func.ftrace_managed) { @@ -200,7 +219,8 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad else ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr); } else { 
- ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr); + ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, + new_addr); } return ret; } @@ -220,10 +240,12 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) } if (tr->func.ftrace_managed) { - ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1); + ret = ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1); + if (ret) + return ret; ret = register_ftrace_direct(tr->fops, (long)new_addr); } else { - ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr); + ret = bpf_trampoline_update_fentry(tr, 0, NULL, new_addr); } return ret; @@ -334,8 +356,9 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im) * call_rcu_tasks() is not necessary. */ if (im->ip_after_call) { - int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP, - NULL, im->ip_epilogue); + int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_NOP, + BPF_MOD_JUMP, NULL, + im->ip_epilogue); WARN_ON(err); if (IS_ENABLED(CONFIG_TASKS_RCU)) call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks); @@ -408,7 +431,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut return PTR_ERR(tlinks); if (total == 0) { - err = unregister_fentry(tr, tr->cur_image->image); + err = unregister_fentry(tr, orig_flags, tr->cur_image->image); bpf_tramp_image_put(tr->cur_image); tr->cur_image = NULL; goto out; @@ -432,9 +455,20 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS again: - if ((tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) && - (tr->flags & BPF_TRAMP_F_CALL_ORIG)) - tr->flags |= BPF_TRAMP_F_ORIG_STACK; + if (tr->flags & BPF_TRAMP_F_CALL_ORIG) { + if (tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) { + /* The BPF_TRAMP_F_SKIP_FRAME can be cleared in the + * first try, reset it in the second try. 
+ */ + tr->flags |= BPF_TRAMP_F_ORIG_STACK | BPF_TRAMP_F_SKIP_FRAME; + } else if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_JMP)) { + /* Use "jmp" instead of "call" for the trampoline + * in the origin call case, and we don't need to + * skip the frame. + */ + tr->flags &= ~BPF_TRAMP_F_SKIP_FRAME; + } + } #endif size = arch_bpf_trampoline_size(&tr->func.model, tr->flags, @@ -465,10 +499,18 @@ again: if (err) goto out_free; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP + if (bpf_trampoline_use_jmp(tr->flags)) + tr->fops->flags |= FTRACE_OPS_FL_JMP; + else + tr->fops->flags &= ~FTRACE_OPS_FL_JMP; +#endif + WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) /* progs already running at this address */ - err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex); + err = modify_fentry(tr, orig_flags, tr->cur_image->image, + im->image, lock_direct_mutex); else /* first time registering */ err = register_fentry(tr, im->image); @@ -491,8 +533,15 @@ again: tr->cur_image = im; out: /* If any error happens, restore previous flags */ - if (err) + if (err) { tr->flags = orig_flags; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP + if (bpf_trampoline_use_jmp(tr->flags)) + tr->fops->flags |= FTRACE_OPS_FL_JMP; + else + tr->fops->flags &= ~FTRACE_OPS_FL_JMP; +#endif + } kfree(tlinks); return err; @@ -568,7 +617,8 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, if (err) return err; tr->extension_prog = link->link.prog; - return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, + return bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP, + BPF_MOD_JUMP, NULL, link->link.prog->bpf_func); } if (cnt >= BPF_MAX_TRAMP_LINKS) @@ -616,6 +666,7 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, if (kind == BPF_TRAMP_REPLACE) { WARN_ON_ONCE(!tr->extension_prog); err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, + BPF_MOD_NOP, tr->extension_prog->bpf_func, NULL); tr->extension_prog = NULL; guard(mutex)(&tgt_prog->aux->ext_mutex); diff --git 
a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fbe4bb91c564..f0ca69f888fa 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -209,8 +209,6 @@ static void invalidate_non_owning_refs(struct bpf_verifier_env *env); static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env); static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg); -static void specialize_kfunc(struct bpf_verifier_env *env, - u32 func_id, u16 offset, unsigned long *addr); static bool is_trusted_reg(const struct bpf_reg_state *reg); static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) @@ -515,6 +513,7 @@ static bool is_callback_calling_kfunc(u32 btf_id); static bool is_bpf_throw_kfunc(struct bpf_insn *insn); static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id); +static bool is_task_work_add_kfunc(u32 func_id); static bool is_sync_callback_calling_function(enum bpf_func_id func_id) { @@ -547,6 +546,21 @@ static bool is_async_callback_calling_insn(struct bpf_insn *insn) (bpf_pseudo_kfunc_call(insn) && is_async_callback_calling_kfunc(insn->imm)); } +static bool is_async_cb_sleepable(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + /* bpf_timer callbacks are never sleepable. */ + if (bpf_helper_call(insn) && insn->imm == BPF_FUNC_timer_set_callback) + return false; + + /* bpf_wq and bpf_task_work callbacks are always sleepable. 
*/ + if (bpf_pseudo_kfunc_call(insn) && insn->off == 0 && + (is_bpf_wq_set_callback_impl_kfunc(insn->imm) || is_task_work_add_kfunc(insn->imm))) + return true; + + verifier_bug(env, "unhandled async callback in is_async_cb_sleepable"); + return false; +} + static bool is_may_goto_insn(struct bpf_insn *insn) { return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO; @@ -676,6 +690,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) return BPF_DYNPTR_TYPE_XDP; case DYNPTR_TYPE_SKB_META: return BPF_DYNPTR_TYPE_SKB_META; + case DYNPTR_TYPE_FILE: + return BPF_DYNPTR_TYPE_FILE; default: return BPF_DYNPTR_TYPE_INVALID; } @@ -694,6 +710,8 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type) return DYNPTR_TYPE_XDP; case BPF_DYNPTR_TYPE_SKB_META: return DYNPTR_TYPE_SKB_META; + case BPF_DYNPTR_TYPE_FILE: + return DYNPTR_TYPE_FILE; default: return 0; } @@ -701,7 +719,7 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type) static bool dynptr_type_refcounted(enum bpf_dynptr_type type) { - return type == BPF_DYNPTR_TYPE_RINGBUF; + return type == BPF_DYNPTR_TYPE_RINGBUF || type == BPF_DYNPTR_TYPE_FILE; } static void __mark_dynptr_reg(struct bpf_reg_state *reg, @@ -812,6 +830,15 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re struct bpf_func_state *state = func(env, reg); int spi, ref_obj_id, i; + /* + * This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot + * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr + * is safe to do directly. 
+ */ + if (reg->type == CONST_PTR_TO_DYNPTR) { + verifier_bug(env, "CONST_PTR_TO_DYNPTR cannot be released"); + return -EFAULT; + } spi = dynptr_get_spi(env, reg); if (spi < 0) return spi; @@ -1410,7 +1437,7 @@ static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf dst->acquired_refs = src->acquired_refs; dst->active_locks = src->active_locks; dst->active_preempt_locks = src->active_preempt_locks; - dst->active_rcu_lock = src->active_rcu_lock; + dst->active_rcu_locks = src->active_rcu_locks; dst->active_irq_id = src->active_irq_id; dst->active_lock_id = src->active_lock_id; dst->active_lock_ptr = src->active_lock_ptr; @@ -2093,7 +2120,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT); if (!elem) - return NULL; + return ERR_PTR(-ENOMEM); elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; @@ -2103,12 +2130,12 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, env->stack_size++; err = copy_verifier_state(&elem->st, cur); if (err) - return NULL; + return ERR_PTR(-ENOMEM); elem->st.speculative |= speculative; if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) { verbose(env, "The sequence of %d jumps is too complex.\n", env->stack_size); - return NULL; + return ERR_PTR(-E2BIG); } if (elem->st.parent) { ++elem->st.parent->branches; @@ -2903,7 +2930,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT); if (!elem) - return NULL; + return ERR_PTR(-ENOMEM); elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; @@ -2915,7 +2942,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, verbose(env, "The sequence of %d jumps is too complex for async cb.\n", env->stack_size); - return NULL; + return ERR_PTR(-E2BIG); } /* Unlike push_stack() do not copy_verifier_state(). 
* The caller state doesn't matter. @@ -2926,7 +2953,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, elem->st.in_sleepable = is_sleepable; frame = kzalloc(sizeof(*frame), GFP_KERNEL_ACCOUNT); if (!frame) - return NULL; + return ERR_PTR(-ENOMEM); init_func_state(env, frame, BPF_MAIN_FUNC /* callsite */, 0 /* frameno within this callchain */, @@ -3097,6 +3124,9 @@ struct bpf_kfunc_btf_tab { u32 nr_descs; }; +static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc, + int insn_idx); + static int kfunc_desc_cmp_by_id_off(const void *a, const void *b) { const struct bpf_kfunc_desc *d0 = a; @@ -3114,7 +3144,7 @@ static int kfunc_btf_cmp_by_off(const void *a, const void *b) return d0->offset - d1->offset; } -static const struct bpf_kfunc_desc * +static struct bpf_kfunc_desc * find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset) { struct bpf_kfunc_desc desc = { @@ -3237,12 +3267,12 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) { const struct btf_type *func, *func_proto; struct bpf_kfunc_btf_tab *btf_tab; + struct btf_func_model func_model; struct bpf_kfunc_desc_tab *tab; struct bpf_prog_aux *prog_aux; struct bpf_kfunc_desc *desc; const char *func_name; struct btf *desc_btf; - unsigned long call_imm; unsigned long addr; int err; @@ -3326,19 +3356,6 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) func_name); return -EINVAL; } - specialize_kfunc(env, func_id, offset, &addr); - - if (bpf_jit_supports_far_kfunc_call()) { - call_imm = func_id; - } else { - call_imm = BPF_CALL_IMM(addr); - /* Check whether the relative offset overflows desc->imm */ - if ((unsigned long)(s32)call_imm != call_imm) { - verbose(env, "address of kernel function %s is out of range\n", - func_name); - return -EINVAL; - } - } if (bpf_dev_bound_kfunc_id(func_id)) { err = bpf_dev_bound_kfunc_check(&env->log, prog_aux); @@ -3346,18 +3363,20 @@ static int 
add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return err; } + err = btf_distill_func_proto(&env->log, desc_btf, + func_proto, func_name, + &func_model); + if (err) + return err; + desc = &tab->descs[tab->nr_descs++]; desc->func_id = func_id; - desc->imm = call_imm; desc->offset = offset; desc->addr = addr; - err = btf_distill_func_proto(&env->log, desc_btf, - func_proto, func_name, - &desc->func_model); - if (!err) - sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), - kfunc_desc_cmp_by_id_off, NULL); - return err; + desc->func_model = func_model; + sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), + kfunc_desc_cmp_by_id_off, NULL); + return 0; } static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b) @@ -3372,16 +3391,43 @@ static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b) return 0; } -static void sort_kfunc_descs_by_imm_off(struct bpf_prog *prog) +static int set_kfunc_desc_imm(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc) +{ + unsigned long call_imm; + + if (bpf_jit_supports_far_kfunc_call()) { + call_imm = desc->func_id; + } else { + call_imm = BPF_CALL_IMM(desc->addr); + /* Check whether the relative offset overflows desc->imm */ + if ((unsigned long)(s32)call_imm != call_imm) { + verbose(env, "address of kernel func_id %u is out of range\n", + desc->func_id); + return -EINVAL; + } + } + desc->imm = call_imm; + return 0; +} + +static int sort_kfunc_descs_by_imm_off(struct bpf_verifier_env *env) { struct bpf_kfunc_desc_tab *tab; + int i, err; - tab = prog->aux->kfunc_tab; + tab = env->prog->aux->kfunc_tab; if (!tab) - return; + return 0; + + for (i = 0; i < tab->nr_descs; i++) { + err = set_kfunc_desc_imm(env, &tab->descs[i]); + if (err) + return err; + } sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm_off, NULL); + return 0; } bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog) @@ -3509,8 +3555,12 @@ static int check_subprogs(struct bpf_verifier_env 
*env) subprog[cur_subprog].has_ld_abs = true; if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) goto next; - if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) + if (BPF_OP(code) == BPF_CALL) goto next; + if (BPF_OP(code) == BPF_EXIT) { + subprog[cur_subprog].exit_idx = i; + goto next; + } off = i + bpf_jmp_offset(&insn[i]) + 1; if (off < subprog_start || off >= subprog_end) { verbose(env, "jump out of range from insn %d to %d\n", i, off); @@ -4392,6 +4442,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, bt_reg_mask(bt)); return -EFAULT; } + if (insn->src_reg == BPF_REG_0 && insn->imm == BPF_FUNC_tail_call + && subseq_idx - idx != 1) { + if (bt_subprog_enter(bt)) + return -EFAULT; + } } else if (opcode == BPF_EXIT) { bool r0_precise; @@ -5826,8 +5881,7 @@ bad_type: static bool in_sleepable(struct bpf_verifier_env *env) { - return env->prog->sleepable || - (env->cur_state && env->cur_state->in_sleepable); + return env->cur_state->in_sleepable; } /* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock() @@ -5835,7 +5889,7 @@ static bool in_sleepable(struct bpf_verifier_env *env) */ static bool in_rcu_cs(struct bpf_verifier_env *env) { - return env->cur_state->active_rcu_lock || + return env->cur_state->active_rcu_locks || env->cur_state->active_locks || !in_sleepable(env); } @@ -5988,6 +6042,18 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, return 0; } +/* + * Return the size of the memory region accessible from a pointer to map value. + * For INSN_ARRAY maps whole bpf_insn_array->ips array is accessible. 
+ */ +static u32 map_mem_size(const struct bpf_map *map) +{ + if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) + return map->max_entries * sizeof(long); + + return map->value_size; +} + /* check read/write into a map element with possible variable offset */ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed, @@ -5997,11 +6063,11 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; struct bpf_map *map = reg->map_ptr; + u32 mem_size = map_mem_size(map); struct btf_record *rec; int err, i; - err = check_mem_region_access(env, regno, off, size, map->value_size, - zero_size_allowed); + err = check_mem_region_access(env, regno, off, size, mem_size, zero_size_allowed); if (err) return err; @@ -6416,6 +6482,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, break; case PTR_TO_MAP_VALUE: pointer_desc = "value "; + if (reg->map_ptr->map_type == BPF_MAP_TYPE_INSN_ARRAY) + strict = true; break; case PTR_TO_CTX: pointer_desc = "context "; @@ -7039,6 +7107,9 @@ BTF_TYPE_SAFE_RCU(struct cgroup_subsys_state) { /* RCU trusted: these fields are trusted in RCU CS and can be NULL */ BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) { struct file __rcu *exe_file; +#ifdef CONFIG_MEMCG + struct task_struct __rcu *owner; +#endif }; /* skb->sk, req->sk are not RCU protected, but we mark them as such @@ -7078,6 +7149,11 @@ BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket) { struct sock *sk; }; +BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct) { + struct mm_struct *vm_mm; + struct file *vm_file; +}; + static bool type_is_rcu(struct bpf_verifier_env *env, struct bpf_reg_state *reg, const char *field_name, u32 btf_id) @@ -7119,6 +7195,7 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env, { BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket)); 
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted_or_null"); @@ -7502,10 +7579,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (t == BPF_READ && value_regno >= 0) { struct bpf_map *map = reg->map_ptr; - /* if map is read-only, track its contents as scalars */ + /* + * If map is read-only, track its contents as scalars, + * unless it is an insn array (see the special case below) + */ if (tnum_is_const(reg->var_off) && bpf_map_is_rdonly(map) && - map->ops->map_direct_value_addr) { + map->ops->map_direct_value_addr && + map->map_type != BPF_MAP_TYPE_INSN_ARRAY) { int map_off = off + reg->var_off.value; u64 val = 0; @@ -7516,6 +7597,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn regs[value_regno].type = SCALAR_VALUE; __mark_reg_known(&regs[value_regno], val); + } else if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) { + if (bpf_size != BPF_DW) { + verbose(env, "Invalid read of %d bytes from insn_array\n", + size); + return -EACCES; + } + copy_register_state(&regs[value_regno], reg); + regs[value_regno].type = PTR_TO_INSN; } else { mark_reg_unknown(env, regs, value_regno); } @@ -8464,6 +8553,9 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, case BPF_TASK_WORK: field_off = map->record->task_work_off; break; + case BPF_WORKQUEUE: + field_off = map->record->wq_off; + break; default: verifier_bug(env, "unsupported BTF field type: %s\n", struct_name); return -EINVAL; @@ -8505,13 +8597,17 @@ static int process_wq_func(struct bpf_verifier_env *env, int regno, { struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; struct bpf_map *map = reg->map_ptr; - u64 val = reg->var_off.value; + int err; - if (map->record->wq_off != val + reg->off) { - verbose(env, "off %lld doesn't point to 'struct bpf_wq'
that is at %d\n", - val + reg->off, map->record->wq_off); - return -EINVAL; + err = check_map_field_pointer(env, regno, BPF_WORKQUEUE); + if (err) + return err; + + if (meta->map.ptr) { + verifier_bug(env, "Two map pointers in a bpf_wq helper"); + return -EFAULT; } + meta->map.uid = reg->map_uid; meta->map.ptr = map; return 0; @@ -9016,8 +9112,8 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, prev_st = find_prev_entry(env, cur_st->parent, insn_idx); /* branch out active iter state */ queued_st = push_stack(env, insn_idx + 1, insn_idx, false); - if (!queued_st) - return -ENOMEM; + if (IS_ERR(queued_st)) + return PTR_ERR(queued_st); queued_iter = get_iter_from_state(queued_st, meta); queued_iter->iter.state = BPF_ITER_STATE_ACTIVE; @@ -10054,6 +10150,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_map_push_elem) goto error; break; + case BPF_MAP_TYPE_INSN_ARRAY: + goto error; default: break; } @@ -10368,8 +10466,6 @@ typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env, struct bpf_func_state *callee, int insn_idx); -static bool is_task_work_add_kfunc(u32 func_id); - static int set_callee_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx); @@ -10588,10 +10684,9 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins env->subprog_info[subprog].is_async_cb = true; async_cb = push_async_cb(env, env->subprog_info[subprog].start, insn_idx, subprog, - is_bpf_wq_set_callback_impl_kfunc(insn->imm) || - is_task_work_add_kfunc(insn->imm)); - if (!async_cb) - return -EFAULT; + is_async_cb_sleepable(env, insn)); + if (IS_ERR(async_cb)) + return PTR_ERR(async_cb); callee = async_cb->frame[0]; callee->async_entry_cnt = caller->async_entry_cnt + 1; @@ -10607,8 +10702,8 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins * proceed with next instruction within current frame. 
*/ callback_state = push_stack(env, env->subprog_info[subprog].start, insn_idx, false); - if (!callback_state) - return -ENOMEM; + if (IS_ERR(callback_state)) + return PTR_ERR(callback_state); err = setup_func_entry(env, subprog, insn_idx, set_callee_state_cb, callback_state); @@ -10648,7 +10743,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } if (env->subprog_info[subprog].might_sleep && - (env->cur_state->active_rcu_lock || env->cur_state->active_preempt_locks || + (env->cur_state->active_rcu_locks || env->cur_state->active_preempt_locks || env->cur_state->active_irq_id || !in_sleepable(env))) { verbose(env, "global functions that may sleep are not allowed in non-sleepable context,\n" "i.e., in a RCU/IRQ/preempt-disabled section, or in\n" @@ -10662,8 +10757,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return err; } - verbose(env, "Func#%d ('%s') is global and assumed valid.\n", - subprog, sub_name); + if (env->log.level & BPF_LOG_LEVEL) + verbose(env, "Func#%d ('%s') is global and assumed valid.\n", + subprog, sub_name); if (env->subprog_info[subprog].changes_pkt_data) clear_all_pkt_pointers(env); /* mark global subprog for verifying after main prog */ @@ -10976,6 +11072,10 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) bool in_callback_fn; int err; + err = bpf_update_live_stack(env); + if (err) + return err; + callee = state->frame[state->curframe]; r0 = &callee->regs[BPF_REG_0]; if (r0->type == PTR_TO_STACK) { @@ -11226,7 +11326,7 @@ static int check_resource_leak(struct bpf_verifier_env *env, bool exception_exit return -EINVAL; } - if (check_lock && env->cur_state->active_rcu_lock) { + if (check_lock && env->cur_state->active_rcu_locks) { verbose(env, "%s cannot be used inside bpf_rcu_read_lock-ed region\n", prefix); return -EINVAL; } @@ -11361,6 +11461,15 @@ static int get_helper_proto(struct bpf_verifier_env *env, int func_id, return *ptr && 
(*ptr)->func ? 0 : -EINVAL; } +/* Check if we're in a sleepable context. */ +static inline bool in_sleepable_context(struct bpf_verifier_env *env) +{ + return !env->cur_state->active_rcu_locks && + !env->cur_state->active_preempt_locks && + !env->cur_state->active_irq_id && + in_sleepable(env); +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -11421,15 +11530,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return err; } - if (env->cur_state->active_rcu_lock) { + if (env->cur_state->active_rcu_locks) { if (fn->might_sleep) { verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n", func_id_name(func_id), func_id); return -EINVAL; } - - if (in_sleepable(env) && is_storage_get_function(func_id)) - env->insn_aux_data[insn_idx].storage_get_func_atomic = true; } if (env->cur_state->active_preempt_locks) { @@ -11438,9 +11544,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn func_id_name(func_id), func_id); return -EINVAL; } - - if (in_sleepable(env) && is_storage_get_function(func_id)) - env->insn_aux_data[insn_idx].storage_get_func_atomic = true; } if (env->cur_state->active_irq_id) { @@ -11449,11 +11552,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn func_id_name(func_id), func_id); return -EINVAL; } - - if (in_sleepable(env) && is_storage_get_function(func_id)) - env->insn_aux_data[insn_idx].storage_get_func_atomic = true; } + /* Track non-sleepable context for helpers. 
*/ + if (!in_sleepable_context(env)) + env->insn_aux_data[insn_idx].non_sleepable = true; + meta.func_id = func_id; /* check args */ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { @@ -11484,15 +11588,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (meta.release_regno) { err = -EINVAL; - /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot - * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr - * is safe to do directly. - */ if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) { - if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) { - verifier_bug(env, "CONST_PTR_TO_DYNPTR cannot be released"); - return -EFAULT; - } err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]); } else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj_id) { u32 ref_obj_id = meta.ref_obj_id; @@ -11886,6 +11982,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn env->prog->call_get_func_ip = true; } + if (func_id == BPF_FUNC_tail_call) { + if (env->cur_state->curframe) { + struct bpf_verifier_state *branch; + + mark_reg_scratched(env, BPF_REG_0); + branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); + if (IS_ERR(branch)) + return PTR_ERR(branch); + clear_all_pkt_pointers(env); + mark_reg_unknown(env, regs, BPF_REG_0); + err = prepare_func_exit(env, &env->insn_idx); + if (err) + return err; + env->insn_idx--; + } else { + changes_data = false; + } + } + if (changes_data) clear_all_pkt_pointers(env); return 0; @@ -12260,6 +12375,8 @@ enum special_kfunc_type { KF_bpf_res_spin_unlock, KF_bpf_res_spin_lock_irqsave, KF_bpf_res_spin_unlock_irqrestore, + KF_bpf_dynptr_from_file, + KF_bpf_dynptr_file_discard, KF___bpf_trap, KF_bpf_task_work_schedule_signal_impl, KF_bpf_task_work_schedule_resume_impl, @@ -12332,6 +12449,8 @@ BTF_ID(func, bpf_res_spin_lock) BTF_ID(func, bpf_res_spin_unlock) BTF_ID(func, bpf_res_spin_lock_irqsave) BTF_ID(func,
bpf_res_spin_unlock_irqrestore) +BTF_ID(func, bpf_dynptr_from_file) +BTF_ID(func, bpf_dynptr_file_discard) BTF_ID(func, __bpf_trap) BTF_ID(func, bpf_task_work_schedule_signal_impl) BTF_ID(func, bpf_task_work_schedule_resume_impl) @@ -13295,6 +13414,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ dynptr_arg_type |= DYNPTR_TYPE_XDP; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb_meta]) { dynptr_arg_type |= DYNPTR_TYPE_SKB_META; + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) { + dynptr_arg_type |= DYNPTR_TYPE_FILE; + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_file_discard]) { + dynptr_arg_type |= DYNPTR_TYPE_FILE; + meta->release_regno = regno; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] && (dynptr_arg_type & MEM_UNINIT)) { enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type; @@ -13829,9 +13953,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_reg_state *regs; branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); - if (!branch) { + if (IS_ERR(branch)) { verbose(env, "failed to push state for failed lock acquisition\n"); - return -ENOMEM; + return PTR_ERR(branch); } regs = branch->frame[branch->curframe]->regs; @@ -13863,6 +13987,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EACCES; } + /* Track non-sleepable context for kfuncs, same as for helpers. 
*/ + if (!in_sleepable_context(env)) + insn_aux->non_sleepable = true; + /* Check the arguments */ err = check_kfunc_args(env, &meta, insn_idx); if (err < 0) @@ -13909,36 +14037,33 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, preempt_disable = is_kfunc_bpf_preempt_disable(&meta); preempt_enable = is_kfunc_bpf_preempt_enable(&meta); - if (env->cur_state->active_rcu_lock) { + if (rcu_lock) { + env->cur_state->active_rcu_locks++; + } else if (rcu_unlock) { struct bpf_func_state *state; struct bpf_reg_state *reg; u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER); - if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) { - verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n"); - return -EACCES; - } - - if (rcu_lock) { - verbose(env, "nested rcu read lock (kernel function %s)\n", func_name); + if (env->cur_state->active_rcu_locks == 0) { + verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name); return -EINVAL; - } else if (rcu_unlock) { + } + if (--env->cur_state->active_rcu_locks == 0) { bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({ if (reg->type & MEM_RCU) { reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL); reg->type |= PTR_UNTRUSTED; } })); - env->cur_state->active_rcu_lock = false; - } else if (sleepable) { - verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name); - return -EACCES; } - } else if (rcu_lock) { - env->cur_state->active_rcu_lock = true; - } else if (rcu_unlock) { - verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name); - return -EINVAL; + } else if (sleepable && env->cur_state->active_rcu_locks) { + verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name); + return -EACCES; + } + + if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) { + verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n"); + return -EACCES; 
} if (env->cur_state->active_preempt_locks) { @@ -13971,12 +14096,18 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. */ if (meta.release_regno) { - err = release_reference(env, regs[meta.release_regno].ref_obj_id); - if (err) { - verbose(env, "kfunc %s#%d reference has not been acquired before\n", - func_name, meta.func_id); - return err; + struct bpf_reg_state *reg = &regs[meta.release_regno]; + + if (meta.initialized_dynptr.ref_obj_id) { + err = unmark_stack_slots_dynptr(env, reg); + } else { + err = release_reference(env, reg->ref_obj_id); + if (err) + verbose(env, "kfunc %s#%d reference has not been acquired before\n", + func_name, meta.func_id); } + if (err) + return err; } if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front_impl] || @@ -14282,16 +14413,15 @@ struct bpf_sanitize_info { bool mask_to_left; }; -static struct bpf_verifier_state * -sanitize_speculative_path(struct bpf_verifier_env *env, - const struct bpf_insn *insn, - u32 next_idx, u32 curr_idx) +static int sanitize_speculative_path(struct bpf_verifier_env *env, + const struct bpf_insn *insn, + u32 next_idx, u32 curr_idx) { struct bpf_verifier_state *branch; struct bpf_reg_state *regs; branch = push_stack(env, next_idx, curr_idx, true); - if (branch && insn) { + if (!IS_ERR(branch) && insn) { regs = branch->frame[branch->curframe]->regs; if (BPF_SRC(insn->code) == BPF_K) { mark_reg_unknown(env, regs, insn->dst_reg); @@ -14300,7 +14430,7 @@ sanitize_speculative_path(struct bpf_verifier_env *env, mark_reg_unknown(env, regs, insn->src_reg); } } - return branch; + return PTR_ERR_OR_ZERO(branch); } static int sanitize_ptr_alu(struct bpf_verifier_env *env, @@ -14319,7 +14449,6 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, u8 opcode = BPF_OP(insn->code); u32 alu_state, alu_limit; struct bpf_reg_state tmp; - bool ret; int err; if (can_skip_alu_sanitation(env, insn)) @@ -14392,11 +14521,12
@@ do_sim: tmp = *dst_reg; copy_register_state(dst_reg, ptr_reg); } - ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1, - env->insn_idx); - if (!ptr_is_dst_reg && ret) + err = sanitize_speculative_path(env, NULL, env->insn_idx + 1, env->insn_idx); + if (err < 0) + return REASON_STACK; + if (!ptr_is_dst_reg) *dst_reg = tmp; - return !ret ? REASON_STACK : 0; + return 0; } static void sanitize_mark_insn_seen(struct bpf_verifier_env *env) @@ -15950,6 +16080,30 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta s64 smin2 = is_jmp32 ? (s64)reg2->s32_min_value : reg2->smin_value; s64 smax2 = is_jmp32 ? (s64)reg2->s32_max_value : reg2->smax_value; + if (reg1 == reg2) { + switch (opcode) { + case BPF_JGE: + case BPF_JLE: + case BPF_JSGE: + case BPF_JSLE: + case BPF_JEQ: + return 1; + case BPF_JGT: + case BPF_JLT: + case BPF_JSGT: + case BPF_JSLT: + case BPF_JNE: + return 0; + case BPF_JSET: + if (tnum_is_const(t1)) + return t1.value != 0; + else + return (smin1 <= 0 && smax1 >= 0) ? -1 : 1; + default: + return -1; + } + } + switch (opcode) { case BPF_JEQ: /* constants, umin/umax and smin/smax checks would be @@ -16396,6 +16550,13 @@ static int reg_set_min_max(struct bpf_verifier_env *env, if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE) return 0; + /* We compute branch direction for same SCALAR_VALUE registers in + * is_scalar_branch_taken(). For unknown branch directions (e.g., BPF_JSET) + * on the same registers, we don't need to adjust the min/max values. 
+ */ + if (false_reg1 == false_reg2) + return 0; + /* fallthrough (FALSE) branch */ regs_refine_cond_op(false_reg1, false_reg2, rev_opcode(opcode), is_jmp32); reg_bounds_sync(false_reg1); @@ -16716,8 +16877,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, /* branch out 'fallthrough' insn as a new state to explore */ queued_st = push_stack(env, idx + 1, idx, false); - if (!queued_st) - return -ENOMEM; + if (IS_ERR(queued_st)) + return PTR_ERR(queued_st); queued_st->may_goto_depth++; if (prev_st) @@ -16795,10 +16956,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, * the fall-through branch for simulation under speculative * execution. */ - if (!env->bypass_spec_v1 && - !sanitize_speculative_path(env, insn, *insn_idx + 1, - *insn_idx)) - return -EFAULT; + if (!env->bypass_spec_v1) { + err = sanitize_speculative_path(env, insn, *insn_idx + 1, *insn_idx); + if (err < 0) + return err; + } if (env->log.level & BPF_LOG_LEVEL) print_insn_state(env, this_branch, this_branch->curframe); *insn_idx += insn->off; @@ -16808,11 +16970,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, * program will go. If needed, push the goto branch for * simulation under speculative execution. 
*/ - if (!env->bypass_spec_v1 && - !sanitize_speculative_path(env, insn, - *insn_idx + insn->off + 1, - *insn_idx)) - return -EFAULT; + if (!env->bypass_spec_v1) { + err = sanitize_speculative_path(env, insn, *insn_idx + insn->off + 1, + *insn_idx); + if (err < 0) + return err; + } if (env->log.level & BPF_LOG_LEVEL) print_insn_state(env, this_branch, this_branch->curframe); return 0; @@ -16833,10 +16996,9 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return err; } - other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, - false); - if (!other_branch) - return -EFAULT; + other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, false); + if (IS_ERR(other_branch)) + return PTR_ERR(other_branch); other_branch_regs = other_branch->frame[other_branch->curframe]->regs; if (BPF_SRC(insn->code) == BPF_X) { @@ -17019,7 +17181,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) } dst_reg->type = PTR_TO_MAP_VALUE; dst_reg->off = aux->map_off; - WARN_ON_ONCE(map->max_entries != 1); + WARN_ON_ONCE(map->map_type != BPF_MAP_TYPE_INSN_ARRAY && + map->max_entries != 1); /* We want reg->id to be same (0) as map_value is not distinct */ } else if (insn->src_reg == BPF_PSEUDO_MAP_FD || insn->src_reg == BPF_PSEUDO_MAP_IDX) { @@ -17771,6 +17934,247 @@ static int mark_fastcall_patterns(struct bpf_verifier_env *env) return 0; } +static struct bpf_iarray *iarray_realloc(struct bpf_iarray *old, size_t n_elem) +{ + size_t new_size = sizeof(struct bpf_iarray) + n_elem * sizeof(old->items[0]); + struct bpf_iarray *new; + + new = kvrealloc(old, new_size, GFP_KERNEL_ACCOUNT); + if (!new) { + /* this is what callers always want, so simplify the call site */ + kvfree(old); + return NULL; + } + + new->cnt = n_elem; + return new; +} + +static int copy_insn_array(struct bpf_map *map, u32 start, u32 end, u32 *items) +{ + struct bpf_insn_array_value *value; + u32 i; + + for (i = start; i <= end; i++) { + value = 
map->ops->map_lookup_elem(map, &i); + /* + * map_lookup_elem of an array map will never return an error, + * but not checking it makes some static analysers to worry + */ + if (IS_ERR(value)) + return PTR_ERR(value); + else if (!value) + return -EINVAL; + items[i - start] = value->xlated_off; + } + return 0; +} + +static int cmp_ptr_to_u32(const void *a, const void *b) +{ + return *(u32 *)a - *(u32 *)b; +} + +static int sort_insn_array_uniq(u32 *items, int cnt) +{ + int unique = 1; + int i; + + sort(items, cnt, sizeof(items[0]), cmp_ptr_to_u32, NULL); + + for (i = 1; i < cnt; i++) + if (items[i] != items[unique - 1]) + items[unique++] = items[i]; + + return unique; +} + +/* + * sort_unique({map[start], ..., map[end]}) into off + */ +static int copy_insn_array_uniq(struct bpf_map *map, u32 start, u32 end, u32 *off) +{ + u32 n = end - start + 1; + int err; + + err = copy_insn_array(map, start, end, off); + if (err) + return err; + + return sort_insn_array_uniq(off, n); +} + +/* + * Copy all unique offsets from the map + */ +static struct bpf_iarray *jt_from_map(struct bpf_map *map) +{ + struct bpf_iarray *jt; + int err; + int n; + + jt = iarray_realloc(NULL, map->max_entries); + if (!jt) + return ERR_PTR(-ENOMEM); + + n = copy_insn_array_uniq(map, 0, map->max_entries - 1, jt->items); + if (n < 0) { + err = n; + goto err_free; + } + if (n == 0) { + err = -EINVAL; + goto err_free; + } + jt->cnt = n; + return jt; + +err_free: + kvfree(jt); + return ERR_PTR(err); +} + +/* + * Find and collect all maps which fit in the subprog. 
Return the result as one + * combined jump table in jt->items (allocated with kvcalloc) + */ +static struct bpf_iarray *jt_from_subprog(struct bpf_verifier_env *env, + int subprog_start, int subprog_end) +{ + struct bpf_iarray *jt = NULL; + struct bpf_map *map; + struct bpf_iarray *jt_cur; + int i; + + for (i = 0; i < env->insn_array_map_cnt; i++) { + /* + * TODO (when needed): collect only jump tables, not static keys + * or maps for indirect calls + */ + map = env->insn_array_maps[i]; + + jt_cur = jt_from_map(map); + if (IS_ERR(jt_cur)) { + kvfree(jt); + return jt_cur; + } + + /* + * This is enough to check one element. The full table is + * checked to fit inside the subprog later in create_jt() + */ + if (jt_cur->items[0] >= subprog_start && jt_cur->items[0] < subprog_end) { + u32 old_cnt = jt ? jt->cnt : 0; + jt = iarray_realloc(jt, old_cnt + jt_cur->cnt); + if (!jt) { + kvfree(jt_cur); + return ERR_PTR(-ENOMEM); + } + memcpy(jt->items + old_cnt, jt_cur->items, jt_cur->cnt << 2); + } + + kvfree(jt_cur); + } + + if (!jt) { + verbose(env, "no jump tables found for subprog starting at %u\n", subprog_start); + return ERR_PTR(-EINVAL); + } + + jt->cnt = sort_insn_array_uniq(jt->items, jt->cnt); + return jt; +} + +static struct bpf_iarray * +create_jt(int t, struct bpf_verifier_env *env) +{ + static struct bpf_subprog_info *subprog; + int subprog_start, subprog_end; + struct bpf_iarray *jt; + int i; + + subprog = bpf_find_containing_subprog(env, t); + subprog_start = subprog->start; + subprog_end = (subprog + 1)->start; + jt = jt_from_subprog(env, subprog_start, subprog_end); + if (IS_ERR(jt)) + return jt; + + /* Check that the every element of the jump table fits within the given subprogram */ + for (i = 0; i < jt->cnt; i++) { + if (jt->items[i] < subprog_start || jt->items[i] >= subprog_end) { + verbose(env, "jump table for insn %d points outside of the subprog [%u,%u]\n", + t, subprog_start, subprog_end); + kvfree(jt); + return ERR_PTR(-EINVAL); + } + } + + return 
jt; +} + +/* "conditional jump with N edges" */ +static int visit_gotox_insn(int t, struct bpf_verifier_env *env) +{ + int *insn_stack = env->cfg.insn_stack; + int *insn_state = env->cfg.insn_state; + bool keep_exploring = false; + struct bpf_iarray *jt; + int i, w; + + jt = env->insn_aux_data[t].jt; + if (!jt) { + jt = create_jt(t, env); + if (IS_ERR(jt)) + return PTR_ERR(jt); + + env->insn_aux_data[t].jt = jt; + } + + mark_prune_point(env, t); + for (i = 0; i < jt->cnt; i++) { + w = jt->items[i]; + if (w < 0 || w >= env->prog->len) { + verbose(env, "indirect jump out of range from insn %d to %d\n", t, w); + return -EINVAL; + } + + mark_jmp_point(env, w); + + /* EXPLORED || DISCOVERED */ + if (insn_state[w]) + continue; + + if (env->cfg.cur_stack >= env->prog->len) + return -E2BIG; + + insn_stack[env->cfg.cur_stack++] = w; + insn_state[w] |= DISCOVERED; + keep_exploring = true; + } + + return keep_exploring ? KEEP_EXPLORING : DONE_EXPLORING; +} + +static int visit_tailcall_insn(struct bpf_verifier_env *env, int t) +{ + static struct bpf_subprog_info *subprog; + struct bpf_iarray *jt; + + if (env->insn_aux_data[t].jt) + return 0; + + jt = iarray_realloc(NULL, 2); + if (!jt) + return -ENOMEM; + + subprog = bpf_find_containing_subprog(env, t); + jt->items[0] = t + 1; + jt->items[1] = subprog->exit_idx; + env->insn_aux_data[t].jt = jt; + return 0; +} + /* Visits the instruction at index t and returns one of the following: * < 0 - an error occurred * DONE_EXPLORING - the instruction was fully explored @@ -17831,6 +18235,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env) mark_subprog_might_sleep(env, t); if (bpf_helper_changes_pkt_data(insn->imm)) mark_subprog_changes_pkt_data(env, t); + if (insn->imm == BPF_FUNC_tail_call) + visit_tailcall_insn(env, t); } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { struct bpf_kfunc_call_arg_meta meta; @@ -17863,8 +18269,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env) return visit_func_call_insn(t, 
insns, env, insn->src_reg == BPF_PSEUDO_CALL); case BPF_JA: - if (BPF_SRC(insn->code) != BPF_K) - return -EINVAL; + if (BPF_SRC(insn->code) == BPF_X) + return visit_gotox_insn(t, env); if (BPF_CLASS(insn->code) == BPF_JMP) off = insn->off; @@ -17991,8 +18397,9 @@ err_free: */ static int compute_postorder(struct bpf_verifier_env *env) { - u32 cur_postorder, i, top, stack_sz, s, succ_cnt, succ[2]; + u32 cur_postorder, i, top, stack_sz, s; int *stack = NULL, *postorder = NULL, *state = NULL; + struct bpf_iarray *succ; postorder = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT); state = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT); @@ -18016,11 +18423,11 @@ static int compute_postorder(struct bpf_verifier_env *env) stack_sz--; continue; } - succ_cnt = bpf_insn_successors(env->prog, top, succ); - for (s = 0; s < succ_cnt; ++s) { - if (!state[succ[s]]) { - stack[stack_sz++] = succ[s]; - state[succ[s]] |= DISCOVERED; + succ = bpf_insn_successors(env, top); + for (s = 0; s < succ->cnt; ++s) { + if (!state[succ->items[s]]) { + stack[stack_sz++] = succ->items[s]; + state[succ->items[s]] |= DISCOVERED; } } state[top] |= EXPLORED; @@ -18792,6 +19199,10 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno; case PTR_TO_ARENA: return true; + case PTR_TO_INSN: + return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 && + rold->off == rcur->off && range_within(rold, rcur) && + tnum_in(rold->var_off, rcur->var_off); default: return regs_exact(rold, rcur, idmap); } @@ -18972,7 +19383,7 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c if (old->active_preempt_locks != cur->active_preempt_locks) return false; - if (old->active_rcu_lock != cur->active_rcu_lock) + if (old->active_rcu_locks != cur->active_rcu_locks) return false; if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap)) @@ -19144,7 +19555,7 @@ 
static int propagate_precision(struct bpf_verifier_env *env, bt_set_frame_slot(&env->bt, fr, i); first = false; } - if (!first) + if (!first && (env->log.level & BPF_LOG_LEVEL2)) verbose(env, "\n"); } @@ -19784,9 +20195,6 @@ static int process_bpf_exit_full(struct bpf_verifier_env *env, return PROCESS_BPF_EXIT; if (env->cur_state->curframe) { - err = bpf_update_live_stack(env); - if (err) - return err; /* exit from nested function */ err = prepare_func_exit(env, &env->insn_idx); if (err) @@ -19801,6 +20209,99 @@ static int process_bpf_exit_full(struct bpf_verifier_env *env, return PROCESS_BPF_EXIT; } +static int indirect_jump_min_max_index(struct bpf_verifier_env *env, + int regno, + struct bpf_map *map, + u32 *pmin_index, u32 *pmax_index) +{ + struct bpf_reg_state *reg = reg_state(env, regno); + u64 min_index, max_index; + const u32 size = 8; + + if (check_add_overflow(reg->umin_value, reg->off, &min_index) || + (min_index > (u64) U32_MAX * size)) { + verbose(env, "the sum of R%u umin_value %llu and off %u is too big\n", + regno, reg->umin_value, reg->off); + return -ERANGE; + } + if (check_add_overflow(reg->umax_value, reg->off, &max_index) || + (max_index > (u64) U32_MAX * size)) { + verbose(env, "the sum of R%u umax_value %llu and off %u is too big\n", + regno, reg->umax_value, reg->off); + return -ERANGE; + } + + min_index /= size; + max_index /= size; + + if (max_index >= map->max_entries) { + verbose(env, "R%u points to outside of jump table: [%llu,%llu] max_entries %u\n", + regno, min_index, max_index, map->max_entries); + return -EINVAL; + } + + *pmin_index = min_index; + *pmax_index = max_index; + return 0; +} + +/* gotox *dst_reg */ +static int check_indirect_jump(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + struct bpf_verifier_state *other_branch; + struct bpf_reg_state *dst_reg; + struct bpf_map *map; + u32 min_index, max_index; + int err = 0; + int n; + int i; + + dst_reg = reg_state(env, insn->dst_reg); + if (dst_reg->type != 
PTR_TO_INSN) { + verbose(env, "R%d has type %s, expected PTR_TO_INSN\n", + insn->dst_reg, reg_type_str(env, dst_reg->type)); + return -EINVAL; + } + + map = dst_reg->map_ptr; + if (verifier_bug_if(!map, env, "R%d has an empty map pointer", insn->dst_reg)) + return -EFAULT; + + if (verifier_bug_if(map->map_type != BPF_MAP_TYPE_INSN_ARRAY, env, + "R%d has incorrect map type %d", insn->dst_reg, map->map_type)) + return -EFAULT; + + err = indirect_jump_min_max_index(env, insn->dst_reg, map, &min_index, &max_index); + if (err) + return err; + + /* Ensure that the buffer is large enough */ + if (!env->gotox_tmp_buf || env->gotox_tmp_buf->cnt < max_index - min_index + 1) { + env->gotox_tmp_buf = iarray_realloc(env->gotox_tmp_buf, + max_index - min_index + 1); + if (!env->gotox_tmp_buf) + return -ENOMEM; + } + + n = copy_insn_array_uniq(map, min_index, max_index, env->gotox_tmp_buf->items); + if (n < 0) + return n; + if (n == 0) { + verbose(env, "register R%d doesn't point to any offset in map id=%d\n", + insn->dst_reg, map->id); + return -EINVAL; + } + + for (i = 0; i < n - 1; i++) { + other_branch = push_stack(env, env->gotox_tmp_buf->items[i], + env->insn_idx, env->cur_state->speculative); + if (IS_ERR(other_branch)) + return PTR_ERR(other_branch); + } + env->insn_idx = env->gotox_tmp_buf->items[n-1]; + return 0; +} + static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) { int err; @@ -19903,6 +20404,15 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) mark_reg_scratched(env, BPF_REG_0); } else if (opcode == BPF_JA) { + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->src_reg != BPF_REG_0 || + insn->imm != 0 || insn->off != 0) { + verbose(env, "BPF_JA|BPF_X uses reserved fields\n"); + return -EINVAL; + } + return check_indirect_jump(env, insn); + } + if (BPF_SRC(insn->code) != BPF_K || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0 || @@ -20419,6 +20929,7 @@ static int check_map_prog_compatibility(struct 
bpf_verifier_env *env, case BPF_MAP_TYPE_QUEUE: case BPF_MAP_TYPE_STACK: case BPF_MAP_TYPE_ARENA: + case BPF_MAP_TYPE_INSN_ARRAY: break; default: verbose(env, @@ -20490,6 +21001,15 @@ static int __add_used_map(struct bpf_verifier_env *env, struct bpf_map *map) env->used_maps[env->used_map_cnt++] = map; + if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) { + err = bpf_insn_array_init(map, env->prog); + if (err) { + verbose(env, "Failed to properly initialize insn array\n"); + return err; + } + env->insn_array_maps[env->insn_array_map_cnt++] = map; + } + return env->used_map_cnt - 1; } @@ -20736,6 +21256,33 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len } } +static void release_insn_arrays(struct bpf_verifier_env *env) +{ + int i; + + for (i = 0; i < env->insn_array_map_cnt; i++) + bpf_insn_array_release(env->insn_array_maps[i]); +} + +static void adjust_insn_arrays(struct bpf_verifier_env *env, u32 off, u32 len) +{ + int i; + + if (len == 1) + return; + + for (i = 0; i < env->insn_array_map_cnt; i++) + bpf_insn_array_adjust(env->insn_array_maps[i], off, len); +} + +static void adjust_insn_arrays_after_remove(struct bpf_verifier_env *env, u32 off, u32 len) +{ + int i; + + for (i = 0; i < env->insn_array_map_cnt; i++) + bpf_insn_array_adjust_after_remove(env->insn_array_maps[i], off, len); +} + static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len) { struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab; @@ -20777,6 +21324,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of } adjust_insn_aux_data(env, new_prog, off, len); adjust_subprog_starts(env, off, len); + adjust_insn_arrays(env, off, len); adjust_poke_descs(new_prog, off, len); return new_prog; } @@ -20939,6 +21487,27 @@ static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off, return 0; } +/* + * Clean up dynamically allocated fields of aux data for instructions [start, ...] 
+ */ +static void clear_insn_aux_data(struct bpf_verifier_env *env, int start, int len) +{ + struct bpf_insn_aux_data *aux_data = env->insn_aux_data; + struct bpf_insn *insns = env->prog->insnsi; + int end = start + len; + int i; + + for (i = start; i < end; i++) { + if (aux_data[i].jt) { + kvfree(aux_data[i].jt); + aux_data[i].jt = NULL; + } + + if (bpf_is_ldimm64(&insns[i])) + i++; + } +} + static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) { struct bpf_insn_aux_data *aux_data = env->insn_aux_data; @@ -20948,6 +21517,9 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) if (bpf_prog_is_offloaded(env->prog->aux)) bpf_prog_offload_remove_insns(env, off, cnt); + /* Should be called before bpf_remove_insns, as it uses prog->insnsi */ + clear_insn_aux_data(env, off, cnt); + err = bpf_remove_insns(env->prog, off, cnt); if (err) return err; @@ -20960,6 +21532,8 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) if (err) return err; + adjust_insn_arrays_after_remove(env, off, cnt); + memmove(aux_data + off, aux_data + off + cnt, sizeof(*aux_data) * (orig_prog_len - off - cnt)); @@ -21499,6 +22073,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) struct bpf_insn *insn; void *old_bpf_func; int err, num_exentries; + int old_len, subprog_start_adjustment = 0; if (env->subprog_cnt <= 1) return 0; @@ -21573,6 +22148,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->func_idx = i; /* Below members will be freed only at prog->aux */ func[i]->aux->btf = prog->aux->btf; + func[i]->aux->subprog_start = subprog_start + subprog_start_adjustment; func[i]->aux->func_info = prog->aux->func_info; func[i]->aux->func_info_cnt = prog->aux->func_info_cnt; func[i]->aux->poke_tab = prog->aux->poke_tab; @@ -21602,6 +22178,8 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->jited_linfo = prog->aux->jited_linfo; func[i]->aux->linfo_idx = 
env->subprog_info[i].linfo_idx; func[i]->aux->arena = prog->aux->arena; + func[i]->aux->used_maps = env->used_maps; + func[i]->aux->used_map_cnt = env->used_map_cnt; num_exentries = 0; insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { @@ -21626,7 +22204,15 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->might_sleep = env->subprog_info[i].might_sleep; if (!i) func[i]->aux->exception_boundary = env->seen_exception; + + /* + * To properly pass the absolute subprog start to jit + * all instruction adjustments should be accumulated + */ + old_len = func[i]->len; func[i] = bpf_int_jit_compile(func[i]); + subprog_start_adjustment += func[i]->len - old_len; + if (!func[i]->jited) { err = -ENOTSUPP; goto out_free; @@ -21679,6 +22265,15 @@ static int jit_subprogs(struct bpf_verifier_env *env) cond_resched(); } + /* + * Cleanup func[i]->aux fields which aren't required + * or can become invalid in future + */ + for (i = 0; i < env->subprog_cnt; i++) { + func[i]->aux->used_maps = NULL; + func[i]->aux->used_map_cnt = 0; + } + /* finally lock prog and jit images for all functions and * populate kallsysm. Begin at the first subprogram, since * bpf_prog_load will add the kallsyms for the main program. 
@@ -21808,46 +22403,47 @@ static int fixup_call_args(struct bpf_verifier_env *env) } /* replace a generic kfunc with a specialized version if necessary */ -static void specialize_kfunc(struct bpf_verifier_env *env, - u32 func_id, u16 offset, unsigned long *addr) +static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc, int insn_idx) { struct bpf_prog *prog = env->prog; bool seen_direct_write; void *xdp_kfunc; bool is_rdonly; + u32 func_id = desc->func_id; + u16 offset = desc->offset; + unsigned long addr = desc->addr; + + if (offset) /* return if module BTF is used */ + return 0; if (bpf_dev_bound_kfunc_id(func_id)) { xdp_kfunc = bpf_dev_bound_resolve_kfunc(prog, func_id); - if (xdp_kfunc) { - *addr = (unsigned long)xdp_kfunc; - return; - } + if (xdp_kfunc) + addr = (unsigned long)xdp_kfunc; /* fallback to default kfunc when not supported by netdev */ - } - - if (offset) - return; - - if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) { + } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) { seen_direct_write = env->seen_direct_write; is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE); if (is_rdonly) - *addr = (unsigned long)bpf_dynptr_from_skb_rdonly; + addr = (unsigned long)bpf_dynptr_from_skb_rdonly; /* restore env->seen_direct_write to its original value, since * may_access_direct_pkt_data mutates it */ env->seen_direct_write = seen_direct_write; + } else if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr]) { + if (bpf_lsm_has_d_inode_locked(prog)) + addr = (unsigned long)bpf_set_dentry_xattr_locked; + } else if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr]) { + if (bpf_lsm_has_d_inode_locked(prog)) + addr = (unsigned long)bpf_remove_dentry_xattr_locked; + } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) { + if (!env->insn_aux_data[insn_idx].non_sleepable) + addr = (unsigned long)bpf_dynptr_from_file_sleepable; } - - if (func_id == 
special_kfunc_list[KF_bpf_set_dentry_xattr] && - bpf_lsm_has_d_inode_locked(prog)) - *addr = (unsigned long)bpf_set_dentry_xattr_locked; - - if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr] && - bpf_lsm_has_d_inode_locked(prog)) - *addr = (unsigned long)bpf_remove_dentry_xattr_locked; + desc->addr = addr; + return 0; } static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux, @@ -21870,7 +22466,8 @@ static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux, static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn *insn_buf, int insn_idx, int *cnt) { - const struct bpf_kfunc_desc *desc; + struct bpf_kfunc_desc *desc; + int err; if (!insn->imm) { verbose(env, "invalid kernel function call not eliminated in verifier pass\n"); @@ -21890,6 +22487,10 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EFAULT; } + err = specialize_kfunc(env, desc, insn_idx); + if (err) + return err; + if (!bpf_jit_supports_far_kfunc_call()) insn->imm = BPF_CALL_IMM(desc->addr); if (insn->off) @@ -22485,8 +23086,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) } if (is_storage_get_function(insn->imm)) { - if (!in_sleepable(env) || - env->insn_aux_data[i + delta].storage_get_func_atomic) + if (env->insn_aux_data[i + delta].non_sleepable) insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC); else insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL); @@ -22919,7 +23519,9 @@ next_insn: } } - sort_kfunc_descs_by_imm_off(env->prog); + ret = sort_kfunc_descs_by_imm_off(env); + if (ret) + return ret; return 0; } @@ -23156,6 +23758,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) state->curframe = 0; state->speculative = false; state->branches = 1; + state->in_sleepable = env->prog->sleepable; state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL_ACCOUNT); if (!state->frame[0]) { 
kfree(state); @@ -23175,7 +23778,8 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) struct bpf_subprog_arg_info *arg; struct bpf_reg_state *reg; - verbose(env, "Validating %s() func#%d...\n", sub_name, subprog); + if (env->log.level & BPF_LOG_LEVEL) + verbose(env, "Validating %s() func#%d...\n", sub_name, subprog); ret = btf_prepare_func_args(env, subprog); if (ret) goto out; @@ -24278,14 +24882,13 @@ static int compute_live_registers(struct bpf_verifier_env *env) for (i = 0; i < env->cfg.cur_postorder; ++i) { int insn_idx = env->cfg.insn_postorder[i]; struct insn_live_regs *live = &state[insn_idx]; - int succ_num; - u32 succ[2]; + struct bpf_iarray *succ; u16 new_out = 0; u16 new_in = 0; - succ_num = bpf_insn_successors(env->prog, insn_idx, succ); - for (int s = 0; s < succ_num; ++s) - new_out |= state[succ[s]].in; + succ = bpf_insn_successors(env, insn_idx); + for (int s = 0; s < succ->cnt; ++s) + new_out |= state[succ->items[s]].in; new_in = (new_out & ~live->def) | live->use; if (new_out != live->out || new_in != live->in) { live->in = new_in; @@ -24338,11 +24941,11 @@ static int compute_scc(struct bpf_verifier_env *env) const u32 insn_cnt = env->prog->len; int stack_sz, dfs_sz, err = 0; u32 *stack, *pre, *low, *dfs; - u32 succ_cnt, i, j, t, w; + u32 i, j, t, w; u32 next_preorder_num; u32 next_scc_id; bool assign_scc; - u32 succ[2]; + struct bpf_iarray *succ; next_preorder_num = 1; next_scc_id = 1; @@ -24449,12 +25052,12 @@ dfs_continue: stack[stack_sz++] = w; } /* Visit 'w' successors */ - succ_cnt = bpf_insn_successors(env->prog, w, succ); - for (j = 0; j < succ_cnt; ++j) { - if (pre[succ[j]]) { - low[w] = min(low[w], low[succ[j]]); + succ = bpf_insn_successors(env, w); + for (j = 0; j < succ->cnt; ++j) { + if (pre[succ->items[j]]) { + low[w] = min(low[w], low[succ->items[j]]); } else { - dfs[dfs_sz++] = succ[j]; + dfs[dfs_sz++] = succ->items[j]; goto dfs_continue; } } @@ -24471,8 +25074,8 @@ dfs_continue: * or if component has a self 
reference. */ assign_scc = stack[stack_sz - 1] != w; - for (j = 0; j < succ_cnt; ++j) { - if (succ[j] == w) { + for (j = 0; j < succ->cnt; ++j) { + if (succ->items[j] == w) { assign_scc = true; break; } @@ -24534,6 +25137,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 goto err_free_env; for (i = 0; i < len; i++) env->insn_aux_data[i].orig_idx = i; + env->succ = iarray_realloc(NULL, 2); + if (!env->succ) + goto err_free_env; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; @@ -24757,6 +25363,8 @@ skip_full_check: adjust_btf_func(env); err_release_maps: + if (ret) + release_insn_arrays(env); if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_used_maps() will release them. @@ -24777,11 +25385,14 @@ err_release_maps: err_unlock: if (!is_priv) mutex_unlock(&bpf_verifier_lock); + clear_insn_aux_data(env, 0, env->prog->len); vfree(env->insn_aux_data); err_free_env: bpf_stack_liveness_free(env); kvfree(env->cfg.insn_postorder); kvfree(env->scc_info); + kvfree(env->succ); + kvfree(env->gotox_tmp_buf); kvfree(env); return ret; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index ae1eb7a85eb4..e717208cfb18 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -60,6 +60,7 @@ #include <linux/sched/deadline.h> #include <linux/psi.h> #include <linux/nstree.h> +#include <linux/irq_work.h> #include <net/sock.h> #define CREATE_TRACE_POINTS @@ -287,6 +288,7 @@ static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add); +static void cgroup_rt_init(void); #ifdef CONFIG_DEBUG_CGROUP_REF #define CGROUP_REF_FN_ATTRS noinline @@ -941,7 +943,8 @@ static void css_set_move_task(struct task_struct *task, /* * We are synchronized through cgroup_threadgroup_rwsem * against PF_EXITING setting such that we can't race - * against 
cgroup_exit()/cgroup_free() dropping the css_set. + * against cgroup_task_dead()/cgroup_task_free() dropping + * the css_set. */ WARN_ON_ONCE(task->flags & PF_EXITING); @@ -4701,6 +4704,7 @@ void cgroup_file_notify(struct cgroup_file *cfile) } spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } +EXPORT_SYMBOL_GPL(cgroup_file_notify); /** * cgroup_file_show - show or hide a hidden cgroup file @@ -6354,6 +6358,7 @@ int __init cgroup_init(void) BUG_ON(ss_rstat_init(NULL)); get_user_ns(init_cgroup_ns.user_ns); + cgroup_rt_init(); cgroup_lock(); @@ -6967,19 +6972,29 @@ void cgroup_post_fork(struct task_struct *child, } /** - * cgroup_exit - detach cgroup from exiting task + * cgroup_task_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process * * Description: Detach cgroup from @tsk. * */ -void cgroup_exit(struct task_struct *tsk) +void cgroup_task_exit(struct task_struct *tsk) { struct cgroup_subsys *ss; - struct css_set *cset; int i; - spin_lock_irq(&css_set_lock); + /* see cgroup_post_fork() for details */ + do_each_subsys_mask(ss, i, have_exit_callback) { + ss->exit(tsk); + } while_each_subsys_mask(); +} + +static void do_cgroup_task_dead(struct task_struct *tsk) +{ + struct css_set *cset; + unsigned long flags; + + spin_lock_irqsave(&css_set_lock, flags); WARN_ON_ONCE(list_empty(&tsk->cg_list)); cset = task_css_set(tsk); @@ -6997,15 +7012,61 @@ void cgroup_exit(struct task_struct *tsk) test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags))) cgroup_update_frozen(task_dfl_cgroup(tsk)); - spin_unlock_irq(&css_set_lock); + spin_unlock_irqrestore(&css_set_lock, flags); +} - /* see cgroup_post_fork() for details */ - do_each_subsys_mask(ss, i, have_exit_callback) { - ss->exit(tsk); - } while_each_subsys_mask(); +#ifdef CONFIG_PREEMPT_RT +/* + * cgroup_task_dead() is called from finish_task_switch() which doesn't allow + * scheduling even in RT. 
As the task_dead path requires grabbing css_set_lock, + * this lead to sleeping in the invalid context warning bug. css_set_lock is too + * big to become a raw_spinlock. The task_dead path doesn't need to run + * synchronously but can't be delayed indefinitely either as the dead task pins + * the cgroup and task_struct can be pinned indefinitely. Bounce through lazy + * irq_work to allow batching while ensuring timely completion. + */ +static DEFINE_PER_CPU(struct llist_head, cgrp_dead_tasks); +static DEFINE_PER_CPU(struct irq_work, cgrp_dead_tasks_iwork); + +static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork) +{ + struct llist_node *lnode; + struct task_struct *task, *next; + + lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks)); + llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) { + do_cgroup_task_dead(task); + put_task_struct(task); + } +} + +static void __init cgroup_rt_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + init_llist_head(per_cpu_ptr(&cgrp_dead_tasks, cpu)); + per_cpu(cgrp_dead_tasks_iwork, cpu) = + IRQ_WORK_INIT_LAZY(cgrp_dead_tasks_iwork_fn); + } +} + +void cgroup_task_dead(struct task_struct *task) +{ + get_task_struct(task); + llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks)); + irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork)); } +#else /* CONFIG_PREEMPT_RT */ +static void __init cgroup_rt_init(void) {} -void cgroup_release(struct task_struct *task) +void cgroup_task_dead(struct task_struct *task) +{ + do_cgroup_task_dead(task); +} +#endif /* CONFIG_PREEMPT_RT */ + +void cgroup_task_release(struct task_struct *task) { struct cgroup_subsys *ss; int ssid; @@ -7013,6 +7074,11 @@ void cgroup_release(struct task_struct *task) do_each_subsys_mask(ss, ssid, have_release_callback) { ss->release(task); } while_each_subsys_mask(); +} + +void cgroup_task_free(struct task_struct *task) +{ + struct css_set *cset = task_css_set(task); if (!list_empty(&task->cg_list)) { spin_lock_irq(&css_set_lock); @@ 
-7020,11 +7086,7 @@ void cgroup_release(struct task_struct *task) list_del_init(&task->cg_list); spin_unlock_irq(&css_set_lock); } -} -void cgroup_free(struct task_struct *task) -{ - struct css_set *cset = task_css_set(task); put_css_set(cset); } diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 337608f408ce..01976c8e7d49 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -155,13 +155,17 @@ struct cpuset { /* for custom sched domain */ int relax_domain_level; - /* number of valid local child partitions */ - int nr_subparts; - /* partition root state */ int partition_root_state; /* + * Whether cpuset is a remote partition. + * It used to be a list anchoring all remote partitions — we can switch back + * to a list if we need to iterate over the remote partitions. + */ + bool remote_partition; + + /* * number of SCHED_DEADLINE tasks attached to this cpuset, so that we * know when to rebuild associated root domain bandwidth information. */ @@ -175,9 +179,6 @@ struct cpuset { /* Handle for cpuset.cpus.partition */ struct cgroup_file partition_file; - /* Remote partition silbling list anchored at remote_children */ - struct list_head remote_sibling; - /* Used to merge intersecting subsets for generate_sched_domains */ struct uf_node node; }; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4aaad07b0bd1..6e6eb09b8db6 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -82,14 +82,18 @@ static cpumask_var_t subpartitions_cpus; static cpumask_var_t isolated_cpus; /* + * isolated_cpus updating flag (protected by cpuset_mutex) + * Set if isolated_cpus is going to be updated in the current + * cpuset_mutex critical section.
+ */ +static bool isolated_cpus_updating; + +/* * Housekeeping (HK_TYPE_DOMAIN) CPUs at boot */ static cpumask_var_t boot_hk_cpus; static bool have_boot_isolcpus; -/* List of remote partition root children */ -static struct list_head remote_children; - /* * A flag to force sched domain rebuild at the end of an operation. * It can be set in @@ -212,7 +216,7 @@ static struct cpuset top_cpuset = { BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, .relax_domain_level = -1, - .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling), + .remote_partition = false, }; /* @@ -352,33 +356,55 @@ static inline bool is_in_v2_mode(void) (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); } +static inline bool cpuset_is_populated(struct cpuset *cs) +{ + lockdep_assert_held(&cpuset_mutex); + + /* Cpusets in the process of attaching should be considered as populated */ + return cgroup_is_populated(cs->css.cgroup) || + cs->attach_in_progress; +} + /** * partition_is_populated - check if partition has tasks * @cs: partition root to be checked * @excluded_child: a child cpuset to be excluded in task checking * Return: true if there are tasks, false otherwise * - * It is assumed that @cs is a valid partition root. @excluded_child should - * be non-NULL when this cpuset is going to become a partition itself. + * @cs should be a valid partition root or going to become a partition root. + * @excluded_child should be non-NULL when this cpuset is going to become a + * partition itself. + * + * Note that a remote partition is not allowed underneath a valid local + * or remote partition. So if a non-partition root child is populated, + * the whole partition is considered populated. 
*/ static inline bool partition_is_populated(struct cpuset *cs, struct cpuset *excluded_child) { - struct cgroup_subsys_state *css; - struct cpuset *child; + struct cpuset *cp; + struct cgroup_subsys_state *pos_css; - if (cs->css.cgroup->nr_populated_csets) + /* + * We cannot call cpuset_is_populated(cs) directly, as + * nr_populated_domain_children may include populated + * csets from descendants that are partitions. + */ + if (cs->css.cgroup->nr_populated_csets || + cs->attach_in_progress) return true; - if (!excluded_child && !cs->nr_subparts) - return cgroup_is_populated(cs->css.cgroup); rcu_read_lock(); - cpuset_for_each_child(child, css, cs) { - if (child == excluded_child) + cpuset_for_each_descendant_pre(cp, pos_css, cs) { + if (cp == cs || cp == excluded_child) continue; - if (is_partition_valid(child)) + + if (is_partition_valid(cp)) { + pos_css = css_rightmost_descendant(pos_css); continue; - if (cgroup_is_populated(child->css.cgroup)) { + } + + if (cpuset_is_populated(cp)) { rcu_read_unlock(); return true; } @@ -663,7 +689,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) * be changed to have empty cpus_allowed or mems_allowed.
*/ ret = -ENOSPC; - if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) { + if (cpuset_is_populated(cur)) { if (!cpumask_empty(cur->cpus_allowed) && cpumask_empty(trial->cpus_allowed)) goto out; @@ -1302,7 +1328,6 @@ static void reset_partition_data(struct cpuset *cs) lockdep_assert_held(&callback_lock); - cs->nr_subparts = 0; if (cpumask_empty(cs->exclusive_cpus)) { cpumask_clear(cs->effective_xcpus); if (is_cpu_exclusive(cs)) @@ -1325,6 +1350,8 @@ static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus cpumask_or(isolated_cpus, isolated_cpus, xcpus); else cpumask_andnot(isolated_cpus, isolated_cpus, xcpus); + + isolated_cpus_updating = true; } /* @@ -1332,15 +1359,12 @@ static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus * @new_prs: new partition_root_state * @parent: parent cpuset * @xcpus: exclusive CPUs to be added - * Return: true if isolated_cpus modified, false otherwise * * Remote partition if parent == NULL */ -static bool partition_xcpus_add(int new_prs, struct cpuset *parent, +static void partition_xcpus_add(int new_prs, struct cpuset *parent, struct cpumask *xcpus) { - bool isolcpus_updated; - WARN_ON_ONCE(new_prs < 0); lockdep_assert_held(&callback_lock); if (!parent) @@ -1350,13 +1374,11 @@ static bool partition_xcpus_add(int new_prs, struct cpuset *parent, if (parent == &top_cpuset) cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus); - isolcpus_updated = (new_prs != parent->partition_root_state); - if (isolcpus_updated) + if (new_prs != parent->partition_root_state) isolated_cpus_update(parent->partition_root_state, new_prs, xcpus); cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus); - return isolcpus_updated; } /* @@ -1364,15 +1386,12 @@ static bool partition_xcpus_add(int new_prs, struct cpuset *parent, * @old_prs: old partition_root_state * @parent: parent cpuset * @xcpus: exclusive CPUs to be removed - * Return: true if isolated_cpus modified, 
false otherwise * * Remote partition if parent == NULL */ -static bool partition_xcpus_del(int old_prs, struct cpuset *parent, +static void partition_xcpus_del(int old_prs, struct cpuset *parent, struct cpumask *xcpus) { - bool isolcpus_updated; - WARN_ON_ONCE(old_prs < 0); lockdep_assert_held(&callback_lock); if (!parent) @@ -1381,30 +1400,95 @@ static bool partition_xcpus_del(int old_prs, struct cpuset *parent, if (parent == &top_cpuset) cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus); - isolcpus_updated = (old_prs != parent->partition_root_state); - if (isolcpus_updated) + if (old_prs != parent->partition_root_state) isolated_cpus_update(old_prs, parent->partition_root_state, xcpus); cpumask_and(xcpus, xcpus, cpu_active_mask); cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus); - return isolcpus_updated; } -static void update_isolation_cpumasks(bool isolcpus_updated) +/* + * isolated_cpus_can_update - check for isolated & nohz_full conflicts + * @add_cpus: cpu mask for cpus that are going to be isolated + * @del_cpus: cpu mask for cpus that are no longer isolated, can be NULL + * Return: false if there is conflict, true otherwise + * + * If nohz_full is enabled and we have isolated CPUs, their combination must + * still leave housekeeping CPUs. + * + * TBD: Should consider merging this function into + * prstate_housekeeping_conflict(). 
+ */ +static bool isolated_cpus_can_update(struct cpumask *add_cpus, + struct cpumask *del_cpus) { - int ret; + cpumask_var_t full_hk_cpus; + int res = true; - lockdep_assert_cpus_held(); + if (!housekeeping_enabled(HK_TYPE_KERNEL_NOISE)) + return true; - if (!isolcpus_updated) + if (del_cpus && cpumask_weight_and(del_cpus, + housekeeping_cpumask(HK_TYPE_KERNEL_NOISE))) + return true; + + if (!alloc_cpumask_var(&full_hk_cpus, GFP_KERNEL)) + return false; + + cpumask_and(full_hk_cpus, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE), + housekeeping_cpumask(HK_TYPE_DOMAIN)); + cpumask_andnot(full_hk_cpus, full_hk_cpus, isolated_cpus); + cpumask_and(full_hk_cpus, full_hk_cpus, cpu_active_mask); + if (!cpumask_weight_andnot(full_hk_cpus, add_cpus)) + res = false; + + free_cpumask_var(full_hk_cpus); + return res; +} + +/* + * prstate_housekeeping_conflict - check for partition & housekeeping conflicts + * @prstate: partition root state to be checked + * @new_cpus: cpu mask + * Return: true if there is conflict, false otherwise + * + * CPUs outside of boot_hk_cpus, if defined, can only be used in an + * isolated partition. 
+ */ +static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) +{ + if (!have_boot_isolcpus) + return false; + + if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus)) + return true; + + return false; +} + +/* + * update_isolation_cpumasks - Update external isolation related CPU masks + * + * The following external CPU masks will be updated if necessary: + * - workqueue unbound cpumask + */ +static void update_isolation_cpumasks(void) +{ + int ret; + + if (!isolated_cpus_updating) return; + lockdep_assert_cpus_held(); + ret = workqueue_unbound_exclude_cpumask(isolated_cpus); WARN_ON_ONCE(ret < 0); ret = tmigr_isolated_exclude_cpumask(isolated_cpus); WARN_ON_ONCE(ret < 0); + + isolated_cpus_updating = false; } /** @@ -1508,7 +1592,7 @@ static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs) static inline bool is_remote_partition(struct cpuset *cs) { - return !list_empty(&cs->remote_sibling); + return cs->remote_partition; } static inline bool is_local_partition(struct cpuset *cs) @@ -1529,8 +1613,6 @@ static inline bool is_local_partition(struct cpuset *cs) static int remote_partition_enable(struct cpuset *cs, int new_prs, struct tmpmasks *tmp) { - bool isolcpus_updated; - /* * The user must have sysadmin privilege. 
*/ @@ -1552,13 +1634,17 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) || cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) return PERR_INVCPUS; + if (((new_prs == PRS_ISOLATED) && + !isolated_cpus_can_update(tmp->new_cpus, NULL)) || + prstate_housekeeping_conflict(new_prs, tmp->new_cpus)) + return PERR_HKEEPING; spin_lock_irq(&callback_lock); - isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus); - list_add(&cs->remote_sibling, &remote_children); + partition_xcpus_add(new_prs, NULL, tmp->new_cpus); + cs->remote_partition = true; cpumask_copy(cs->effective_xcpus, tmp->new_cpus); spin_unlock_irq(&callback_lock); - update_isolation_cpumasks(isolcpus_updated); + update_isolation_cpumasks(); cpuset_force_rebuild(); cs->prs_err = 0; @@ -1581,15 +1667,12 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, */ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) { - bool isolcpus_updated; - WARN_ON_ONCE(!is_remote_partition(cs)); WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); spin_lock_irq(&callback_lock); - list_del_init(&cs->remote_sibling); - isolcpus_updated = partition_xcpus_del(cs->partition_root_state, - NULL, cs->effective_xcpus); + cs->remote_partition = false; + partition_xcpus_del(cs->partition_root_state, NULL, cs->effective_xcpus); if (cs->prs_err) cs->partition_root_state = -cs->partition_root_state; else @@ -1599,7 +1682,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) compute_excpus(cs, cs->effective_xcpus); reset_partition_data(cs); spin_unlock_irq(&callback_lock); - update_isolation_cpumasks(isolcpus_updated); + update_isolation_cpumasks(); cpuset_force_rebuild(); /* @@ -1624,7 +1707,6 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, { bool adding, deleting; int prs = cs->partition_root_state; - int isolcpus_updated = 0; if 
(WARN_ON_ONCE(!is_remote_partition(cs))) return; @@ -1651,15 +1733,18 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) || cpumask_subset(top_cpuset.effective_cpus, tmp->addmask)) cs->prs_err = PERR_NOCPUS; + else if ((prs == PRS_ISOLATED) && + !isolated_cpus_can_update(tmp->addmask, tmp->delmask)) + cs->prs_err = PERR_HKEEPING; if (cs->prs_err) goto invalidate; } spin_lock_irq(&callback_lock); if (adding) - isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask); + partition_xcpus_add(prs, NULL, tmp->addmask); if (deleting) - isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask); + partition_xcpus_del(prs, NULL, tmp->delmask); /* * Need to update effective_xcpus and exclusive_cpus now as * update_sibling_cpumasks() below may iterate back to the same cs. @@ -1668,7 +1753,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, if (xcpus) cpumask_copy(cs->exclusive_cpus, xcpus); spin_unlock_irq(&callback_lock); - update_isolation_cpumasks(isolcpus_updated); + update_isolation_cpumasks(); if (adding || deleting) cpuset_force_rebuild(); @@ -1683,26 +1768,6 @@ invalidate: remote_partition_disable(cs, tmp); } -/* - * prstate_housekeeping_conflict - check for partition & housekeeping conflicts - * @prstate: partition root state to be checked - * @new_cpus: cpu mask - * Return: true if there is conflict, false otherwise - * - * CPUs outside of boot_hk_cpus, if defined, can only be used in an - * isolated partition. 
- */ -static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) -{ - if (!have_boot_isolcpus) - return false; - - if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus)) - return true; - - return false; -} - /** * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset * @cs: The cpuset that requests change in partition root state @@ -1749,9 +1814,8 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, int deleting; /* Deleting cpus from parent's effective_cpus */ int old_prs, new_prs; int part_error = PERR_NONE; /* Partition error? */ - int subparts_delta = 0; - int isolcpus_updated = 0; struct cpumask *xcpus = user_xcpus(cs); + int parent_prs = parent->partition_root_state; bool nocpu; lockdep_assert_held(&cpuset_mutex); @@ -1774,10 +1838,9 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if (is_partition_valid(parent)) adding = cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); - if (old_prs > 0) { + if (old_prs > 0) new_prs = -old_prs; - subparts_delta--; - } + goto write_error; } @@ -1816,6 +1879,10 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if (prstate_housekeeping_conflict(new_prs, xcpus)) return PERR_HKEEPING; + if ((new_prs == PRS_ISOLATED) && (new_prs != parent_prs) && + !isolated_cpus_can_update(xcpus, NULL)) + return PERR_HKEEPING; + if (tasks_nocpu_error(parent, cs, xcpus)) return PERR_NOCPUS; @@ -1832,7 +1899,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus)); deleting = true; - subparts_delta++; } else if (cmd == partcmd_disable) { /* * May need to add cpus back to parent's effective_cpus @@ -1843,7 +1909,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if (is_partition_valid(cs)) { cpumask_copy(tmp->addmask, cs->effective_xcpus); adding = true; - subparts_delta--; } new_prs = 
PRS_MEMBER; } else if (newmask) { @@ -1871,6 +1936,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, * * For invalid partition: * delmask = newmask & parent->effective_xcpus + * The partition may become valid soon. */ if (is_partition_invalid(cs)) { adding = false; @@ -1885,6 +1951,23 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, deleting = cpumask_and(tmp->delmask, tmp->delmask, parent->effective_xcpus); } + + /* + * TBD: Invalidate a currently valid child root partition may + * still break isolated_cpus_can_update() rule if parent is an + * isolated partition. + */ + if (is_partition_valid(cs) && (old_prs != parent_prs)) { + if ((parent_prs == PRS_ROOT) && + /* Adding to parent means removing isolated CPUs */ + !isolated_cpus_can_update(tmp->delmask, tmp->addmask)) + part_error = PERR_HKEEPING; + if ((parent_prs == PRS_ISOLATED) && + /* Adding to parent means adding isolated CPUs */ + !isolated_cpus_can_update(tmp->addmask, tmp->delmask)) + part_error = PERR_HKEEPING; + } + /* * The new CPUs to be removed from parent's effective CPUs * must be present. @@ -1966,17 +2049,13 @@ write_error: switch (cs->partition_root_state) { case PRS_ROOT: case PRS_ISOLATED: - if (part_error) { + if (part_error) new_prs = -old_prs; - subparts_delta--; - } break; case PRS_INVALID_ROOT: case PRS_INVALID_ISOLATED: - if (!part_error) { + if (!part_error) new_prs = -old_prs; - subparts_delta++; - } break; } } @@ -2005,28 +2084,20 @@ write_error: * newly deleted ones will be added back to effective_cpus. */ spin_lock_irq(&callback_lock); - if (old_prs != new_prs) { + if (old_prs != new_prs) cs->partition_root_state = new_prs; - if (new_prs <= 0) - cs->nr_subparts = 0; - } + /* * Adding to parent's effective_cpus means deletion CPUs from cs * and vice versa. 
*/ if (adding) - isolcpus_updated += partition_xcpus_del(old_prs, parent, - tmp->addmask); + partition_xcpus_del(old_prs, parent, tmp->addmask); if (deleting) - isolcpus_updated += partition_xcpus_add(new_prs, parent, - tmp->delmask); + partition_xcpus_add(new_prs, parent, tmp->delmask); - if (is_partition_valid(parent)) { - parent->nr_subparts += subparts_delta; - WARN_ON_ONCE(parent->nr_subparts < 0); - } spin_unlock_irq(&callback_lock); - update_isolation_cpumasks(isolcpus_updated); + update_isolation_cpumasks(); if ((old_prs != new_prs) && (cmd == partcmd_update)) update_partition_exclusive_flag(cs, new_prs); @@ -2108,8 +2179,6 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, */ spin_lock_irq(&callback_lock); make_partition_invalid(child); - cs->nr_subparts--; - child->nr_subparts = 0; spin_unlock_irq(&callback_lock); notify_partition_change(child, old_prs); continue; @@ -2138,7 +2207,6 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, { struct cpuset *cp; struct cgroup_subsys_state *pos_css; - bool need_rebuild_sched_domains = false; int old_prs, new_prs; rcu_read_lock(); @@ -2302,15 +2370,12 @@ get_css: if (!cpumask_empty(cp->cpus_allowed) && is_sched_load_balance(cp) && (!cpuset_v2() || is_partition_valid(cp))) - need_rebuild_sched_domains = true; + cpuset_force_rebuild(); rcu_read_lock(); css_put(&cp->css); } rcu_read_unlock(); - - if (need_rebuild_sched_domains) - cpuset_force_rebuild(); } /** @@ -2848,21 +2913,19 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, */ retval = nodelist_parse(buf, trialcs->mems_allowed); if (retval < 0) - goto done; + return retval; if (!nodes_subset(trialcs->mems_allowed, - top_cpuset.mems_allowed)) { - retval = -EINVAL; - goto done; - } + top_cpuset.mems_allowed)) + return -EINVAL; + + /* No change? 
nothing to do */ + if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) + return 0; - if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) { - retval = 0; /* Too easy - nothing to do */ - goto done; - } retval = validate_change(cs, trialcs); if (retval < 0) - goto done; + return retval; check_insane_mems_config(&trialcs->mems_allowed); @@ -2872,8 +2935,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, /* use trialcs->mems_allowed as a temp variable */ update_nodemasks_hier(cs, &trialcs->mems_allowed); -done: - return retval; + return 0; } bool current_cpuset_is_being_rebound(void) @@ -3011,7 +3073,12 @@ static int update_prstate(struct cpuset *cs, int new_prs) * A change in load balance state only, no change in cpumasks. * Need to update isolated_cpus. */ - isolcpus_updated = true; + if (((new_prs == PRS_ISOLATED) && + !isolated_cpus_can_update(cs->effective_xcpus, NULL)) || + prstate_housekeeping_conflict(new_prs, cs->effective_xcpus)) + err = PERR_HKEEPING; + else + isolcpus_updated = true; } else { /* * Switching back to member is always allowed even if it @@ -3046,7 +3113,7 @@ out: else if (isolcpus_updated) isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus); spin_unlock_irq(&callback_lock); - update_isolation_cpumasks(isolcpus_updated); + update_isolation_cpumasks(); /* Force update if switching back to member & update effective_xcpus */ update_cpumasks_hier(cs, &tmpmask, !new_prs); @@ -3552,7 +3619,6 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; - INIT_LIST_HEAD(&cs->remote_sibling); /* Set CS_MEMORY_MIGRATE for default hierarchy */ if (cpuset_v2()) @@ -3823,7 +3889,6 @@ int __init cpuset_init(void) nodes_setall(top_cpuset.effective_mems); fmeter_init(&top_cpuset.fmeter); - INIT_LIST_HEAD(&remote_children); BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); @@ -4024,7 +4089,6 @@ static void 
cpuset_handle_hotplug(void) */ if (!cpumask_empty(subpartitions_cpus)) { if (cpumask_subset(&new_cpus, subpartitions_cpus)) { - top_cpuset.nr_subparts = 0; cpumask_clear(subpartitions_cpus); } else { cpumask_andnot(&new_cpus, &new_cpus, @@ -4119,24 +4183,13 @@ void __init cpuset_init_smp(void) BUG_ON(!cpuset_migrate_mm_wq); } -/** - * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. - * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. - * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. - * - * Description: Returns the cpumask_var_t cpus_allowed of the cpuset - * attached to the specified @tsk. Guaranteed to return some non-empty - * subset of cpu_active_mask, even if this means going outside the - * tasks cpuset, except when the task is in the top cpuset. - **/ - -void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) +/* + * Return cpus_allowed mask from a task's cpuset. + */ +static void __cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask) { - unsigned long flags; struct cpuset *cs; - spin_lock_irqsave(&callback_lock, flags); - cs = task_cs(tsk); if (cs != &top_cpuset) guarantee_active_cpus(tsk, pmask); @@ -4156,7 +4209,39 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) if (!cpumask_intersects(pmask, cpu_active_mask)) cpumask_copy(pmask, possible_mask); } +} +/** + * cpuset_cpus_allowed_locked - return cpus_allowed mask from a task's cpuset. + * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. + * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. + * + * Similar to cpuset_cpus_allowed() except that the caller must have acquired + * cpuset_mutex.
+ */ +void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask) +{ + lockdep_assert_held(&cpuset_mutex); + __cpuset_cpus_allowed_locked(tsk, pmask); +} + +/** + * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset. + * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. + * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. + * + * Description: Returns the cpumask_var_t cpus_allowed of the cpuset + * attached to the specified @tsk. Guaranteed to return some non-empty + * subset of cpu_active_mask, even if this means going outside the + * tasks cpuset, except when the task is in the top cpuset. + **/ + +void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) +{ + unsigned long flags; + + spin_lock_irqsave(&callback_lock, flags); + __cpuset_cpus_allowed_locked(tsk, pmask); spin_unlock_irqrestore(&callback_lock, flags); } diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config index e81327d2cd63..9f6ab7dabf67 100644 --- a/kernel/configs/debug.config +++ b/kernel/configs/debug.config @@ -83,7 +83,7 @@ CONFIG_SLUB_DEBUG_ON=y # # Debug Oops, Lockups and Hangs # -# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +CONFIG_BOOTPARAM_HUNG_TASK_PANIC=0 # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set CONFIG_DEBUG_ATOMIC_SLEEP=y CONFIG_DETECT_HUNG_TASK=y diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index b0f0d15085db..7481fbb947d3 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -173,7 +173,7 @@ int cpu_cluster_pm_exit(void) EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit); #ifdef CONFIG_PM -static int cpu_pm_suspend(void) +static int cpu_pm_suspend(void *data) { int ret; @@ -185,20 +185,24 @@ static int cpu_pm_suspend(void) return ret; } -static void cpu_pm_resume(void) +static void cpu_pm_resume(void *data) { cpu_cluster_pm_exit(); cpu_pm_exit(); } -static struct syscore_ops cpu_pm_syscore_ops = { +static const struct syscore_ops cpu_pm_syscore_ops = { .suspend = 
cpu_pm_suspend, .resume = cpu_pm_resume, }; +static struct syscore cpu_pm_syscore = { + .ops = &cpu_pm_syscore_ops, +}; + static int cpu_pm_init(void) { - register_syscore_ops(&cpu_pm_syscore_ops); + register_syscore(&cpu_pm_syscore); return 0; } core_initcall(cpu_pm_init); diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index 87bf4d41eabb..62e60e0223cf 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -524,6 +524,9 @@ void __init reserve_crashkernel_cma(unsigned long long cma_size) #ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY static __init int insert_crashkernel_resources(void) { + if (!arch_add_crash_res_to_iomem()) + return 0; + if (crashk_res.start < crashk_res.end) insert_resource(&iomem_resource, &crashk_res); diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index b12b9db75c1d..61c1690058ed 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -589,24 +589,41 @@ static void kdb_msg_write(const char *msg, int msg_len) */ cookie = console_srcu_read_lock(); for_each_console_srcu(c) { - if (!(console_srcu_read_flags(c) & CON_ENABLED)) + short flags = console_srcu_read_flags(c); + + if (!console_is_usable(c, flags, true)) continue; if (c == dbg_io_ops->cons) continue; - if (!c->write) - continue; - /* - * Set oops_in_progress to encourage the console drivers to - * disregard their internal spin locks: in the current calling - * context the risk of deadlock is a bigger problem than risks - * due to re-entering the console driver. We operate directly on - * oops_in_progress rather than using bust_spinlocks() because - * the calls bust_spinlocks() makes on exit are not appropriate - * for this calling context. - */ - ++oops_in_progress; - c->write(c, msg, msg_len); - --oops_in_progress; + + if (flags & CON_NBCON) { + struct nbcon_write_context wctxt = { }; + + /* + * Do not continue if the console is NBCON and the context + * can't be acquired. 
+ */ + if (!nbcon_kdb_try_acquire(c, &wctxt)) + continue; + + nbcon_write_context_set_buf(&wctxt, (char *)msg, msg_len); + + c->write_atomic(c, &wctxt); + nbcon_kdb_release(&wctxt); + } else { + /* + * Set oops_in_progress to encourage the console drivers to + * disregard their internal spin locks: in the current calling + * context the risk of deadlock is a bigger problem than risks + * due to re-entering the console driver. We operate directly on + * oops_in_progress rather than using bust_spinlocks() because + * the calls bust_spinlocks() makes on exit are not appropriate + * for this calling context. + */ + ++oops_in_progress; + c->write(c, msg, msg_len); + --oops_in_progress; + } touch_nmi_watchdog(); } console_srcu_read_unlock(cookie); diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index d9b9dcba6ff7..d8fd6f779f79 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -42,6 +42,7 @@ #include <linux/memblock.h> #include <linux/err.h> #include <linux/sizes.h> +#include <linux/dma-buf/heaps/cma.h> #include <linux/dma-map-ops.h> #include <linux/cma.h> #include <linux/nospec.h> @@ -241,6 +242,8 @@ void __init dma_contiguous_reserve(phys_addr_t limit) } if (selected_size && !dma_contiguous_default_area) { + int ret; + pr_debug("%s: reserving %ld MiB for global area\n", __func__, (unsigned long)selected_size / SZ_1M); @@ -248,6 +251,10 @@ void __init dma_contiguous_reserve(phys_addr_t limit) selected_limit, &dma_contiguous_default_area, fixed); + + ret = dma_heap_cma_register_heap(dma_contiguous_default_area); + if (ret) + pr_warn("Couldn't register default CMA heap."); } } @@ -493,6 +500,10 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) pr_info("Reserved memory: created CMA memory pool at %pa, size %ld MiB\n", &rmem->base, (unsigned long)rmem->size / SZ_1M); + err = dma_heap_cma_register_heap(cma); + if (err) + pr_warn("Couldn't register CMA heap."); + return 0; } RESERVEDMEM_OF_DECLARE(cma, "shared-dma-pool", 
rmem_cma_setup); diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index f973e7e73c90..50c3fe2a1d55 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -479,8 +479,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, } break; case PCI_P2PDMA_MAP_BUS_ADDR: - sg->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state, - sg_phys(sg)); + sg->dma_address = pci_p2pdma_bus_addr_map( + p2pdma_state.mem, sg_phys(sg)); sg_dma_len(sg) = sg->length; sg_dma_mark_bus_address(sg); continue; diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c index 92de80e5b057..16a51736a2a3 100644 --- a/kernel/dma/dummy.c +++ b/kernel/dma/dummy.c @@ -11,17 +11,16 @@ static int dma_dummy_mmap(struct device *dev, struct vm_area_struct *vma, return -ENXIO; } -static dma_addr_t dma_dummy_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) +static dma_addr_t dma_dummy_map_phys(struct device *dev, phys_addr_t phys, + size_t size, enum dma_data_direction dir, unsigned long attrs) { return DMA_MAPPING_ERROR; } -static void dma_dummy_unmap_page(struct device *dev, dma_addr_t dma_handle, +static void dma_dummy_unmap_phys(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir, unsigned long attrs) { /* - * Dummy ops doesn't support map_page, so unmap_page should never be + * Dummy ops doesn't support map_phys, so unmap_phys should never be * called.
*/ WARN_ON_ONCE(true); @@ -51,8 +50,8 @@ static int dma_dummy_supported(struct device *hwdev, u64 mask) const struct dma_map_ops dma_dummy_ops = { .mmap = dma_dummy_mmap, - .map_page = dma_dummy_map_page, - .unmap_page = dma_dummy_unmap_page, + .map_phys = dma_dummy_map_phys, + .unmap_phys = dma_dummy_unmap_phys, .map_sg = dma_dummy_map_sg, .unmap_sg = dma_dummy_unmap_sg, .dma_supported = dma_dummy_supported, diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index cc19a3efea89..794041a39e65 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -11,13 +11,13 @@ #include <linux/dma-mapping.h> #include <linux/kernel.h> #include <linux/kthread.h> -#include <linux/map_benchmark.h> #include <linux/math64.h> #include <linux/module.h> #include <linux/pci.h> #include <linux/platform_device.h> #include <linux/slab.h> #include <linux/timekeeping.h> +#include <uapi/linux/map_benchmark.h> struct map_benchmark_data { struct map_benchmark bparam; diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index fe7472f13b10..37163eb49f9f 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -157,7 +157,7 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, { const struct dma_map_ops *ops = get_dma_ops(dev); bool is_mmio = attrs & DMA_ATTR_MMIO; - dma_addr_t addr; + dma_addr_t addr = DMA_MAPPING_ERROR; BUG_ON(!valid_dma_direction(dir)); @@ -169,21 +169,8 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, addr = dma_direct_map_phys(dev, phys, size, dir, attrs); else if (use_dma_iommu(dev)) addr = iommu_dma_map_phys(dev, phys, size, dir, attrs); - else if (is_mmio) { - if (!ops->map_resource) - return DMA_MAPPING_ERROR; - - addr = ops->map_resource(dev, phys, size, dir, attrs); - } else { - struct page *page = phys_to_page(phys); - size_t offset = offset_in_page(phys); - - /* - * The dma_ops API contract for ops->map_page() requires - * kmappable memory, while ops->map_resource() does 
not. - */ - addr = ops->map_page(dev, page, offset, size, dir, attrs); - } + else if (ops->map_phys) + addr = ops->map_phys(dev, phys, size, dir, attrs); if (!is_mmio) kmsan_handle_dma(phys, size, dir); @@ -223,11 +210,8 @@ void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, dma_direct_unmap_phys(dev, addr, size, dir, attrs); else if (use_dma_iommu(dev)) iommu_dma_unmap_phys(dev, addr, size, dir, attrs); - else if (is_mmio) { - if (ops->unmap_resource) - ops->unmap_resource(dev, addr, size, dir, attrs); - } else - ops->unmap_page(dev, addr, size, dir, attrs); + else if (ops->unmap_phys) + ops->unmap_phys(dev, addr, size, dir, attrs); trace_dma_unmap_phys(dev, addr, size, dir, attrs); debug_dma_unmap_phys(dev, addr, size, dir); } diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c index 6f9d604d9d40..20caf9cabf69 100644 --- a/kernel/dma/ops_helpers.c +++ b/kernel/dma/ops_helpers.c @@ -64,6 +64,7 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size, { const struct dma_map_ops *ops = get_dma_ops(dev); struct page *page; + phys_addr_t phys; page = dma_alloc_contiguous(dev, size, gfp); if (!page) @@ -71,11 +72,12 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size, if (!page) return NULL; + phys = page_to_phys(page); if (use_dma_iommu(dev)) - *dma_handle = iommu_dma_map_phys(dev, page_to_phys(page), size, - dir, DMA_ATTR_SKIP_CPU_SYNC); + *dma_handle = iommu_dma_map_phys(dev, phys, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); else - *dma_handle = ops->map_page(dev, page, 0, size, dir, + *dma_handle = ops->map_phys(dev, phys, size, dir, DMA_ATTR_SKIP_CPU_SYNC); if (*dma_handle == DMA_MAPPING_ERROR) { dma_free_contiguous(dev, page, size); @@ -94,8 +96,8 @@ void dma_common_free_pages(struct device *dev, size_t size, struct page *page, if (use_dma_iommu(dev)) iommu_dma_unmap_phys(dev, dma_handle, size, dir, DMA_ATTR_SKIP_CPU_SYNC); - else if (ops->unmap_page) - ops->unmap_page(dev, dma_handle, size, dir, + else if 
(ops->unmap_phys) + ops->unmap_phys(dev, dma_handle, size, dir, DMA_ATTR_SKIP_CPU_SYNC); dma_free_contiguous(dev, page, size); } diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index ee45dee33d49..26392badc36b 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -93,7 +93,7 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, page = dma_alloc_from_contiguous(NULL, 1 << order, order, false); if (!page) - page = alloc_pages(gfp, order); + page = alloc_pages(gfp | __GFP_NOWARN, order); } while (!page && order-- > 0); if (!page) goto out; diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 0d37da3d95b6..a547c7693135 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -61,8 +61,6 @@ */ #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) -#define INVALID_PHYS_ADDR (~(phys_addr_t)0) - /** * struct io_tlb_slot - IO TLB slot descriptor * @orig_addr: The original address corresponding to a mapped entry. diff --git a/kernel/exit.c b/kernel/exit.c index b9667ffcf7b3..8a87021211ae 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -251,13 +251,11 @@ repeat: memset(&post, 0, sizeof(post)); /* don't need to get the RCU readlock here - the process is dead and - * can't be modifying its own credentials. But shut RCU-lockdep up */ - rcu_read_lock(); + * can't be modifying its own credentials. */ dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); - rcu_read_unlock(); pidfs_exit(p); - cgroup_release(p); + cgroup_task_release(p); /* Retrieve @thread_pid before __unhash_process() may set it to NULL. 
*/ thread_pid = task_pid(p); @@ -974,7 +972,7 @@ void __noreturn do_exit(long code) exit_thread(tsk); sched_autogroup_exit_task(tsk); - cgroup_exit(tsk); + cgroup_task_exit(tsk); /* * FIXME: do that only when needed, using sched_exit tracepoint diff --git a/kernel/fork.c b/kernel/fork.c index 83e05d6f2307..b1f3915d5f8e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -106,9 +106,9 @@ #include <linux/pidfs.h> #include <linux/tick.h> #include <linux/unwind_deferred.h> - -#include <asm/pgalloc.h> +#include <linux/pgalloc.h> #include <linux/uaccess.h> + #include <asm/mmu_context.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> @@ -208,15 +208,62 @@ struct vm_stack { struct vm_struct *stack_vm_area; }; +static struct vm_struct *alloc_thread_stack_node_from_cache(struct task_struct *tsk, int node) +{ + struct vm_struct *vm_area; + unsigned int i; + + /* + * If the node has memory, we are guaranteed the stacks are backed by local pages. + * Otherwise the pages are arbitrary. + * + * Note that depending on cpuset it is possible we will get migrated to a different + * node immediately after allocating here, so this does *not* guarantee locality for + * arbitrary callers. + */ + scoped_guard(preempt) { + if (node != NUMA_NO_NODE && numa_node_id() != node) + return NULL; + + for (i = 0; i < NR_CACHED_STACKS; i++) { + vm_area = this_cpu_xchg(cached_stacks[i], NULL); + if (vm_area) + return vm_area; + } + } + + return NULL; +} + static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area) { unsigned int i; + int nid; - for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *tmp = NULL; + /* + * Don't cache stacks if any of the pages don't match the local domain, unless + * there is no local memory to begin with. + * + * Note that lack of local memory does not automatically mean it makes no difference + * performance-wise which other domain backs the stack. In this case we are merely + * trying to avoid constantly going to vmalloc. 
+ */ + scoped_guard(preempt) { + nid = numa_node_id(); + if (node_state(nid, N_MEMORY)) { + for (i = 0; i < vm_area->nr_pages; i++) { + struct page *page = vm_area->pages[i]; + if (page_to_nid(page) != nid) + return false; + } + } - if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area)) - return true; + for (i = 0; i < NR_CACHED_STACKS; i++) { + struct vm_struct *tmp = NULL; + + if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area)) + return true; + } } return false; } @@ -283,13 +330,9 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) { struct vm_struct *vm_area; void *stack; - int i; - - for (i = 0; i < NR_CACHED_STACKS; i++) { - vm_area = this_cpu_xchg(cached_stacks[i], NULL); - if (!vm_area) - continue; + vm_area = alloc_thread_stack_node_from_cache(tsk, node); + if (vm_area) { if (memcg_charge_kernel_stack(vm_area)) { vfree(vm_area->addr); return -ENOMEM; @@ -736,9 +779,8 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(tsk == current); unwind_task_free(tsk); - sched_ext_free(tsk); io_uring_free(tsk); - cgroup_free(tsk); + cgroup_task_free(tsk); task_numa_free(tsk, true); security_task_free(tsk); exit_creds(tsk); @@ -1059,10 +1101,10 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, if (current->mm) { unsigned long flags = __mm_flags_get_word(current->mm); - __mm_flags_set_word(mm, mmf_init_legacy_flags(flags)); + __mm_flags_overwrite_word(mm, mmf_init_legacy_flags(flags)); mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; } else { - __mm_flags_set_word(mm, default_dump_filter); + __mm_flags_overwrite_word(mm, default_dump_filter); mm->def_flags = 0; } diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index e2bbe5509ec2..1c2dd03f11ec 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -738,12 +738,11 @@ int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time static long futex_wait_restart(struct restart_block *restart) { u32 
__user *uaddr = restart->futex.uaddr; - ktime_t t, *tp = NULL; + ktime_t *tp = NULL; + + if (restart->futex.flags & FLAGS_HAS_TIMEOUT) + tp = &restart->futex.time; - if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { - t = restart->futex.time; - tp = &t; - } restart->fn = do_no_restart_syscall; return (long)futex_wait(uaddr, restart->futex.flags, diff --git a/kernel/hung_task.c b/kernel/hung_task.c index b2c1f14b8129..d2254c91450b 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -24,6 +24,7 @@ #include <linux/sched/sysctl.h> #include <linux/hung_task.h> #include <linux/rwsem.h> +#include <linux/sys_info.h> #include <trace/events/sched.h> @@ -50,7 +51,6 @@ static unsigned long __read_mostly sysctl_hung_task_detect_count; * Zero means infinite timeout - no checking done: */ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; -EXPORT_SYMBOL_GPL(sysctl_hung_task_timeout_secs); /* * Zero (default value) means use sysctl_hung_task_timeout_secs: @@ -60,12 +60,17 @@ static unsigned long __read_mostly sysctl_hung_task_check_interval_secs; static int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; -static bool hung_task_show_lock; static bool hung_task_call_panic; -static bool hung_task_show_all_bt; static struct task_struct *watchdog_task; +/* + * A bitmask to control what kinds of system info to be printed when + * a hung task is detected, it could be task, memory, lock etc. Refer + * include/linux/sys_info.h for detailed bit definition. + */ +static unsigned long hung_task_si_mask; + #ifdef CONFIG_SMP /* * Should we dump all CPUs backtraces in a hung task event? 
@@ -81,7 +86,7 @@ static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; * hung task is detected: */ static unsigned int __read_mostly sysctl_hung_task_panic = - IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC); + CONFIG_BOOTPARAM_HUNG_TASK_PANIC; static int hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) @@ -218,8 +223,11 @@ static inline void debug_show_blocker(struct task_struct *task, unsigned long ti } #endif -static void check_hung_task(struct task_struct *t, unsigned long timeout) +static void check_hung_task(struct task_struct *t, unsigned long timeout, + unsigned long prev_detect_count) { + unsigned long total_hung_task; + if (!task_is_hung(t, timeout)) return; @@ -229,11 +237,11 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) */ sysctl_hung_task_detect_count++; + total_hung_task = sysctl_hung_task_detect_count - prev_detect_count; trace_sched_process_hang(t); - if (sysctl_hung_task_panic) { + if (sysctl_hung_task_panic && total_hung_task >= sysctl_hung_task_panic) { console_verbose(); - hung_task_show_lock = true; hung_task_call_panic = true; } @@ -256,10 +264,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) " disables this message.\n"); sched_show_task(t); debug_show_blocker(t, timeout); - hung_task_show_lock = true; - if (sysctl_hung_task_all_cpu_backtrace) - hung_task_show_all_bt = true; if (!sysctl_hung_task_warnings) pr_info("Future hung task reports are suppressed, see sysctl kernel.hung_task_warnings\n"); } @@ -300,6 +305,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) int max_count = sysctl_hung_task_check_count; unsigned long last_break = jiffies; struct task_struct *g, *t; + unsigned long prev_detect_count = sysctl_hung_task_detect_count; + int need_warning = sysctl_hung_task_warnings; + unsigned long si_mask = hung_task_si_mask; /* * If the system crashed already then all bets are off, @@ -308,7 +316,7 @@ static void 
check_hung_uninterruptible_tasks(unsigned long timeout) if (test_taint(TAINT_DIE) || did_panic) return; - hung_task_show_lock = false; + rcu_read_lock(); for_each_process_thread(g, t) { @@ -320,18 +328,23 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) last_break = jiffies; } - check_hung_task(t, timeout); + check_hung_task(t, timeout, prev_detect_count); } unlock: rcu_read_unlock(); - if (hung_task_show_lock) - debug_show_all_locks(); - if (hung_task_show_all_bt) { - hung_task_show_all_bt = false; - trigger_all_cpu_backtrace(); + if (!(sysctl_hung_task_detect_count - prev_detect_count)) + return; + + if (need_warning || hung_task_call_panic) { + si_mask |= SYS_INFO_LOCKS; + + if (sysctl_hung_task_all_cpu_backtrace) + si_mask |= SYS_INFO_ALL_BT; } + sys_info(si_mask); + if (hung_task_call_panic) panic("hung_task: blocked tasks"); } @@ -389,7 +402,7 @@ static const struct ctl_table hung_task_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "hung_task_check_count", @@ -430,6 +443,13 @@ static const struct ctl_table hung_task_sysctls[] = { .mode = 0444, .proc_handler = proc_doulongvec_minmax, }, + { + .procname = "hung_task_sys_info", + .data = &hung_task_si_mask, + .maxlen = sizeof(hung_task_si_mask), + .mode = 0644, + .proc_handler = sysctl_sys_info_handler, + }, }; static void __init hung_task_sysctl_init(void) diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index bf59e37d650a..3cd0c40282c0 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -650,7 +650,7 @@ static struct irq_data *irq_gc_get_irq_data(struct irq_chip_generic *gc) } #ifdef CONFIG_PM -static int irq_gc_suspend(void) +static int irq_gc_suspend(void *data) { struct irq_chip_generic *gc; @@ -670,7 +670,7 @@ static int irq_gc_suspend(void) return 0; } -static void irq_gc_resume(void) +static void irq_gc_resume(void *data) { struct 
irq_chip_generic *gc; @@ -693,7 +693,7 @@ static void irq_gc_resume(void) #define irq_gc_resume NULL #endif -static void irq_gc_shutdown(void) +static void irq_gc_shutdown(void *data) { struct irq_chip_generic *gc; @@ -709,15 +709,19 @@ static void irq_gc_shutdown(void) } } -static struct syscore_ops irq_gc_syscore_ops = { +static const struct syscore_ops irq_gc_syscore_ops = { .suspend = irq_gc_suspend, .resume = irq_gc_resume, .shutdown = irq_gc_shutdown, }; +static struct syscore irq_gc_syscore = { + .ops = &irq_gc_syscore_ops, +}; + static int __init irq_gc_init_ops(void) { - register_syscore_ops(&irq_gc_syscore_ops); + register_syscore(&irq_gc_syscore); return 0; } device_initcall(irq_gc_init_ops); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 6acf268f005b..f8e4e13dbe33 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -720,7 +720,7 @@ EXPORT_SYMBOL_GPL(generic_handle_irq_safe); * This function must be called from an IRQ context with irq regs * initialized. */ -int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq) +int generic_handle_domain_irq(struct irq_domain *domain, irq_hw_number_t hwirq) { return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); } @@ -738,7 +738,7 @@ EXPORT_SYMBOL_GPL(generic_handle_domain_irq); * context). If the interrupt is marked as 'enforce IRQ-context only' then * the function must be invoked from hard interrupt context. */ -int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq) +int generic_handle_domain_irq_safe(struct irq_domain *domain, irq_hw_number_t hwirq) { unsigned long flags; int ret; @@ -761,7 +761,7 @@ EXPORT_SYMBOL_GPL(generic_handle_domain_irq_safe); * This function must be called from an NMI context with irq regs * initialized. 
**/ -int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq) +int generic_handle_domain_nmi(struct irq_domain *domain, irq_hw_number_t hwirq) { WARN_ON_ONCE(!in_nmi()); return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index f7394729cedc..99ff65466d87 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -211,21 +211,26 @@ void rearm_wake_irq(unsigned int irq) /** * irq_pm_syscore_resume - enable interrupt lines early + * @data: syscore context * * Enable all interrupt lines with %IRQF_EARLY_RESUME set. */ -static void irq_pm_syscore_resume(void) +static void irq_pm_syscore_resume(void *data) { resume_irqs(true); } -static struct syscore_ops irq_pm_syscore_ops = { +static const struct syscore_ops irq_pm_syscore_ops = { .resume = irq_pm_syscore_resume, }; +static struct syscore irq_pm_syscore = { + .ops = &irq_pm_syscore_ops, +}; + static int __init irq_pm_init_ops(void) { - register_syscore_ops(&irq_pm_syscore_ops); + register_syscore(&irq_pm_syscore); return 0; } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 1e7635864124..049e296f586c 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -103,8 +103,11 @@ static char kallsyms_get_symbol_type(unsigned int off) { /* * Get just the first code, look it up in the token table, - * and return the first char from this token. + * and return the first char from this token. If MSB of length + * is 1, it is a "big" symbol, so needs an additional byte. 
*/ + if (kallsyms_names[off] & 0x80) + off++; return kallsyms_token_table[kallsyms_token_index[kallsyms_names[off + 1]]]; } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index fa00b239c5d9..0f92acdd354d 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -15,6 +15,7 @@ #include <linux/kexec.h> #include <linux/mutex.h> #include <linux/list.h> +#include <linux/liveupdate.h> #include <linux/highmem.h> #include <linux/syscalls.h> #include <linux/reboot.h> @@ -41,6 +42,7 @@ #include <linux/objtool.h> #include <linux/kmsg_dump.h> #include <linux/dma-map-ops.h> +#include <linux/sysfs.h> #include <asm/page.h> #include <asm/sections.h> @@ -742,7 +744,6 @@ static int kimage_load_cma_segment(struct kimage *image, int idx) struct kexec_segment *segment = &image->segment[idx]; struct page *cma = image->segment_cma[idx]; char *ptr = page_address(cma); - unsigned long maddr; size_t ubytes, mbytes; int result = 0; unsigned char __user *buf = NULL; @@ -754,15 +755,12 @@ static int kimage_load_cma_segment(struct kimage *image, int idx) buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; - maddr = segment->mem; /* Then copy from source buffer to the CMA one */ while (mbytes) { size_t uchunk, mchunk; - ptr += maddr & ~PAGE_MASK; - mchunk = min_t(size_t, mbytes, - PAGE_SIZE - (maddr & ~PAGE_MASK)); + mchunk = min_t(size_t, mbytes, PAGE_SIZE); uchunk = min(ubytes, mchunk); if (uchunk) { @@ -784,7 +782,6 @@ static int kimage_load_cma_segment(struct kimage *image, int idx) } ptr += mchunk; - maddr += mchunk; mbytes -= mchunk; cond_resched(); @@ -839,9 +836,7 @@ static int kimage_load_normal_segment(struct kimage *image, int idx) ptr = kmap_local_page(page); /* Start with a clear page */ clear_page(ptr); - ptr += maddr & ~PAGE_MASK; - mchunk = min_t(size_t, mbytes, - PAGE_SIZE - (maddr & ~PAGE_MASK)); + mchunk = min_t(size_t, mbytes, PAGE_SIZE); uchunk = min(ubytes, mchunk); if (uchunk) { @@ -904,9 +899,7 @@ static int kimage_load_crash_segment(struct 
kimage *image, int idx) } arch_kexec_post_alloc_pages(page_address(page), 1, 0); ptr = kmap_local_page(page); - ptr += maddr & ~PAGE_MASK; - mchunk = min_t(size_t, mbytes, - PAGE_SIZE - (maddr & ~PAGE_MASK)); + mchunk = min_t(size_t, mbytes, PAGE_SIZE); uchunk = min(ubytes, mchunk); if (mchunk > uchunk) { /* Zero the trailing part of the page */ @@ -1146,6 +1139,10 @@ int kernel_kexec(void) goto Unlock; } + error = liveupdate_reboot(); + if (error) + goto Unlock; + #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { /* @@ -1229,3 +1226,143 @@ int kernel_kexec(void) kexec_unlock(); return error; } + +static ssize_t loaded_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", !!kexec_image); +} +static struct kobj_attribute loaded_attr = __ATTR_RO(loaded); + +#ifdef CONFIG_CRASH_DUMP +static ssize_t crash_loaded_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", kexec_crash_loaded()); +} +static struct kobj_attribute crash_loaded_attr = __ATTR_RO(crash_loaded); + +#ifdef CONFIG_CRASH_RESERVE +static ssize_t crash_cma_ranges_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + + ssize_t len = 0; + int i; + + for (i = 0; i < crashk_cma_cnt; ++i) { + len += sysfs_emit_at(buf, len, "%08llx-%08llx\n", + crashk_cma_ranges[i].start, + crashk_cma_ranges[i].end); + } + return len; +} +static struct kobj_attribute crash_cma_ranges_attr = __ATTR_RO(crash_cma_ranges); +#endif + +static ssize_t crash_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + ssize_t size = crash_get_memory_size(); + + if (size < 0) + return size; + + return sysfs_emit(buf, "%zd\n", size); +} +static ssize_t crash_size_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long cnt; + int ret; + + if (kstrtoul(buf, 0, &cnt)) + return -EINVAL; + + ret = crash_shrink_memory(cnt); + return ret < 0 
? ret : count; +} +static struct kobj_attribute crash_size_attr = __ATTR_RW(crash_size); + +#ifdef CONFIG_CRASH_HOTPLUG +static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + unsigned int sz = crash_get_elfcorehdr_size(); + + return sysfs_emit(buf, "%u\n", sz); +} +static struct kobj_attribute crash_elfcorehdr_size_attr = __ATTR_RO(crash_elfcorehdr_size); + +#endif /* CONFIG_CRASH_HOTPLUG */ +#endif /* CONFIG_CRASH_DUMP */ + +static struct attribute *kexec_attrs[] = { + &loaded_attr.attr, +#ifdef CONFIG_CRASH_DUMP + &crash_loaded_attr.attr, + &crash_size_attr.attr, +#ifdef CONFIG_CRASH_RESERVE + &crash_cma_ranges_attr.attr, +#endif +#ifdef CONFIG_CRASH_HOTPLUG + &crash_elfcorehdr_size_attr.attr, +#endif +#endif + NULL +}; + +struct kexec_link_entry { + const char *target; + const char *name; +}; + +static struct kexec_link_entry kexec_links[] = { + { "loaded", "kexec_loaded" }, +#ifdef CONFIG_CRASH_DUMP + { "crash_loaded", "kexec_crash_loaded" }, + { "crash_size", "kexec_crash_size" }, +#ifdef CONFIG_CRASH_RESERVE + {"crash_cma_ranges", "kexec_crash_cma_ranges"}, +#endif +#ifdef CONFIG_CRASH_HOTPLUG + { "crash_elfcorehdr_size", "crash_elfcorehdr_size" }, +#endif +#endif +}; + +static struct kobject *kexec_kobj; +ATTRIBUTE_GROUPS(kexec); + +static int __init init_kexec_sysctl(void) +{ + int error; + int i; + + kexec_kobj = kobject_create_and_add("kexec", kernel_kobj); + if (!kexec_kobj) { + pr_err("failed to create kexec kobject\n"); + return -ENOMEM; + } + + error = sysfs_create_groups(kexec_kobj, kexec_groups); + if (error) + goto kset_exit; + + for (i = 0; i < ARRAY_SIZE(kexec_links); i++) { + error = compat_only_sysfs_link_entry_to_kobj(kernel_kobj, kexec_kobj, + kexec_links[i].target, + kexec_links[i].name); + if (error) + pr_err("Unable to create %s symlink (%d)", kexec_links[i].name, error); + } + + return 0; + +kset_exit: + kobject_put(kexec_kobj); + return error; +} + 
+subsys_initcall(init_kexec_sysctl); diff --git a/kernel/kexec_handover_internal.h b/kernel/kexec_handover_internal.h deleted file mode 100644 index 3c3c7148ceed..000000000000 --- a/kernel/kexec_handover_internal.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef LINUX_KEXEC_HANDOVER_INTERNAL_H -#define LINUX_KEXEC_HANDOVER_INTERNAL_H - -#include <linux/kexec_handover.h> -#include <linux/types.h> - -extern struct kho_scratch *kho_scratch; -extern unsigned int kho_scratch_cnt; - -#ifdef CONFIG_KEXEC_HANDOVER_DEBUG -bool kho_scratch_overlap(phys_addr_t phys, size_t size); -#else -static inline bool kho_scratch_overlap(phys_addr_t phys, size_t size) -{ - return false; -} -#endif /* CONFIG_KEXEC_HANDOVER_DEBUG */ - -#endif /* LINUX_KEXEC_HANDOVER_INTERNAL_H */ diff --git a/kernel/kstack_erase.c b/kernel/kstack_erase.c index e49bb88b4f0a..d4449884084c 100644 --- a/kernel/kstack_erase.c +++ b/kernel/kstack_erase.c @@ -23,7 +23,7 @@ static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass); #ifdef CONFIG_SYSCTL static int stack_erasing_sysctl(const struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; int state = !static_branch_unlikely(&stack_erasing_bypass); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index eefb67d9883c..a9e6354d9e25 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -12,7 +12,7 @@ #include <linux/sysfs.h> #include <linux/export.h> #include <linux/init.h> -#include <linux/kexec.h> +#include <linux/vmcore_info.h> #include <linux/profile.h> #include <linux/stat.h> #include <linux/sched.h> @@ -119,50 +119,6 @@ static ssize_t profiling_store(struct kobject *kobj, KERNEL_ATTR_RW(profiling); #endif -#ifdef CONFIG_KEXEC_CORE -static ssize_t kexec_loaded_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sysfs_emit(buf, "%d\n", !!kexec_image); -} -KERNEL_ATTR_RO(kexec_loaded); - -#ifdef CONFIG_CRASH_DUMP 
-static ssize_t kexec_crash_loaded_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sysfs_emit(buf, "%d\n", kexec_crash_loaded()); -} -KERNEL_ATTR_RO(kexec_crash_loaded); - -static ssize_t kexec_crash_size_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - ssize_t size = crash_get_memory_size(); - - if (size < 0) - return size; - - return sysfs_emit(buf, "%zd\n", size); -} -static ssize_t kexec_crash_size_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long cnt; - int ret; - - if (kstrtoul(buf, 0, &cnt)) - return -EINVAL; - - ret = crash_shrink_memory(cnt); - return ret < 0 ? ret : count; -} -KERNEL_ATTR_RW(kexec_crash_size); - -#endif /* CONFIG_CRASH_DUMP*/ -#endif /* CONFIG_KEXEC_CORE */ - #ifdef CONFIG_VMCORE_INFO static ssize_t vmcoreinfo_show(struct kobject *kobj, @@ -174,18 +130,6 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, } KERNEL_ATTR_RO(vmcoreinfo); -#ifdef CONFIG_CRASH_HOTPLUG -static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - unsigned int sz = crash_get_elfcorehdr_size(); - - return sysfs_emit(buf, "%u\n", sz); -} -KERNEL_ATTR_RO(crash_elfcorehdr_size); - -#endif - #endif /* CONFIG_VMCORE_INFO */ /* whether file capabilities are enabled */ @@ -255,18 +199,8 @@ static struct attribute * kernel_attrs[] = { #ifdef CONFIG_PROFILING &profiling_attr.attr, #endif -#ifdef CONFIG_KEXEC_CORE - &kexec_loaded_attr.attr, -#ifdef CONFIG_CRASH_DUMP - &kexec_crash_loaded_attr.attr, - &kexec_crash_size_attr.attr, -#endif -#endif #ifdef CONFIG_VMCORE_INFO &vmcoreinfo_attr.attr, -#ifdef CONFIG_CRASH_HOTPLUG - &crash_elfcorehdr_size_attr.attr, -#endif #endif #ifndef CONFIG_TINY_RCU &rcu_expedited_attr.attr, diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 0044a8125013..9917756dae46 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -88,8 +88,14 @@ 
static struct klp_func *klp_find_func(struct klp_object *obj, struct klp_func *func; klp_for_each_func(obj, func) { + /* + * Besides identical old_sympos, also consider old_sympos + * of 0 and 1 are identical. + */ if ((strcmp(old_func->old_name, func->old_name) == 0) && - (old_func->old_sympos == func->old_sympos)) { + ((old_func->old_sympos == func->old_sympos) || + (old_func->old_sympos == 0 && func->old_sympos == 1) || + (old_func->old_sympos == 1 && func->old_sympos == 0))) { return func; } } diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig new file mode 100644 index 000000000000..9b2515f31afb --- /dev/null +++ b/kernel/liveupdate/Kconfig @@ -0,0 +1,75 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menu "Live Update and Kexec HandOver" + depends on !DEFERRED_STRUCT_PAGE_INIT + +config KEXEC_HANDOVER + bool "kexec handover" + depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE + depends on !DEFERRED_STRUCT_PAGE_INIT + select MEMBLOCK_KHO_SCRATCH + select KEXEC_FILE + select LIBFDT + select CMA + help + Allow kexec to hand over state across kernels by generating and + passing additional metadata to the target kernel. This is useful + to keep data or state alive across the kexec. For this to work, + both source and target kernels need to have this option enabled. + +config KEXEC_HANDOVER_DEBUG + bool "Enable Kexec Handover debug checks" + depends on KEXEC_HANDOVER + help + This option enables extra sanity checks for the Kexec Handover + subsystem. Since, KHO performance is crucial in live update + scenarios and the extra code might be adding overhead it is + only optionally enabled. + +config KEXEC_HANDOVER_DEBUGFS + bool "kexec handover debugfs interface" + default KEXEC_HANDOVER + depends on KEXEC_HANDOVER + select DEBUG_FS + help + Allow to control kexec handover device tree via debugfs + interface, i.e. finalize the state or aborting the finalization. 
+ Also, enables inspecting the KHO fdt trees with the debugfs binary + blobs. + +config KEXEC_HANDOVER_ENABLE_DEFAULT + bool "Enable kexec handover by default" + depends on KEXEC_HANDOVER + help + Enable Kexec Handover by default. This avoids the need to + explicitly pass 'kho=on' on the kernel command line. + + This is useful for systems where KHO is a prerequisite for other + features, such as Live Update, ensuring the mechanism is always + active. + + The default behavior can still be overridden at boot time by + passing 'kho=off'. + +config LIVEUPDATE + bool "Live Update Orchestrator" + depends on KEXEC_HANDOVER + help + Enable the Live Update Orchestrator. Live Update is a mechanism, + typically based on kexec, that allows the kernel to be updated + while keeping selected devices operational across the transition. + These devices are intended to be reclaimed by the new kernel and + re-attached to their original workload without requiring a device + reset. + + Ability to handover a device from current to the next kernel depends + on specific support within device drivers and related kernel + subsystems. + + This feature primarily targets virtual machine hosts to quickly update + the kernel hypervisor with minimal disruption to the running virtual + machines. + + If unsure, say N. 
+ +endmenu diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile new file mode 100644 index 000000000000..7cad2eece32d --- /dev/null +++ b/kernel/liveupdate/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 + +luo-y := \ + luo_core.o \ + luo_file.o \ + luo_session.o + +obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o +obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o +obj-$(CONFIG_KEXEC_HANDOVER_DEBUGFS) += kexec_handover_debugfs.o + +obj-$(CONFIG_LIVEUPDATE) += luo.o diff --git a/kernel/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 03d12e27189f..9dc51fab604f 100644 --- a/kernel/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -4,21 +4,22 @@ * Copyright (C) 2023 Alexander Graf <graf@amazon.com> * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org> * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com> + * Copyright (C) 2025 Pasha Tatashin <pasha.tatashin@soleen.com> */ #define pr_fmt(fmt) "KHO: " fmt #include <linux/cleanup.h> #include <linux/cma.h> +#include <linux/kmemleak.h> #include <linux/count_zeros.h> -#include <linux/debugfs.h> #include <linux/kexec.h> #include <linux/kexec_handover.h> #include <linux/libfdt.h> #include <linux/list.h> #include <linux/memblock.h> -#include <linux/notifier.h> #include <linux/page-isolation.h> +#include <linux/unaligned.h> #include <linux/vmalloc.h> #include <asm/early_ioremap.h> @@ -28,8 +29,9 @@ * KHO is tightly coupled with mm init and needs access to some of mm * internal APIs. 
*/ -#include "../mm/internal.h" -#include "kexec_internal.h" +#include "../../mm/internal.h" +#include "../kexec_internal.h" +#include "kexec_handover_internal.h" #define KHO_FDT_COMPATIBLE "kho-v1" #define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map" @@ -51,7 +53,7 @@ union kho_page_info { static_assert(sizeof(union kho_page_info) == sizeof(((struct page *)0)->private)); -static bool kho_enable __ro_after_init; +static bool kho_enable __ro_after_init = IS_ENABLED(CONFIG_KEXEC_HANDOVER_ENABLE_DEFAULT); bool kho_is_enabled(void) { @@ -103,34 +105,19 @@ struct kho_mem_track { struct khoser_mem_chunk; -struct kho_serialization { - struct page *fdt; - struct list_head fdt_list; - struct dentry *sub_fdt_dir; - struct kho_mem_track track; - /* First chunk of serialized preserved memory map */ - struct khoser_mem_chunk *preserved_mem_map; -}; - struct kho_out { - struct blocking_notifier_head chain_head; - - struct dentry *dir; - + void *fdt; + bool finalized; struct mutex lock; /* protects KHO FDT finalization */ - struct kho_serialization ser; - bool finalized; + struct kho_mem_track track; + struct kho_debugfs dbg; }; static struct kho_out kho_out = { - .chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head), .lock = __MUTEX_INITIALIZER(kho_out.lock), - .ser = { - .fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list), - .track = { - .orders = XARRAY_INIT(kho_out.ser.track.orders, 0), - }, + .track = { + .orders = XARRAY_INIT(kho_out.track.orders, 0), }, .finalized = false, }; @@ -159,26 +146,33 @@ static void *xa_load_or_alloc(struct xarray *xa, unsigned long index) return no_free_ptr(elm); } -static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, - unsigned long end_pfn) +static void __kho_unpreserve_order(struct kho_mem_track *track, unsigned long pfn, + unsigned int order) { struct kho_mem_phys_bits *bits; struct kho_mem_phys *physxa; + const unsigned long pfn_high = pfn >> order; - while (pfn < end_pfn) { - const unsigned int order = - 
min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); - const unsigned long pfn_high = pfn >> order; + physxa = xa_load(&track->orders, order); + if (WARN_ON_ONCE(!physxa)) + return; - physxa = xa_load(&track->orders, order); - if (WARN_ON_ONCE(!physxa)) - return; + bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS); + if (WARN_ON_ONCE(!bits)) + return; - bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS); - if (WARN_ON_ONCE(!bits)) - return; + clear_bit(pfn_high % PRESERVE_BITS, bits->preserve); +} - clear_bit(pfn_high % PRESERVE_BITS, bits->preserve); +static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, + unsigned long end_pfn) +{ + unsigned int order; + + while (pfn < end_pfn) { + order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); + + __kho_unpreserve_order(track, pfn, order); pfn += 1 << order; } @@ -192,10 +186,6 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, const unsigned long pfn_high = pfn >> order; might_sleep(); - - if (kho_out.finalized) - return -EBUSY; - physxa = xa_load(&track->orders, order); if (!physxa) { int err; @@ -229,11 +219,11 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, return 0; } -static struct page *kho_restore_page(phys_addr_t phys) +static struct page *kho_restore_page(phys_addr_t phys, bool is_folio) { struct page *page = pfn_to_online_page(PHYS_PFN(phys)); + unsigned int nr_pages, ref_cnt; union kho_page_info info; - unsigned int nr_pages; if (!page) return NULL; @@ -253,11 +243,16 @@ static struct page *kho_restore_page(phys_addr_t phys) /* Head page gets refcount of 1. */ set_page_count(page, 1); - /* For higher order folios, tail pages get a page count of zero. */ + /* + * For higher order folios, tail pages get a page count of zero. + * For physically contiguous order-0 pages every page gets a page + * count of 1 + */ + ref_cnt = is_folio ? 
0 : 1; for (unsigned int i = 1; i < nr_pages; i++) - set_page_count(page + i, 0); + set_page_count(page + i, ref_cnt); - if (info.order > 0) + if (is_folio && info.order) prep_compound_page(page, info.order); adjust_managed_page_count(page, nr_pages); @@ -272,7 +267,7 @@ static struct page *kho_restore_page(phys_addr_t phys) */ struct folio *kho_restore_folio(phys_addr_t phys) { - struct page *page = kho_restore_page(phys); + struct page *page = kho_restore_page(phys, true); return page ? page_folio(page) : NULL; } @@ -297,11 +292,10 @@ struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages) while (pfn < end_pfn) { const unsigned int order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); - struct page *page = kho_restore_page(PFN_PHYS(pfn)); + struct page *page = kho_restore_page(PFN_PHYS(pfn), false); if (!page) return NULL; - split_page(page, order); pfn += 1 << order; } @@ -371,11 +365,32 @@ static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk) struct khoser_mem_chunk *tmp = chunk; chunk = KHOSER_LOAD_PTR(chunk->hdr.next); - kfree(tmp); + free_page((unsigned long)tmp); } } -static int kho_mem_serialize(struct kho_serialization *ser) +/* + * Update memory map property, if old one is found discard it via + * kho_mem_ser_free(). + */ +static void kho_update_memory_map(struct khoser_mem_chunk *first_chunk) +{ + void *ptr; + u64 phys; + + ptr = fdt_getprop_w(kho_out.fdt, 0, PROP_PRESERVED_MEMORY_MAP, NULL); + + /* Check and discard previous memory map */ + phys = get_unaligned((u64 *)ptr); + if (phys) + kho_mem_ser_free((struct khoser_mem_chunk *)phys_to_virt(phys)); + + /* Update with the new value */ + phys = first_chunk ? 
(u64)virt_to_phys(first_chunk) : 0; + put_unaligned(phys, (u64 *)ptr); +} + +static int kho_mem_serialize(struct kho_out *kho_out) { struct khoser_mem_chunk *first_chunk = NULL; struct khoser_mem_chunk *chunk = NULL; @@ -383,7 +398,7 @@ static int kho_mem_serialize(struct kho_serialization *ser) unsigned long order; int err = -ENOMEM; - xa_for_each(&ser->track.orders, order, physxa) { + xa_for_each(&kho_out->track.orders, order, physxa) { struct kho_mem_phys_bits *bits; unsigned long phys; @@ -415,7 +430,7 @@ static int kho_mem_serialize(struct kho_serialization *ser) } } - ser->preserved_mem_map = first_chunk; + kho_update_memory_map(first_chunk); return 0; @@ -445,20 +460,27 @@ static void __init deserialize_bitmap(unsigned int order, } } -static void __init kho_mem_deserialize(const void *fdt) +/* Return true if memory was deserialized */ +static bool __init kho_mem_deserialize(const void *fdt) { struct khoser_mem_chunk *chunk; - const phys_addr_t *mem; + const void *mem_ptr; + u64 mem; int len; - mem = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len); - - if (!mem || len != sizeof(*mem)) { + mem_ptr = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len); + if (!mem_ptr || len != sizeof(u64)) { pr_err("failed to get preserved memory bitmaps\n"); - return; + return false; } - chunk = *mem ? phys_to_virt(*mem) : NULL; + mem = get_unaligned((const u64 *)mem_ptr); + chunk = mem ? 
phys_to_virt(mem) : NULL; + + /* No preserved physical pages were passed, no deserialization */ + if (!chunk) + return false; + while (chunk) { unsigned int i; @@ -467,6 +489,8 @@ static void __init kho_mem_deserialize(const void *fdt) &chunk->bitmaps[i]); chunk = KHOSER_LOAD_PTR(chunk->hdr.next); } + + return true; } /* @@ -674,40 +698,8 @@ err_disable_kho: kho_enable = false; } -struct fdt_debugfs { - struct list_head list; - struct debugfs_blob_wrapper wrapper; - struct dentry *file; -}; - -static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, - const char *name, const void *fdt) -{ - struct fdt_debugfs *f; - struct dentry *file; - - f = kmalloc(sizeof(*f), GFP_KERNEL); - if (!f) - return -ENOMEM; - - f->wrapper.data = (void *)fdt; - f->wrapper.size = fdt_totalsize(fdt); - - file = debugfs_create_blob(name, 0400, dir, &f->wrapper); - if (IS_ERR(file)) { - kfree(f); - return PTR_ERR(file); - } - - f->file = file; - list_add(&f->list, list); - - return 0; -} - /** * kho_add_subtree - record the physical address of a sub FDT in KHO root tree. - * @ser: serialization control object passed by KHO notifiers. * @name: name of the sub tree. * @fdt: the sub tree blob. * @@ -716,38 +708,76 @@ static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, * by KHO for the new kernel to retrieve it after kexec. * * A debugfs blob entry is also created at - * ``/sys/kernel/debug/kho/out/sub_fdts/@name``. 
+ * ``/sys/kernel/debug/kho/out/sub_fdts/@name`` when kernel is configured with + * CONFIG_KEXEC_HANDOVER_DEBUGFS * * Return: 0 on success, error code on failure */ -int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt) +int kho_add_subtree(const char *name, void *fdt) { - int err = 0; - u64 phys = (u64)virt_to_phys(fdt); - void *root = page_to_virt(ser->fdt); + phys_addr_t phys = virt_to_phys(fdt); + void *root_fdt = kho_out.fdt; + int err = -ENOMEM; + int off, fdt_err; - err |= fdt_begin_node(root, name); - err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys)); - err |= fdt_end_node(root); + guard(mutex)(&kho_out.lock); - if (err) + fdt_err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE); + if (fdt_err < 0) return err; - return kho_debugfs_fdt_add(&ser->fdt_list, ser->sub_fdt_dir, name, fdt); + off = fdt_add_subnode(root_fdt, 0, name); + if (off < 0) { + if (off == -FDT_ERR_EXISTS) + err = -EEXIST; + goto out_pack; + } + + err = fdt_setprop(root_fdt, off, PROP_SUB_FDT, &phys, sizeof(phys)); + if (err < 0) + goto out_pack; + + WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false)); + +out_pack: + fdt_pack(root_fdt); + + return err; } EXPORT_SYMBOL_GPL(kho_add_subtree); -int register_kho_notifier(struct notifier_block *nb) +void kho_remove_subtree(void *fdt) { - return blocking_notifier_chain_register(&kho_out.chain_head, nb); -} -EXPORT_SYMBOL_GPL(register_kho_notifier); + phys_addr_t target_phys = virt_to_phys(fdt); + void *root_fdt = kho_out.fdt; + int off; + int err; -int unregister_kho_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_unregister(&kho_out.chain_head, nb); + guard(mutex)(&kho_out.lock); + + err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE); + if (err < 0) + return; + + for (off = fdt_first_subnode(root_fdt, 0); off >= 0; + off = fdt_next_subnode(root_fdt, off)) { + const u64 *val; + int len; + + val = fdt_getprop(root_fdt, off, PROP_SUB_FDT, &len); + if (!val || len != 
sizeof(phys_addr_t)) + continue; + + if ((phys_addr_t)*val == target_phys) { + fdt_del_node(root_fdt, off); + kho_debugfs_fdt_remove(&kho_out.dbg, fdt); + break; + } + } + + fdt_pack(root_fdt); } -EXPORT_SYMBOL_GPL(unregister_kho_notifier); +EXPORT_SYMBOL_GPL(kho_remove_subtree); /** * kho_preserve_folio - preserve a folio across kexec. @@ -762,7 +792,7 @@ int kho_preserve_folio(struct folio *folio) { const unsigned long pfn = folio_pfn(folio); const unsigned int order = folio_order(folio); - struct kho_mem_track *track = &kho_out.ser.track; + struct kho_mem_track *track = &kho_out.track; if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order))) return -EINVAL; @@ -772,6 +802,24 @@ int kho_preserve_folio(struct folio *folio) EXPORT_SYMBOL_GPL(kho_preserve_folio); /** + * kho_unpreserve_folio - unpreserve a folio. + * @folio: folio to unpreserve. + * + * Instructs KHO to unpreserve a folio that was preserved by + * kho_preserve_folio() before. The provided @folio (pfn and order) + * must exactly match a previously preserved folio. + */ +void kho_unpreserve_folio(struct folio *folio) +{ + const unsigned long pfn = folio_pfn(folio); + const unsigned int order = folio_order(folio); + struct kho_mem_track *track = &kho_out.track; + + __kho_unpreserve_order(track, pfn, order); +} +EXPORT_SYMBOL_GPL(kho_unpreserve_folio); + +/** * kho_preserve_pages - preserve contiguous pages across kexec * @page: first page in the list. * @nr_pages: number of pages. 
@@ -783,7 +831,7 @@ EXPORT_SYMBOL_GPL(kho_preserve_folio); */ int kho_preserve_pages(struct page *page, unsigned int nr_pages) { - struct kho_mem_track *track = &kho_out.ser.track; + struct kho_mem_track *track = &kho_out.track; const unsigned long start_pfn = page_to_pfn(page); const unsigned long end_pfn = start_pfn + nr_pages; unsigned long pfn = start_pfn; @@ -815,6 +863,26 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages) } EXPORT_SYMBOL_GPL(kho_preserve_pages); +/** + * kho_unpreserve_pages - unpreserve contiguous pages. + * @page: first page in the list. + * @nr_pages: number of pages. + * + * Instructs KHO to unpreserve @nr_pages contiguous pages starting from @page. + * This must be called with the same @page and @nr_pages as the corresponding + * kho_preserve_pages() call. Unpreserving arbitrary sub-ranges of larger + * preserved blocks is not supported. + */ +void kho_unpreserve_pages(struct page *page, unsigned int nr_pages) +{ + struct kho_mem_track *track = &kho_out.track; + const unsigned long start_pfn = page_to_pfn(page); + const unsigned long end_pfn = start_pfn + nr_pages; + + __kho_unpreserve(track, start_pfn, end_pfn); +} +EXPORT_SYMBOL_GPL(kho_unpreserve_pages); + struct kho_vmalloc_hdr { DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *); }; @@ -885,7 +953,7 @@ err_free: static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk, unsigned short order) { - struct kho_mem_track *track = &kho_out.ser.track; + struct kho_mem_track *track = &kho_out.track; unsigned long pfn = PHYS_PFN(virt_to_phys(chunk)); __kho_unpreserve(track, pfn, pfn + 1); @@ -896,20 +964,6 @@ static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk, } } -static void kho_vmalloc_free_chunks(struct kho_vmalloc *kho_vmalloc) -{ - struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(kho_vmalloc->first); - - while (chunk) { - struct kho_vmalloc_chunk *tmp = chunk; - - kho_vmalloc_unpreserve_chunk(chunk, kho_vmalloc->order); - - chunk 
= KHOSER_LOAD_PTR(chunk->hdr.next); - free_page((unsigned long)tmp); - } -} - /** * kho_preserve_vmalloc - preserve memory allocated with vmalloc() across kexec * @ptr: pointer to the area in vmalloc address space @@ -971,12 +1025,34 @@ int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation) return 0; err_free: - kho_vmalloc_free_chunks(preservation); + kho_unpreserve_vmalloc(preservation); return err; } EXPORT_SYMBOL_GPL(kho_preserve_vmalloc); /** + * kho_unpreserve_vmalloc - unpreserve memory allocated with vmalloc() + * @preservation: preservation metadata returned by kho_preserve_vmalloc() + * + * Instructs KHO to unpreserve the area in vmalloc address space that was + * previously preserved with kho_preserve_vmalloc(). + */ +void kho_unpreserve_vmalloc(struct kho_vmalloc *preservation) +{ + struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first); + + while (chunk) { + struct kho_vmalloc_chunk *tmp = chunk; + + kho_vmalloc_unpreserve_chunk(chunk, preservation->order); + + chunk = KHOSER_LOAD_PTR(chunk->hdr.next); + free_page((unsigned long)tmp); + } +} +EXPORT_SYMBOL_GPL(kho_unpreserve_vmalloc); + +/** * kho_restore_vmalloc - recreates and populates an area in vmalloc address * space from the preserved memory. * @preservation: preservation metadata. 
@@ -1024,7 +1100,7 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) goto err_free_pages_array; for (int j = 0; j < contig_pages; j++) - pages[idx++] = page; + pages[idx++] = page + j; phys += contig_pages * PAGE_SIZE; } @@ -1065,217 +1141,122 @@ err_free_pages_array: } EXPORT_SYMBOL_GPL(kho_restore_vmalloc); -/* Handling for debug/kho/out */ - -static struct dentry *debugfs_root; - -static int kho_out_update_debugfs_fdt(void) -{ - int err = 0; - struct fdt_debugfs *ff, *tmp; - - if (kho_out.finalized) { - err = kho_debugfs_fdt_add(&kho_out.ser.fdt_list, kho_out.dir, - "fdt", page_to_virt(kho_out.ser.fdt)); - } else { - list_for_each_entry_safe(ff, tmp, &kho_out.ser.fdt_list, list) { - debugfs_remove(ff->file); - list_del(&ff->list); - kfree(ff); - } - } - - return err; -} - -static int kho_abort(void) +/** + * kho_alloc_preserve - Allocate, zero, and preserve memory. + * @size: The number of bytes to allocate. + * + * Allocates a physically contiguous block of zeroed pages that is large + * enough to hold @size bytes. The allocated memory is then registered with + * KHO for preservation across a kexec. + * + * Note: The actual allocated size will be rounded up to the nearest + * power-of-two page boundary. + * + * Return: A virtual pointer to the allocated and preserved memory on success, + * or an ERR_PTR() encoded error on failure. 
+ */ +void *kho_alloc_preserve(size_t size) { - int err; - unsigned long order; - struct kho_mem_phys *physxa; + struct folio *folio; + int order, ret; - xa_for_each(&kho_out.ser.track.orders, order, physxa) { - struct kho_mem_phys_bits *bits; - unsigned long phys; + if (!size) + return ERR_PTR(-EINVAL); - xa_for_each(&physxa->phys_bits, phys, bits) - kfree(bits); + order = get_order(size); + if (order > MAX_PAGE_ORDER) + return ERR_PTR(-E2BIG); - xa_destroy(&physxa->phys_bits); - kfree(physxa); - } - xa_destroy(&kho_out.ser.track.orders); + folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, order); + if (!folio) + return ERR_PTR(-ENOMEM); - if (kho_out.ser.preserved_mem_map) { - kho_mem_ser_free(kho_out.ser.preserved_mem_map); - kho_out.ser.preserved_mem_map = NULL; + ret = kho_preserve_folio(folio); + if (ret) { + folio_put(folio); + return ERR_PTR(ret); } - err = blocking_notifier_call_chain(&kho_out.chain_head, KEXEC_KHO_ABORT, - NULL); - err = notifier_to_errno(err); - - if (err) - pr_err("Failed to abort KHO finalization: %d\n", err); - - return err; + return folio_address(folio); } +EXPORT_SYMBOL_GPL(kho_alloc_preserve); -static int kho_finalize(void) +/** + * kho_unpreserve_free - Unpreserve and free memory. + * @mem: Pointer to the memory allocated by kho_alloc_preserve(). + * + * Unregisters the memory from KHO preservation and frees the underlying + * pages back to the system. This function should be called to clean up + * memory allocated with kho_alloc_preserve(). + */ +void kho_unpreserve_free(void *mem) { - int err = 0; - u64 *preserved_mem_map; - void *fdt = page_to_virt(kho_out.ser.fdt); - - err |= fdt_create(fdt, PAGE_SIZE); - err |= fdt_finish_reservemap(fdt); - err |= fdt_begin_node(fdt, ""); - err |= fdt_property_string(fdt, "compatible", KHO_FDT_COMPATIBLE); - /** - * Reserve the preserved-memory-map property in the root FDT, so - * that all property definitions will precede subnodes created by - * KHO callers. 
- */ - err |= fdt_property_placeholder(fdt, PROP_PRESERVED_MEMORY_MAP, - sizeof(*preserved_mem_map), - (void **)&preserved_mem_map); - if (err) - goto abort; - - err = kho_preserve_folio(page_folio(kho_out.ser.fdt)); - if (err) - goto abort; - - err = blocking_notifier_call_chain(&kho_out.chain_head, - KEXEC_KHO_FINALIZE, &kho_out.ser); - err = notifier_to_errno(err); - if (err) - goto abort; - - err = kho_mem_serialize(&kho_out.ser); - if (err) - goto abort; - - *preserved_mem_map = (u64)virt_to_phys(kho_out.ser.preserved_mem_map); - - err |= fdt_end_node(fdt); - err |= fdt_finish(fdt); + struct folio *folio; -abort: - if (err) { - pr_err("Failed to convert KHO state tree: %d\n", err); - kho_abort(); - } + if (!mem) + return; - return err; + folio = virt_to_folio(mem); + kho_unpreserve_folio(folio); + folio_put(folio); } +EXPORT_SYMBOL_GPL(kho_unpreserve_free); -static int kho_out_finalize_get(void *data, u64 *val) +/** + * kho_restore_free - Restore and free memory after kexec. + * @mem: Pointer to the memory (in the new kernel's address space) + * that was allocated by the old kernel. + * + * This function is intended to be called in the new kernel (post-kexec) + * to take ownership of and free a memory region that was preserved by the + * old kernel using kho_alloc_preserve(). + * + * It first restores the pages from KHO (using their physical address) + * and then frees the pages back to the new kernel's page allocator. 
+ */ +void kho_restore_free(void *mem) { - mutex_lock(&kho_out.lock); - *val = kho_out.finalized; - mutex_unlock(&kho_out.lock); + struct folio *folio; - return 0; + if (!mem) + return; + + folio = kho_restore_folio(__pa(mem)); + if (!WARN_ON(!folio)) + folio_put(folio); } +EXPORT_SYMBOL_GPL(kho_restore_free); -static int kho_out_finalize_set(void *data, u64 _val) +int kho_finalize(void) { - int ret = 0; - bool val = !!_val; - - mutex_lock(&kho_out.lock); - - if (val == kho_out.finalized) { - if (kho_out.finalized) - ret = -EEXIST; - else - ret = -ENOENT; - goto unlock; - } + int ret; - if (val) - ret = kho_finalize(); - else - ret = kho_abort(); + if (!kho_enable) + return -EOPNOTSUPP; + guard(mutex)(&kho_out.lock); + ret = kho_mem_serialize(&kho_out); if (ret) - goto unlock; - - kho_out.finalized = val; - ret = kho_out_update_debugfs_fdt(); - -unlock: - mutex_unlock(&kho_out.lock); - return ret; -} - -DEFINE_DEBUGFS_ATTRIBUTE(fops_kho_out_finalize, kho_out_finalize_get, - kho_out_finalize_set, "%llu\n"); + return ret; -static int scratch_phys_show(struct seq_file *m, void *v) -{ - for (int i = 0; i < kho_scratch_cnt; i++) - seq_printf(m, "0x%llx\n", kho_scratch[i].addr); + kho_out.finalized = true; return 0; } -DEFINE_SHOW_ATTRIBUTE(scratch_phys); -static int scratch_len_show(struct seq_file *m, void *v) +bool kho_finalized(void) { - for (int i = 0; i < kho_scratch_cnt; i++) - seq_printf(m, "0x%llx\n", kho_scratch[i].size); - - return 0; -} -DEFINE_SHOW_ATTRIBUTE(scratch_len); - -static __init int kho_out_debugfs_init(void) -{ - struct dentry *dir, *f, *sub_fdt_dir; - - dir = debugfs_create_dir("out", debugfs_root); - if (IS_ERR(dir)) - return -ENOMEM; - - sub_fdt_dir = debugfs_create_dir("sub_fdts", dir); - if (IS_ERR(sub_fdt_dir)) - goto err_rmdir; - - f = debugfs_create_file("scratch_phys", 0400, dir, NULL, - &scratch_phys_fops); - if (IS_ERR(f)) - goto err_rmdir; - - f = debugfs_create_file("scratch_len", 0400, dir, NULL, - &scratch_len_fops); - if 
(IS_ERR(f)) - goto err_rmdir; - - f = debugfs_create_file("finalize", 0600, dir, NULL, - &fops_kho_out_finalize); - if (IS_ERR(f)) - goto err_rmdir; - - kho_out.dir = dir; - kho_out.ser.sub_fdt_dir = sub_fdt_dir; - return 0; - -err_rmdir: - debugfs_remove_recursive(dir); - return -ENOENT; + guard(mutex)(&kho_out.lock); + return kho_out.finalized; } struct kho_in { - struct dentry *dir; phys_addr_t fdt_phys; phys_addr_t scratch_phys; - struct list_head fdt_list; + struct kho_debugfs dbg; }; static struct kho_in kho_in = { - .fdt_list = LIST_HEAD_INIT(kho_in.fdt_list), }; static const void *kho_get_fdt(void) @@ -1339,91 +1320,52 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys) } EXPORT_SYMBOL_GPL(kho_retrieve_subtree); -/* Handling for debugfs/kho/in */ - -static __init int kho_in_debugfs_init(const void *fdt) +static __init int kho_out_fdt_setup(void) { - struct dentry *sub_fdt_dir; - int err, child; - - kho_in.dir = debugfs_create_dir("in", debugfs_root); - if (IS_ERR(kho_in.dir)) - return PTR_ERR(kho_in.dir); - - sub_fdt_dir = debugfs_create_dir("sub_fdts", kho_in.dir); - if (IS_ERR(sub_fdt_dir)) { - err = PTR_ERR(sub_fdt_dir); - goto err_rmdir; - } - - err = kho_debugfs_fdt_add(&kho_in.fdt_list, kho_in.dir, "fdt", fdt); - if (err) - goto err_rmdir; - - fdt_for_each_subnode(child, fdt, 0) { - int len = 0; - const char *name = fdt_get_name(fdt, child, NULL); - const u64 *fdt_phys; - - fdt_phys = fdt_getprop(fdt, child, "fdt", &len); - if (!fdt_phys) - continue; - if (len != sizeof(*fdt_phys)) { - pr_warn("node `%s`'s prop `fdt` has invalid length: %d\n", - name, len); - continue; - } - err = kho_debugfs_fdt_add(&kho_in.fdt_list, sub_fdt_dir, name, - phys_to_virt(*fdt_phys)); - if (err) { - pr_warn("failed to add fdt `%s` to debugfs: %d\n", name, - err); - continue; - } - } + void *root = kho_out.fdt; + u64 empty_mem_map = 0; + int err; - return 0; + err = fdt_create(root, PAGE_SIZE); + err |= fdt_finish_reservemap(root); + err |= 
fdt_begin_node(root, ""); + err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE); + err |= fdt_property(root, PROP_PRESERVED_MEMORY_MAP, &empty_mem_map, + sizeof(empty_mem_map)); + err |= fdt_end_node(root); + err |= fdt_finish(root); -err_rmdir: - debugfs_remove_recursive(kho_in.dir); return err; } static __init int kho_init(void) { - int err = 0; const void *fdt = kho_get_fdt(); + int err = 0; if (!kho_enable) return 0; - kho_out.ser.fdt = alloc_page(GFP_KERNEL); - if (!kho_out.ser.fdt) { - err = -ENOMEM; + kho_out.fdt = kho_alloc_preserve(PAGE_SIZE); + if (IS_ERR(kho_out.fdt)) { + err = PTR_ERR(kho_out.fdt); goto err_free_scratch; } - debugfs_root = debugfs_create_dir("kho", NULL); - if (IS_ERR(debugfs_root)) { - err = -ENOENT; + err = kho_debugfs_init(); + if (err) goto err_free_fdt; - } - err = kho_out_debugfs_init(); + err = kho_out_debugfs_init(&kho_out.dbg); if (err) goto err_free_fdt; - if (fdt) { - err = kho_in_debugfs_init(fdt); - /* - * Failure to create /sys/kernel/debug/kho/in does not prevent - * reviving state from KHO and setting up KHO for the next - * kexec. - */ - if (err) - pr_err("failed exposing handover FDT in debugfs: %d\n", - err); + err = kho_out_fdt_setup(); + if (err) + goto err_free_fdt; + if (fdt) { + kho_in_debugfs_init(&kho_in.dbg, fdt); return 0; } @@ -1432,17 +1374,29 @@ static __init int kho_init(void) unsigned long count = kho_scratch[i].size >> PAGE_SHIFT; unsigned long pfn; + /* + * When debug_pagealloc is enabled, __free_pages() clears the + * corresponding PRESENT bit in the kernel page table. + * Subsequent kmemleak scans of these pages cause the + * non-PRESENT page faults. + * Mark scratch areas with kmemleak_ignore_phys() to exclude + * them from kmemleak scanning. 
+ */ + kmemleak_ignore_phys(kho_scratch[i].addr); for (pfn = base_pfn; pfn < base_pfn + count; pfn += pageblock_nr_pages) init_cma_reserved_pageblock(pfn_to_page(pfn)); } + WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, "fdt", + kho_out.fdt, true)); + return 0; err_free_fdt: - put_page(kho_out.ser.fdt); - kho_out.ser.fdt = NULL; + kho_unpreserve_free(kho_out.fdt); err_free_scratch: + kho_out.fdt = NULL; for (int i = 0; i < kho_scratch_cnt; i++) { void *start = __va(kho_scratch[i].addr); void *end = start + kho_scratch[i].size; @@ -1452,7 +1406,7 @@ err_free_scratch: kho_enable = false; return err; } -late_initcall(kho_init); +fs_initcall(kho_init); static void __init kho_release_scratch(void) { @@ -1480,16 +1434,12 @@ static void __init kho_release_scratch(void) void __init kho_memory_init(void) { - struct folio *folio; - if (kho_in.scratch_phys) { kho_scratch = phys_to_virt(kho_in.scratch_phys); kho_release_scratch(); - kho_mem_deserialize(kho_get_fdt()); - folio = kho_restore_folio(kho_in.fdt_phys); - if (!folio) - pr_warn("failed to restore folio for KHO fdt\n"); + if (!kho_mem_deserialize(kho_get_fdt())) + kho_in.fdt_phys = 0; } else { kho_reserve_scratch(); } @@ -1545,8 +1495,8 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, memblock_add(area->addr, size); err = memblock_mark_kho_scratch(area->addr, size); if (WARN_ON(err)) { - pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %d", - &area->addr, &size, err); + pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %pe", + &area->addr, &size, ERR_PTR(err)); goto out; } pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size); @@ -1566,7 +1516,7 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, kho_in.fdt_phys = fdt_phys; kho_in.scratch_phys = scratch_phys; kho_scratch_cnt = scratch_cnt; - pr_info("found kexec handover data. 
Will skip init for some devices\n"); + pr_info("found kexec handover data.\n"); out: if (fdt) @@ -1585,10 +1535,10 @@ int kho_fill_kimage(struct kimage *image) int err = 0; struct kexec_buf scratch; - if (!kho_out.finalized) + if (!kho_enable) return 0; - image->kho.fdt = page_to_phys(kho_out.ser.fdt); + image->kho.fdt = virt_to_phys(kho_out.fdt); scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt; scratch = (struct kexec_buf){ diff --git a/kernel/kexec_handover_debug.c b/kernel/liveupdate/kexec_handover_debug.c index 6efb696f5426..6efb696f5426 100644 --- a/kernel/kexec_handover_debug.c +++ b/kernel/liveupdate/kexec_handover_debug.c diff --git a/kernel/liveupdate/kexec_handover_debugfs.c b/kernel/liveupdate/kexec_handover_debugfs.c new file mode 100644 index 000000000000..2abbf62ba942 --- /dev/null +++ b/kernel/liveupdate/kexec_handover_debugfs.c @@ -0,0 +1,221 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kexec_handover_debugfs.c - kexec handover debugfs interfaces + * Copyright (C) 2023 Alexander Graf <graf@amazon.com> + * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org> + * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com> + * Copyright (C) 2025 Google LLC, Pasha Tatashin <pasha.tatashin@soleen.com> + */ + +#define pr_fmt(fmt) "KHO: " fmt + +#include <linux/init.h> +#include <linux/io.h> +#include <linux/libfdt.h> +#include <linux/mm.h> +#include "kexec_handover_internal.h" + +static struct dentry *debugfs_root; + +struct fdt_debugfs { + struct list_head list; + struct debugfs_blob_wrapper wrapper; + struct dentry *file; +}; + +static int __kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, + const char *name, const void *fdt) +{ + struct fdt_debugfs *f; + struct dentry *file; + + f = kmalloc(sizeof(*f), GFP_KERNEL); + if (!f) + return -ENOMEM; + + f->wrapper.data = (void *)fdt; + f->wrapper.size = fdt_totalsize(fdt); + + file = debugfs_create_blob(name, 0400, dir, &f->wrapper); + if 
(IS_ERR(file)) { + kfree(f); + return PTR_ERR(file); + } + + f->file = file; + list_add(&f->list, list); + + return 0; +} + +int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, + const void *fdt, bool root) +{ + struct dentry *dir; + + if (root) + dir = dbg->dir; + else + dir = dbg->sub_fdt_dir; + + return __kho_debugfs_fdt_add(&dbg->fdt_list, dir, name, fdt); +} + +void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt) +{ + struct fdt_debugfs *ff; + + list_for_each_entry(ff, &dbg->fdt_list, list) { + if (ff->wrapper.data == fdt) { + debugfs_remove(ff->file); + list_del(&ff->list); + kfree(ff); + break; + } + } +} + +static int kho_out_finalize_get(void *data, u64 *val) +{ + *val = kho_finalized(); + + return 0; +} + +static int kho_out_finalize_set(void *data, u64 val) +{ + if (val) + return kho_finalize(); + else + return -EINVAL; +} + +DEFINE_DEBUGFS_ATTRIBUTE(kho_out_finalize_fops, kho_out_finalize_get, + kho_out_finalize_set, "%llu\n"); + +static int scratch_phys_show(struct seq_file *m, void *v) +{ + for (int i = 0; i < kho_scratch_cnt; i++) + seq_printf(m, "0x%llx\n", kho_scratch[i].addr); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(scratch_phys); + +static int scratch_len_show(struct seq_file *m, void *v) +{ + for (int i = 0; i < kho_scratch_cnt; i++) + seq_printf(m, "0x%llx\n", kho_scratch[i].size); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(scratch_len); + +__init void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt) +{ + struct dentry *dir, *sub_fdt_dir; + int err, child; + + INIT_LIST_HEAD(&dbg->fdt_list); + + dir = debugfs_create_dir("in", debugfs_root); + if (IS_ERR(dir)) { + err = PTR_ERR(dir); + goto err_out; + } + + sub_fdt_dir = debugfs_create_dir("sub_fdts", dir); + if (IS_ERR(sub_fdt_dir)) { + err = PTR_ERR(sub_fdt_dir); + goto err_rmdir; + } + + err = __kho_debugfs_fdt_add(&dbg->fdt_list, dir, "fdt", fdt); + if (err) + goto err_rmdir; + + fdt_for_each_subnode(child, fdt, 0) { + int len = 0; + const char *name = 
fdt_get_name(fdt, child, NULL); + const u64 *fdt_phys; + + fdt_phys = fdt_getprop(fdt, child, "fdt", &len); + if (!fdt_phys) + continue; + if (len != sizeof(*fdt_phys)) { + pr_warn("node %s prop fdt has invalid length: %d\n", + name, len); + continue; + } + err = __kho_debugfs_fdt_add(&dbg->fdt_list, sub_fdt_dir, name, + phys_to_virt(*fdt_phys)); + if (err) { + pr_warn("failed to add fdt %s to debugfs: %pe\n", name, + ERR_PTR(err)); + continue; + } + } + + dbg->dir = dir; + dbg->sub_fdt_dir = sub_fdt_dir; + + return; +err_rmdir: + debugfs_remove_recursive(dir); +err_out: + /* + * Failure to create /sys/kernel/debug/kho/in does not prevent + * reviving state from KHO and setting up KHO for the next + * kexec. + */ + if (err) { + pr_err("failed exposing handover FDT in debugfs: %pe\n", + ERR_PTR(err)); + } +} + +__init int kho_out_debugfs_init(struct kho_debugfs *dbg) +{ + struct dentry *dir, *f, *sub_fdt_dir; + + INIT_LIST_HEAD(&dbg->fdt_list); + + dir = debugfs_create_dir("out", debugfs_root); + if (IS_ERR(dir)) + return -ENOMEM; + + sub_fdt_dir = debugfs_create_dir("sub_fdts", dir); + if (IS_ERR(sub_fdt_dir)) + goto err_rmdir; + + f = debugfs_create_file("scratch_phys", 0400, dir, NULL, + &scratch_phys_fops); + if (IS_ERR(f)) + goto err_rmdir; + + f = debugfs_create_file("scratch_len", 0400, dir, NULL, + &scratch_len_fops); + if (IS_ERR(f)) + goto err_rmdir; + + f = debugfs_create_file("finalize", 0600, dir, NULL, + &kho_out_finalize_fops); + if (IS_ERR(f)) + goto err_rmdir; + + dbg->dir = dir; + dbg->sub_fdt_dir = sub_fdt_dir; + return 0; + +err_rmdir: + debugfs_remove_recursive(dir); + return -ENOENT; +} + +__init int kho_debugfs_init(void) +{ + debugfs_root = debugfs_create_dir("kho", NULL); + if (IS_ERR(debugfs_root)) + return -ENOENT; + return 0; +} diff --git a/kernel/liveupdate/kexec_handover_internal.h b/kernel/liveupdate/kexec_handover_internal.h new file mode 100644 index 000000000000..0202c85ad14f --- /dev/null +++ 
b/kernel/liveupdate/kexec_handover_internal.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_KEXEC_HANDOVER_INTERNAL_H +#define LINUX_KEXEC_HANDOVER_INTERNAL_H + +#include <linux/kexec_handover.h> +#include <linux/list.h> +#include <linux/types.h> + +#ifdef CONFIG_KEXEC_HANDOVER_DEBUGFS +#include <linux/debugfs.h> + +struct kho_debugfs { + struct dentry *dir; + struct dentry *sub_fdt_dir; + struct list_head fdt_list; +}; + +#else +struct kho_debugfs {}; +#endif + +extern struct kho_scratch *kho_scratch; +extern unsigned int kho_scratch_cnt; + +bool kho_finalized(void); +int kho_finalize(void); + +#ifdef CONFIG_KEXEC_HANDOVER_DEBUGFS +int kho_debugfs_init(void); +void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt); +int kho_out_debugfs_init(struct kho_debugfs *dbg); +int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, + const void *fdt, bool root); +void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt); +#else +static inline int kho_debugfs_init(void) { return 0; } +static inline void kho_in_debugfs_init(struct kho_debugfs *dbg, + const void *fdt) { } +static inline int kho_out_debugfs_init(struct kho_debugfs *dbg) { return 0; } +static inline int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, + const void *fdt, bool root) { return 0; } +static inline void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, + void *fdt) { } +#endif /* CONFIG_KEXEC_HANDOVER_DEBUGFS */ + +#ifdef CONFIG_KEXEC_HANDOVER_DEBUG +bool kho_scratch_overlap(phys_addr_t phys, size_t size); +#else +static inline bool kho_scratch_overlap(phys_addr_t phys, size_t size) +{ + return false; +} +#endif /* CONFIG_KEXEC_HANDOVER_DEBUG */ + +#endif /* LINUX_KEXEC_HANDOVER_INTERNAL_H */ diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c new file mode 100644 index 000000000000..f7ecaf7740d1 --- /dev/null +++ b/kernel/liveupdate/luo_core.c @@ -0,0 +1,450 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * 
Copyright (c) 2025, Google LLC. + * Pasha Tatashin <pasha.tatashin@soleen.com> + */ + +/** + * DOC: Live Update Orchestrator (LUO) + * + * Live Update is a specialized, kexec-based reboot process that allows a + * running kernel to be updated from one version to another while preserving + * the state of selected resources and keeping designated hardware devices + * operational. For these devices, DMA activity may continue throughout the + * kernel transition. + * + * While the primary use case driving this work is supporting live updates of + * the Linux kernel when it is used as a hypervisor in cloud environments, the + * LUO framework itself is designed to be workload-agnostic. Live Update + * facilitates a full kernel version upgrade for any type of system. + * + * For example, a non-hypervisor system running an in-memory cache like + * memcached with many gigabytes of data can use LUO. The userspace service + * can place its cache into a memfd, have its state preserved by LUO, and + * restore it immediately after the kernel kexec. + * + * Whether the system is running virtual machines, containers, a + * high-performance database, or networking services, LUO's primary goal is to + * enable a full kernel update by preserving critical userspace state and + * keeping essential devices operational. + * + * The core of LUO is a mechanism that tracks the progress of a live update, + * along with a callback API that allows other kernel subsystems to participate + * in the process. Example subsystems that can hook into LUO include: kvm, + * iommu, interrupts, vfio, participating filesystems, and memory management. + * + * LUO uses Kexec Handover to transfer memory state from the current kernel to + * the next kernel. For more details see + * Documentation/core-api/kho/concepts.rst. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/atomic.h> +#include <linux/errno.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/kernel.h> +#include <linux/kexec_handover.h> +#include <linux/kho/abi/luo.h> +#include <linux/kobject.h> +#include <linux/libfdt.h> +#include <linux/liveupdate.h> +#include <linux/miscdevice.h> +#include <linux/mm.h> +#include <linux/sizes.h> +#include <linux/string.h> +#include <linux/unaligned.h> + +#include "kexec_handover_internal.h" +#include "luo_internal.h" + +static struct { + bool enabled; + void *fdt_out; + void *fdt_in; + u64 liveupdate_num; +} luo_global; + +static int __init early_liveupdate_param(char *buf) +{ + return kstrtobool(buf, &luo_global.enabled); +} +early_param("liveupdate", early_liveupdate_param); + +static int __init luo_early_startup(void) +{ + phys_addr_t fdt_phys; + int err, ln_size; + const void *ptr; + + if (!kho_is_enabled()) { + if (liveupdate_enabled()) + pr_warn("Disabling liveupdate because KHO is disabled\n"); + luo_global.enabled = false; + return 0; + } + + /* Retrieve LUO subtree, and verify its format.
*/ + err = kho_retrieve_subtree(LUO_FDT_KHO_ENTRY_NAME, &fdt_phys); + if (err) { + if (err != -ENOENT) { + pr_err("failed to retrieve FDT '%s' from KHO: %pe\n", + LUO_FDT_KHO_ENTRY_NAME, ERR_PTR(err)); + return err; + } + + return 0; + } + + luo_global.fdt_in = phys_to_virt(fdt_phys); + err = fdt_node_check_compatible(luo_global.fdt_in, 0, + LUO_FDT_COMPATIBLE); + if (err) { + pr_err("FDT '%s' is incompatible with '%s' [%d]\n", + LUO_FDT_KHO_ENTRY_NAME, LUO_FDT_COMPATIBLE, err); + + return -EINVAL; + } + + ln_size = 0; + ptr = fdt_getprop(luo_global.fdt_in, 0, LUO_FDT_LIVEUPDATE_NUM, + &ln_size); + if (!ptr || ln_size != sizeof(luo_global.liveupdate_num)) { + pr_err("Unable to get live update number '%s' [%d]\n", + LUO_FDT_LIVEUPDATE_NUM, ln_size); + + return -EINVAL; + } + + /* The property is read-only FDT data that may be unaligned. */ + luo_global.liveupdate_num = get_unaligned((const u64 *)ptr); + pr_info("Retrieved live update data, liveupdate number: %llu\n", + luo_global.liveupdate_num); + + err = luo_session_setup_incoming(luo_global.fdt_in); + if (err) + return err; + + return 0; +} + +static int __init liveupdate_early_init(void) +{ + int err; + + err = luo_early_startup(); + if (err) { + luo_global.enabled = false; + luo_restore_fail("The incoming tree failed to initialize properly [%pe], disabling live update\n", + ERR_PTR(err)); + } + + return err; +} +early_initcall(liveupdate_early_init); + +/* Called during boot to create outgoing LUO fdt tree */ +static int __init luo_fdt_setup(void) +{ + const u64 ln = luo_global.liveupdate_num + 1; + void *fdt_out; + int err; + + fdt_out = kho_alloc_preserve(LUO_FDT_SIZE); + if (IS_ERR(fdt_out)) { + pr_err("failed to allocate/preserve FDT memory\n"); + return PTR_ERR(fdt_out); + } + + err = fdt_create(fdt_out, LUO_FDT_SIZE); + err |= fdt_finish_reservemap(fdt_out); + err |= fdt_begin_node(fdt_out, ""); + err |= fdt_property_string(fdt_out, "compatible", LUO_FDT_COMPATIBLE); + err |= fdt_property(fdt_out, LUO_FDT_LIVEUPDATE_NUM, &ln, sizeof(ln)); + err |=
luo_session_setup_outgoing(fdt_out); + err |= fdt_end_node(fdt_out); + err |= fdt_finish(fdt_out); + if (err) + goto exit_free; + + err = kho_add_subtree(LUO_FDT_KHO_ENTRY_NAME, fdt_out); + if (err) + goto exit_free; + luo_global.fdt_out = fdt_out; + + return 0; + +exit_free: + kho_unpreserve_free(fdt_out); + pr_err("failed to prepare LUO FDT: %d\n", err); + + return err; +} + +/* + * late initcall because it initializes the outgoing tree that is needed only + * once userspace starts using /dev/liveupdate. + */ +static int __init luo_late_startup(void) +{ + int err; + + if (!liveupdate_enabled()) + return 0; + + err = luo_fdt_setup(); + if (err) + luo_global.enabled = false; + + return err; +} +late_initcall(luo_late_startup); + +/* Public Functions */ + +/** + * liveupdate_reboot() - Kernel reboot notifier for live update final + * serialization. + * + * This function is invoked directly from the reboot() syscall pathway + * if kexec is in progress. + * + * If any callback fails, this function aborts KHO, undoes the freeze() + * callbacks, and returns an error. + */ +int liveupdate_reboot(void) +{ + int err; + + if (!liveupdate_enabled()) + return 0; + + err = luo_session_serialize(); + if (err) + return err; + + err = kho_finalize(); + if (err) { + pr_err("kho_finalize failed %d\n", err); + /* + * kho_finalize() may return libfdt errors; to avoid passing + * unknown errors to userspace, change this to EAGAIN. + */ + err = -EAGAIN; + } + + return err; +} + +/** + * liveupdate_enabled - Check if the live update feature is enabled. + * + * This function returns the state of the live update feature flag, which + * can be controlled via the ``liveupdate`` kernel command-line parameter. + * + * Return: true if live update is enabled, false otherwise. + */ +bool liveupdate_enabled(void) +{ + return luo_global.enabled; +} + +/** + * DOC: LUO ioctl Interface + * + * The IOCTL user-space control interface for the LUO subsystem.
+ * It registers a character device, typically found at ``/dev/liveupdate``, + * which allows a userspace agent to manage the LUO state machine and its + * associated resources, such as preservable file descriptors. + * + * To ensure that the state machine is controlled by a single entity, access + * to this device is exclusive: only one process is permitted to have + * ``/dev/liveupdate`` open at any given time. Subsequent open attempts will + * fail with -EBUSY until the first process closes its file descriptor. + * This singleton model simplifies state management by preventing conflicting + * commands from multiple userspace agents. + */ + +struct luo_device_state { + struct miscdevice miscdev; + atomic_t in_use; +}; + +static int luo_ioctl_create_session(struct luo_ucmd *ucmd) +{ + struct liveupdate_ioctl_create_session *argp = ucmd->cmd; + struct file *file; + int err; + + argp->fd = get_unused_fd_flags(O_CLOEXEC); + if (argp->fd < 0) + return argp->fd; + + err = luo_session_create(argp->name, &file); + if (err) + goto err_put_fd; + + err = luo_ucmd_respond(ucmd, sizeof(*argp)); + if (err) + goto err_put_file; + + fd_install(argp->fd, file); + + return 0; + +err_put_file: + fput(file); +err_put_fd: + put_unused_fd(argp->fd); + + return err; +} + +static int luo_ioctl_retrieve_session(struct luo_ucmd *ucmd) +{ + struct liveupdate_ioctl_retrieve_session *argp = ucmd->cmd; + struct file *file; + int err; + + argp->fd = get_unused_fd_flags(O_CLOEXEC); + if (argp->fd < 0) + return argp->fd; + + err = luo_session_retrieve(argp->name, &file); + if (err < 0) + goto err_put_fd; + + err = luo_ucmd_respond(ucmd, sizeof(*argp)); + if (err) + goto err_put_file; + + fd_install(argp->fd, file); + + return 0; + +err_put_file: + fput(file); +err_put_fd: + put_unused_fd(argp->fd); + + return err; +} + +static int luo_open(struct inode *inodep, struct file *filep) +{ + struct luo_device_state *ldev = container_of(filep->private_data, + struct luo_device_state, + miscdev); + + if 
(atomic_cmpxchg(&ldev->in_use, 0, 1)) + return -EBUSY; + + /* Always return -EIO to user if deserialization fail */ + if (luo_session_deserialize()) { + atomic_set(&ldev->in_use, 0); + return -EIO; + } + + return 0; +} + +static int luo_release(struct inode *inodep, struct file *filep) +{ + struct luo_device_state *ldev = container_of(filep->private_data, + struct luo_device_state, + miscdev); + atomic_set(&ldev->in_use, 0); + + return 0; +} + +union ucmd_buffer { + struct liveupdate_ioctl_create_session create; + struct liveupdate_ioctl_retrieve_session retrieve; +}; + +struct luo_ioctl_op { + unsigned int size; + unsigned int min_size; + unsigned int ioctl_num; + int (*execute)(struct luo_ucmd *ucmd); +}; + +#define IOCTL_OP(_ioctl, _fn, _struct, _last) \ + [_IOC_NR(_ioctl) - LIVEUPDATE_CMD_BASE] = { \ + .size = sizeof(_struct) + \ + BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \ + sizeof(_struct)), \ + .min_size = offsetofend(_struct, _last), \ + .ioctl_num = _ioctl, \ + .execute = _fn, \ + } + +static const struct luo_ioctl_op luo_ioctl_ops[] = { + IOCTL_OP(LIVEUPDATE_IOCTL_CREATE_SESSION, luo_ioctl_create_session, + struct liveupdate_ioctl_create_session, name), + IOCTL_OP(LIVEUPDATE_IOCTL_RETRIEVE_SESSION, luo_ioctl_retrieve_session, + struct liveupdate_ioctl_retrieve_session, name), +}; + +static long luo_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) +{ + const struct luo_ioctl_op *op; + struct luo_ucmd ucmd = {}; + union ucmd_buffer buf; + unsigned int nr; + int err; + + nr = _IOC_NR(cmd); + if (nr < LIVEUPDATE_CMD_BASE || + (nr - LIVEUPDATE_CMD_BASE) >= ARRAY_SIZE(luo_ioctl_ops)) { + return -EINVAL; + } + + ucmd.ubuffer = (void __user *)arg; + err = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer); + if (err) + return err; + + op = &luo_ioctl_ops[nr - LIVEUPDATE_CMD_BASE]; + if (op->ioctl_num != cmd) + return -ENOIOCTLCMD; + if (ucmd.user_size < op->min_size) + return -EINVAL; + + ucmd.cmd = &buf; + err = 
copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer, + ucmd.user_size); + if (err) + return err; + + return op->execute(&ucmd); +} + +static const struct file_operations luo_fops = { + .owner = THIS_MODULE, + .open = luo_open, + .release = luo_release, + .unlocked_ioctl = luo_ioctl, +}; + +static struct luo_device_state luo_dev = { + .miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "liveupdate", + .fops = &luo_fops, + }, + .in_use = ATOMIC_INIT(0), +}; + +static int __init liveupdate_ioctl_init(void) +{ + if (!liveupdate_enabled()) + return 0; + + return misc_register(&luo_dev.miscdev); +} +late_initcall(liveupdate_ioctl_init); diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c new file mode 100644 index 000000000000..ddff87917b21 --- /dev/null +++ b/kernel/liveupdate/luo_file.c @@ -0,0 +1,889 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin <pasha.tatashin@soleen.com> + */ + +/** + * DOC: LUO File Descriptors + * + * LUO provides the infrastructure to preserve specific, stateful file + * descriptors across a kexec-based live update. The primary goal is to allow + * workloads, such as virtual machines using vfio, memfd, or iommufd, to + * retain access to their essential resources without interruption. + * + * The framework is built around a callback-based handler model and a well- + * defined lifecycle for each preserved file. + * + * Handler Registration: + * Kernel modules responsible for a specific file type (e.g., memfd, vfio) + * register a &struct liveupdate_file_handler. This handler provides a set of + * callbacks that LUO invokes at different stages of the update process, most + * notably: + * + * - can_preserve(): A lightweight check to determine if the handler is + * compatible with a given 'struct file'. + * - preserve(): The heavyweight operation that saves the file's state and + * returns an opaque u64 handle. 
This is typically performed while the + * workload is still active to minimize the downtime during the + * actual reboot transition. + * - unpreserve(): Cleans up any resources allocated by .preserve(), called + * if the preservation process is aborted before the reboot (i.e. the session + * is closed). + * - freeze(): A final pre-reboot opportunity to prepare the state for kexec. + * We are already in the reboot syscall, and therefore userspace cannot mutate + * the file anymore. + * - unfreeze(): Undoes the actions of .freeze(), called if the live update + * is aborted after the freeze phase. + * - retrieve(): Reconstructs the file in the new kernel from the preserved + * handle. + * - finish(): Performs final check and cleanup in the new kernel. After a + * successful finish call, LUO gives up ownership of this file. + * + * File Preservation Lifecycle happy path: + * + * 1. Preserve (Normal Operation): A userspace agent preserves files one by one + * via an ioctl. For each file, luo_preserve_file() finds a compatible + * handler, calls its .preserve() operation, and creates an internal &struct + * luo_file to track the live state. + * + * 2. Freeze (Pre-Reboot): Just before the kexec, luo_file_freeze() is called. + * It iterates through all preserved files, calls their respective .freeze() + * operation, and serializes their final metadata (compatible string, token, + * and data handle) into a contiguous memory block for KHO. + * + * 3. Deserialize: After kexec, luo_file_deserialize() runs when the session + * gets deserialized (which is when /dev/liveupdate is first opened). It reads + * the serialized data from the KHO memory region and reconstructs the in-memory + * list of &struct luo_file instances for the new kernel, linking them to + * their corresponding handlers. + * + * 4. Retrieve (New Kernel - Userspace Ready): The userspace agent can now + * restore file descriptors by providing a token.
luo_retrieve_file() + * searches for the matching token, calls the handler's .retrieve() op to + * re-create the 'struct file', and returns a new FD. Files can be + * retrieved in ANY order. + * + * 5. Finish (New Kernel - Cleanup): Once session retrieval is complete, + * luo_file_finish() is called. It iterates through all files, invokes their + * .finish() operations for final cleanup, and releases all associated kernel + * resources. + * + * File Preservation Lifecycle unhappy paths: + * + * 1. Abort Before Reboot: If the userspace agent aborts the live update + * process before calling reboot (e.g., by closing the session file + * descriptor), the session's release handler calls + * luo_file_unpreserve_files(). This invokes the .unpreserve() callback on + * all preserved files, ensuring all allocated resources are cleaned up and + * returning the system to a clean state. + * + * 2. Freeze Failure: During the reboot() syscall, if any handler's .freeze() + * op fails, the .unfreeze() op is invoked on all previously *successful* + * freezes to roll back their state. The reboot() syscall then returns an + * error to userspace, canceling the live update. + * + * 3. Finish Failure: In the new kernel, if a handler's .can_finish() check + * fails, the luo_file_finish() operation is aborted. LUO retains ownership of + * all files within that session, including those that were not yet + * processed. The userspace agent can attempt to call the finish operation + * again later. If the issue cannot be resolved, these resources will be held + * by LUO until the next live update cycle, at which point they will be + * discarded.
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cleanup.h> +#include <linux/compiler.h> +#include <linux/err.h> +#include <linux/errno.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/io.h> +#include <linux/kexec_handover.h> +#include <linux/kho/abi/luo.h> +#include <linux/liveupdate.h> +#include <linux/module.h> +#include <linux/sizes.h> +#include <linux/slab.h> +#include <linux/string.h> +#include "luo_internal.h" + +static LIST_HEAD(luo_file_handler_list); + +/* 2 4K pages, give space for 128 files per file_set */ +#define LUO_FILE_PGCNT 2ul +#define LUO_FILE_MAX \ + ((LUO_FILE_PGCNT << PAGE_SHIFT) / sizeof(struct luo_file_ser)) + +/** + * struct luo_file - Represents a single preserved file instance. + * @fh: Pointer to the &struct liveupdate_file_handler that manages + * this type of file. + * @file: Pointer to the kernel's &struct file that is being preserved. + * This is NULL in the new kernel until the file is successfully + * retrieved. + * @serialized_data: The opaque u64 handle to the serialized state of the file. + * This handle is passed back to the handler's .freeze(), + * .retrieve(), and .finish() callbacks, allowing it to track + * and update its serialized state across phases. + * @private_data: Pointer to the private data for the file used to hold runtime + * state that is not preserved. Set by the handler's .preserve() + * callback, and must be freed in the handler's .unpreserve() + * callback. + * @retrieved: A flag indicating whether a user/kernel in the new kernel has + * successfully called retrieve() on this file. This prevents + * multiple retrieval attempts. + * @mutex: A mutex that protects the fields of this specific instance + * (e.g., @retrieved, @file), ensuring that operations like + * retrieving or finishing a file are atomic. + * @list: The list_head linking this instance into its parent + * file_set's list of preserved files. 
+ * @token: The user-provided unique token used to identify this file. + * + * This structure is the core in-kernel representation of a single file being + * managed through a live update. An instance is created by luo_preserve_file() + * to link a 'struct file' to its corresponding handler, a user-provided token, + * and the serialized state handle returned by the handler's .preserve() + * operation. + * + * These instances are tracked in a per-file_set list. The @serialized_data + * field, which holds a handle to the file's serialized state, may be updated + * during the .freeze() callback before being serialized for the next kernel. + * After reboot, these structures are recreated by luo_file_deserialize() and + * are finally cleaned up by luo_file_finish(). + */ +struct luo_file { + struct liveupdate_file_handler *fh; + struct file *file; + u64 serialized_data; + void *private_data; + bool retrieved; + struct mutex mutex; + struct list_head list; + u64 token; +}; + +static int luo_alloc_files_mem(struct luo_file_set *file_set) +{ + size_t size; + void *mem; + + if (file_set->files) + return 0; + + WARN_ON_ONCE(file_set->count); + + size = LUO_FILE_PGCNT << PAGE_SHIFT; + mem = kho_alloc_preserve(size); + if (IS_ERR(mem)) + return PTR_ERR(mem); + + file_set->files = mem; + + return 0; +} + +static void luo_free_files_mem(struct luo_file_set *file_set) +{ + /* If file_set has files, no need to free preservation memory */ + if (file_set->count) + return; + + if (!file_set->files) + return; + + kho_unpreserve_free(file_set->files); + file_set->files = NULL; +} + +static bool luo_token_is_used(struct luo_file_set *file_set, u64 token) +{ + struct luo_file *iter; + + list_for_each_entry(iter, &file_set->files_list, list) { + if (iter->token == token) + return true; + } + + return false; +} + +/** + * luo_preserve_file - Initiate the preservation of a file descriptor. + * @file_set: The file_set to which the preserved file will be added. 
+ * @token: A unique, user-provided identifier for the file. + * @fd: The file descriptor to be preserved. + * + * This function orchestrates the first phase of preserving a file. Upon entry, + * it takes a reference to the 'struct file' via fget(), effectively making LUO + * a co-owner of the file. This reference is held until the file is either + * unpreserved or successfully finished in the next kernel, preventing the file + * from being prematurely destroyed. + * + * It performs the following steps: + * + * 1. Validates that the @token is not already in use within the file_set. + * 2. Ensures the file_set's memory for files serialization is allocated + * (allocates if needed). + * 3. Iterates through registered handlers, calling can_preserve() to find one + * compatible with the given @fd. + * 4. Calls the handler's .preserve() operation, which saves the file's state + * and returns an opaque private data handle. + * 5. Adds the new instance to the file_set's internal list. + * + * On success, LUO takes a reference to the 'struct file' and considers it + * under its management until it is unpreserved or finished. + * + * In case of any failure, all intermediate allocations (file reference, memory + * for the 'luo_file' struct, etc.) are cleaned up before returning an error. + * + * Context: Can be called from an ioctl handler during normal system operation. + * Return: 0 on success. Returns a negative errno on failure: + * -EEXIST if the token is already used. + * -EBADF if the file descriptor is invalid. + * -ENOSPC if the file_set is full. + * -ENOENT if no compatible handler is found. + * -ENOMEM on memory allocation failure. + * Other errors might be returned by .preserve().
+ */ +int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd) +{ + struct liveupdate_file_op_args args = {0}; + struct liveupdate_file_handler *fh; + struct luo_file *luo_file; + struct file *file; + int err; + + if (luo_token_is_used(file_set, token)) + return -EEXIST; + + if (file_set->count == LUO_FILE_MAX) + return -ENOSPC; + + file = fget(fd); + if (!file) + return -EBADF; + + err = luo_alloc_files_mem(file_set); + if (err) + goto err_fput; + + err = -ENOENT; + luo_list_for_each_private(fh, &luo_file_handler_list, list) { + if (fh->ops->can_preserve(fh, file)) { + err = 0; + break; + } + } + + /* err is still -ENOENT if no handler was found */ + if (err) + goto err_free_files_mem; + + luo_file = kzalloc(sizeof(*luo_file), GFP_KERNEL); + if (!luo_file) { + err = -ENOMEM; + goto err_free_files_mem; + } + + luo_file->file = file; + luo_file->fh = fh; + luo_file->token = token; + luo_file->retrieved = false; + mutex_init(&luo_file->mutex); + + args.handler = fh; + args.file = file; + err = fh->ops->preserve(&args); + if (err) + goto err_kfree; + + luo_file->serialized_data = args.serialized_data; + luo_file->private_data = args.private_data; + list_add_tail(&luo_file->list, &file_set->files_list); + file_set->count++; + + return 0; + +err_kfree: + kfree(luo_file); +err_free_files_mem: + luo_free_files_mem(file_set); +err_fput: + fput(file); + + return err; +} + +/** + * luo_file_unpreserve_files - Unpreserves all files from a file_set. + * @file_set: The files to be cleaned up. + * + * This function serves as the primary cleanup path for a file_set. It is + * invoked when the userspace agent closes the file_set's file descriptor. + * + * For each file, it performs the following cleanup actions: + * 1. Calls the handler's .unpreserve() callback to allow the handler to + * release any resources it allocated. + * 2. Removes the file from the file_set's internal tracking list. + * 3. 
Releases the reference to the 'struct file' that was taken by + * luo_preserve_file() via fput(), returning ownership. + * 4. Frees the memory associated with the internal 'struct luo_file'. + * + * After all individual files are unpreserved, it frees the contiguous memory + * block that was allocated to hold their serialization data. + */ +void luo_file_unpreserve_files(struct luo_file_set *file_set) +{ + struct luo_file *luo_file; + + while (!list_empty(&file_set->files_list)) { + struct liveupdate_file_op_args args = {0}; + + luo_file = list_last_entry(&file_set->files_list, + struct luo_file, list); + + args.handler = luo_file->fh; + args.file = luo_file->file; + args.serialized_data = luo_file->serialized_data; + args.private_data = luo_file->private_data; + luo_file->fh->ops->unpreserve(&args); + + list_del(&luo_file->list); + file_set->count--; + + fput(luo_file->file); + mutex_destroy(&luo_file->mutex); + kfree(luo_file); + } + + luo_free_files_mem(file_set); +} + +static int luo_file_freeze_one(struct luo_file_set *file_set, + struct luo_file *luo_file) +{ + int err = 0; + + guard(mutex)(&luo_file->mutex); + + if (luo_file->fh->ops->freeze) { + struct liveupdate_file_op_args args = {0}; + + args.handler = luo_file->fh; + args.file = luo_file->file; + args.serialized_data = luo_file->serialized_data; + args.private_data = luo_file->private_data; + + err = luo_file->fh->ops->freeze(&args); + if (!err) + luo_file->serialized_data = args.serialized_data; + } + + return err; +} + +static void luo_file_unfreeze_one(struct luo_file_set *file_set, + struct luo_file *luo_file) +{ + guard(mutex)(&luo_file->mutex); + + if (luo_file->fh->ops->unfreeze) { + struct liveupdate_file_op_args args = {0}; + + args.handler = luo_file->fh; + args.file = luo_file->file; + args.serialized_data = luo_file->serialized_data; + args.private_data = luo_file->private_data; + + luo_file->fh->ops->unfreeze(&args); + } + + luo_file->serialized_data = 0; +} + +static void 
__luo_file_unfreeze(struct luo_file_set *file_set, + struct luo_file *failed_entry) +{ + struct list_head *files_list = &file_set->files_list; + struct luo_file *luo_file; + + list_for_each_entry(luo_file, files_list, list) { + if (luo_file == failed_entry) + break; + + luo_file_unfreeze_one(file_set, luo_file); + } + + memset(file_set->files, 0, LUO_FILE_PGCNT << PAGE_SHIFT); +} + +/** + * luo_file_freeze - Freezes all preserved files and serializes their metadata. + * @file_set: The file_set whose files are to be frozen. + * @file_set_ser: Where to put the serialized file_set. + * + * This function is called from the reboot() syscall path, just before the + * kernel transitions to the new image via kexec. Its purpose is to perform the + * final preparation and serialization of all preserved files in the file_set. + * + * It iterates through each preserved file in FIFO order (the order of + * preservation) and performs two main actions: + * + * 1. Freezes the File: It calls the handler's .freeze() callback for each + * file. This gives the handler a final opportunity to quiesce the device or + * prepare its state for the upcoming reboot. The handler may update its + * private data handle during this step. + * + * 2. Serializes Metadata: After a successful freeze, it copies the final file + * metadata—the handler's compatible string, the user token, and the final + * private data handle—into the pre-allocated contiguous memory buffer + * (file_set->files) that will be handed over to the next kernel via KHO. + * + * Error Handling (Rollback): + * This function is atomic. If any handler's .freeze() operation fails, the + * entire live update is aborted. The __luo_file_unfreeze() helper is + * immediately called to invoke the .unfreeze() op on all files that were + * successfully frozen before the point of failure, rolling them back to a + * running state. The function then returns an error, causing the reboot() + * syscall to fail. 
+ * + * Context: Called only from the liveupdate_reboot() path. + * Return: 0 on success, or a negative errno on failure. + */ +int luo_file_freeze(struct luo_file_set *file_set, + struct luo_file_set_ser *file_set_ser) +{ + struct luo_file_ser *file_ser = file_set->files; + struct luo_file *luo_file; + int err; + int i; + + if (!file_set->count) + return 0; + + if (WARN_ON(!file_ser)) + return -EINVAL; + + i = 0; + list_for_each_entry(luo_file, &file_set->files_list, list) { + err = luo_file_freeze_one(file_set, luo_file); + if (err < 0) { + pr_warn("Freeze failed for token[%#0llx] handler[%s] err[%pe]\n", + luo_file->token, luo_file->fh->compatible, + ERR_PTR(err)); + goto err_unfreeze; + } + + strscpy(file_ser[i].compatible, luo_file->fh->compatible, + sizeof(file_ser[i].compatible)); + file_ser[i].data = luo_file->serialized_data; + file_ser[i].token = luo_file->token; + i++; + } + + file_set_ser->count = file_set->count; + if (file_set->files) + file_set_ser->files = virt_to_phys(file_set->files); + + return 0; + +err_unfreeze: + __luo_file_unfreeze(file_set, luo_file); + + return err; +} + +/** + * luo_file_unfreeze - Unfreezes all files in a file_set and clear serialization + * @file_set: The file_set whose files are to be unfrozen. + * @file_set_ser: Serialized file_set. + * + * This function rolls back the state of all files in a file_set after the + * freeze phase has begun but must be aborted. It is the counterpart to + * luo_file_freeze(). + * + * It invokes the __luo_file_unfreeze() helper with a NULL argument, which + * signals the helper to iterate through all files in the file_set and call + * their respective .unfreeze() handler callbacks. + * + * Context: This is called when the live update is aborted during + * the reboot() syscall, after luo_file_freeze() has been called. 
+ */ +void luo_file_unfreeze(struct luo_file_set *file_set, + struct luo_file_set_ser *file_set_ser) +{ + if (!file_set->count) + return; + + __luo_file_unfreeze(file_set, NULL); + memset(file_set_ser, 0, sizeof(*file_set_ser)); +} + +/** + * luo_retrieve_file - Restores a preserved file from a file_set by its token. + * @file_set: The file_set from which to retrieve the file. + * @token: The unique token identifying the file to be restored. + * @filep: Output parameter; on success, this is populated with a pointer + * to the newly retrieved 'struct file'. + * + * This function is the primary mechanism for recreating a file in the new + * kernel after a live update. It searches the file_set's list of deserialized + * files for an entry matching the provided @token. + * + * The operation is idempotent: if a file has already been successfully + * retrieved, this function will simply return a pointer to the existing + * 'struct file' and report success without re-executing the retrieve + * operation. This is handled by checking the 'retrieved' flag under a lock. + * + * File retrieval can happen in any order; it is not bound by the order of + * preservation. + * + * Context: Can be called from an ioctl or other in-kernel code in the new + * kernel. + * Return: 0 on success. Returns a negative errno on failure: + * -ENOENT if no file with the matching token is found. + * Any error code returned by the handler's .retrieve() op. + */ +int luo_retrieve_file(struct luo_file_set *file_set, u64 token, + struct file **filep) +{ + struct liveupdate_file_op_args args = {0}; + struct luo_file *luo_file; + int err; + + if (list_empty(&file_set->files_list)) + return -ENOENT; + + list_for_each_entry(luo_file, &file_set->files_list, list) { + if (luo_file->token == token) + break; + } + + /* + * If the loop ran to completion the cursor is derived from the list + * head rather than a real entry, so check for that instead of + * reading ->token through it. + */ + if (list_entry_is_head(luo_file, &file_set->files_list, list)) + return -ENOENT; + + guard(mutex)(&luo_file->mutex); + if (luo_file->retrieved) { + /* + * Someone is asking for this file again, so get a reference + * for them.
+ */ + get_file(luo_file->file); + *filep = luo_file->file; + return 0; + } + + args.handler = luo_file->fh; + args.serialized_data = luo_file->serialized_data; + err = luo_file->fh->ops->retrieve(&args); + if (!err) { + luo_file->file = args.file; + + /* Get reference so we can keep this file in LUO until finish */ + get_file(luo_file->file); + *filep = luo_file->file; + luo_file->retrieved = true; + } + + return err; +} + +static int luo_file_can_finish_one(struct luo_file_set *file_set, + struct luo_file *luo_file) +{ + bool can_finish = true; + + guard(mutex)(&luo_file->mutex); + + if (luo_file->fh->ops->can_finish) { + struct liveupdate_file_op_args args = {0}; + + args.handler = luo_file->fh; + args.file = luo_file->file; + args.serialized_data = luo_file->serialized_data; + args.retrieved = luo_file->retrieved; + can_finish = luo_file->fh->ops->can_finish(&args); + } + + return can_finish ? 0 : -EBUSY; +} + +static void luo_file_finish_one(struct luo_file_set *file_set, + struct luo_file *luo_file) +{ + struct liveupdate_file_op_args args = {0}; + + guard(mutex)(&luo_file->mutex); + + args.handler = luo_file->fh; + args.file = luo_file->file; + args.serialized_data = luo_file->serialized_data; + args.retrieved = luo_file->retrieved; + + luo_file->fh->ops->finish(&args); +} + +/** + * luo_file_finish - Completes the lifecycle for all files in a file_set. + * @file_set: The file_set to be finalized. + * + * This function orchestrates the final teardown of a live update file_set in + * the new kernel. It should be called after all necessary files have been + * retrieved and the userspace agent is ready to release the preserved state. + * + * The function iterates through all tracked files. For each file, it performs + * the following sequence of cleanup actions: + * + * 1. If file is not yet retrieved, retrieves it, and calls can_finish() on + * every file in the file_set. If all can_finish return true, continue to + * finish. + * 2.
Calls the handler's .finish() callback (via luo_file_finish_one) to + * allow for final resource cleanup within the handler. + * 3. Releases LUO's ownership reference on the 'struct file' via fput(). This + * is the counterpart to the get_file() call in luo_retrieve_file(). + * 4. Removes the 'struct luo_file' from the file_set's internal list. + * 5. Frees the memory for the 'struct luo_file' instance itself. + * + * After successfully finishing all individual files, it frees the + * contiguous memory block that was used to transfer the serialized metadata + * from the previous kernel. + * + * Error Handling (Atomic Failure): + * This operation is atomic. If any handler's .can_finish() op fails, the entire + * function aborts immediately and returns an error. + * + * Context: Can be called from an ioctl handler in the new kernel. + * Return: 0 on success, or a negative errno on failure. + */ +int luo_file_finish(struct luo_file_set *file_set) +{ + struct list_head *files_list = &file_set->files_list; + struct luo_file *luo_file; + int err; + + if (!file_set->count) + return 0; + + list_for_each_entry(luo_file, files_list, list) { + err = luo_file_can_finish_one(file_set, luo_file); + if (err) + return err; + } + + while (!list_empty(&file_set->files_list)) { + luo_file = list_last_entry(&file_set->files_list, + struct luo_file, list); + + luo_file_finish_one(file_set, luo_file); + + if (luo_file->file) + fput(luo_file->file); + list_del(&luo_file->list); + file_set->count--; + mutex_destroy(&luo_file->mutex); + kfree(luo_file); + } + + if (file_set->files) { + kho_restore_free(file_set->files); + file_set->files = NULL; + } + + return 0; +} + +/** + * luo_file_deserialize - Reconstructs the list of preserved files in the new kernel. + * @file_set: The incoming file_set to fill with deserialized data. + * @file_set_ser: Serialized KHO file_set data from the previous kernel. + * + * This function is called during the early boot process of the new kernel. 
It + * takes the raw, contiguous memory block of 'struct luo_file_ser' entries, + * provided by the previous kernel, and transforms it back into a live, + * in-memory linked list of 'struct luo_file' instances. + * + * For each serialized entry, it performs the following steps: + * 1. Reads the 'compatible' string. + * 2. Searches the global list of registered file handlers for one that + * matches the compatible string. + * 3. Allocates a new 'struct luo_file'. + * 4. Populates the new structure with the deserialized data (token, private + * data handle) and links it to the found handler. The 'file' pointer is + * initialized to NULL, as the file has not been retrieved yet. + * 5. Adds the new 'struct luo_file' to the file_set's files_list. + * + * This prepares the file_set for userspace, which can later call + * luo_retrieve_file() to restore the actual file descriptors. + * + * Context: Called from session deserialization. + */ +int luo_file_deserialize(struct luo_file_set *file_set, + struct luo_file_set_ser *file_set_ser) +{ + struct luo_file_ser *file_ser; + u64 i; + + if (!file_set_ser->files) { + WARN_ON(file_set_ser->count); + return 0; + } + + file_set->count = file_set_ser->count; + file_set->files = phys_to_virt(file_set_ser->files); + + /* + * Note on error handling: + * + * If deserialization fails (e.g., allocation failure or corrupt data), + * we intentionally skip cleanup of files that were already restored. + * + * A partial failure leaves the preserved state inconsistent. + * Implementing a safe "undo" to unwind complex dependencies (sessions, + * files, hardware state) is error-prone and provides little value, as + * the system is effectively in a broken state. + * + * We treat these resources as leaked. The expected recovery path is for + * userspace to detect the failure and trigger a reboot, which will + * reliably reset devices and reclaim memory. 
+ */ + file_ser = file_set->files; + for (i = 0; i < file_set->count; i++) { + struct liveupdate_file_handler *fh; + bool handler_found = false; + struct luo_file *luo_file; + + luo_list_for_each_private(fh, &luo_file_handler_list, list) { + if (!strcmp(fh->compatible, file_ser[i].compatible)) { + handler_found = true; + break; + } + } + + if (!handler_found) { + pr_warn("No registered handler for compatible '%s'\n", + file_ser[i].compatible); + return -ENOENT; + } + + luo_file = kzalloc(sizeof(*luo_file), GFP_KERNEL); + if (!luo_file) + return -ENOMEM; + + luo_file->fh = fh; + luo_file->file = NULL; + luo_file->serialized_data = file_ser[i].data; + luo_file->token = file_ser[i].token; + luo_file->retrieved = false; + mutex_init(&luo_file->mutex); + list_add_tail(&luo_file->list, &file_set->files_list); + } + + return 0; +} + +void luo_file_set_init(struct luo_file_set *file_set) +{ + INIT_LIST_HEAD(&file_set->files_list); +} + +void luo_file_set_destroy(struct luo_file_set *file_set) +{ + WARN_ON(file_set->count); + WARN_ON(!list_empty(&file_set->files_list)); +} + +/** + * liveupdate_register_file_handler - Register a file handler with LUO. + * @fh: Pointer to a caller-allocated &struct liveupdate_file_handler. + * The caller must initialize this structure, including a unique + * 'compatible' string and a valid 'fh' callbacks. This function adds the + * handler to the global list of supported file handlers. + * + * Context: Typically called during module initialization for file types that + * support live update preservation. + * + * Return: 0 on success. Negative errno on failure. 
+ */ +int liveupdate_register_file_handler(struct liveupdate_file_handler *fh) +{ + struct liveupdate_file_handler *fh_iter; + int err; + + if (!liveupdate_enabled()) + return -EOPNOTSUPP; + + /* Sanity check that all required callbacks are set */ + if (!fh->ops->preserve || !fh->ops->unpreserve || !fh->ops->retrieve || + !fh->ops->finish || !fh->ops->can_preserve) { + return -EINVAL; + } + + /* + * Ensure the system is quiescent (no active sessions). + * This prevents registering new handlers while sessions are active or + * while deserialization is in progress. + */ + if (!luo_session_quiesce()) + return -EBUSY; + + /* Check for duplicate compatible strings */ + luo_list_for_each_private(fh_iter, &luo_file_handler_list, list) { + if (!strcmp(fh_iter->compatible, fh->compatible)) { + pr_err("File handler registration failed: Compatible string '%s' already registered.\n", + fh->compatible); + err = -EEXIST; + goto err_resume; + } + } + + /* Pin the module implementing the handler */ + if (!try_module_get(fh->ops->owner)) { + err = -EAGAIN; + goto err_resume; + } + + INIT_LIST_HEAD(&ACCESS_PRIVATE(fh, list)); + list_add_tail(&ACCESS_PRIVATE(fh, list), &luo_file_handler_list); + luo_session_resume(); + + return 0; + +err_resume: + luo_session_resume(); + return err; +} + +/** + * liveupdate_unregister_file_handler - Unregister a liveupdate file handler + * @fh: The file handler to unregister + * + * Unregisters the file handler from the liveupdate core. This function + * reverses the operations of liveupdate_register_file_handler(). + * + * It ensures safe removal by checking that: + * No live update session is currently in progress. + * + * If the unregistration fails, the internal test state is reverted. + * + * Return: 0 Success. -EOPNOTSUPP when live update is not enabled. -EBUSY A live + * update is in progress, can't quiesce live update. 
+ */ +int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) +{ + if (!liveupdate_enabled()) + return -EOPNOTSUPP; + + if (!luo_session_quiesce()) + return -EBUSY; + + list_del(&ACCESS_PRIVATE(fh, list)); + module_put(fh->ops->owner); + luo_session_resume(); + + return 0; +} diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h new file mode 100644 index 000000000000..c8973b543d1d --- /dev/null +++ b/kernel/liveupdate/luo_internal.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin <pasha.tatashin@soleen.com> + */ + +#ifndef _LINUX_LUO_INTERNAL_H +#define _LINUX_LUO_INTERNAL_H + +#include <linux/liveupdate.h> +#include <linux/uaccess.h> + +struct luo_ucmd { + void __user *ubuffer; + u32 user_size; + void *cmd; +}; + +static inline int luo_ucmd_respond(struct luo_ucmd *ucmd, + size_t kernel_cmd_size) +{ + /* + * Copy the minimum of what the user provided and what we actually + * have. + */ + if (copy_to_user(ucmd->ubuffer, ucmd->cmd, + min_t(size_t, ucmd->user_size, kernel_cmd_size))) { + return -EFAULT; + } + return 0; +} + +/* + * Handles a deserialization failure: devices and memory is in unpredictable + * state. + * + * Continuing the boot process after a failure is dangerous because it could + * lead to leaks of private data. + */ +#define luo_restore_fail(__fmt, ...) panic(__fmt, ##__VA_ARGS__) + +/* Mimics list_for_each_entry() but for private list head entries */ +#define luo_list_for_each_private(pos, head, member) \ + for (struct list_head *__iter = (head)->next; \ + __iter != (head) && \ + ({ pos = container_of(__iter, typeof(*(pos)), member); 1; }); \ + __iter = __iter->next) + +/** + * struct luo_file_set - A set of files that belong to the same sessions. + * @files_list: An ordered list of files associated with this session, it is + * ordered by preservation time. 
+ * @files: The physically contiguous memory block that holds the serialized + * state of files. + * @count: A counter tracking the number of files currently stored in the + * @files_list for this session. + */ +struct luo_file_set { + struct list_head files_list; + struct luo_file_ser *files; + long count; +}; + +/** + * struct luo_session - Represents an active or incoming Live Update session. + * @name: A unique name for this session, used for identification and + * retrieval. + * @ser: Pointer to the serialized data for this session. + * @list: A list_head member used to link this session into a global list + * of either outgoing (to be preserved) or incoming (restored from + * previous kernel) sessions. + * @retrieved: A boolean flag indicating whether this session has been + * retrieved by a consumer in the new kernel. + * @file_set: A set of files that belong to this session. + * @mutex: protects fields in the luo_session. + */ +struct luo_session { + char name[LIVEUPDATE_SESSION_NAME_LENGTH]; + struct luo_session_ser *ser; + struct list_head list; + bool retrieved; + struct luo_file_set file_set; + struct mutex mutex; +}; + +int luo_session_create(const char *name, struct file **filep); +int luo_session_retrieve(const char *name, struct file **filep); +int __init luo_session_setup_outgoing(void *fdt); +int __init luo_session_setup_incoming(void *fdt); +int luo_session_serialize(void); +int luo_session_deserialize(void); +bool luo_session_quiesce(void); +void luo_session_resume(void); + +int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd); +void luo_file_unpreserve_files(struct luo_file_set *file_set); +int luo_file_freeze(struct luo_file_set *file_set, + struct luo_file_set_ser *file_set_ser); +void luo_file_unfreeze(struct luo_file_set *file_set, + struct luo_file_set_ser *file_set_ser); +int luo_retrieve_file(struct luo_file_set *file_set, u64 token, + struct file **filep); +int luo_file_finish(struct luo_file_set *file_set); +int 
luo_file_deserialize(struct luo_file_set *file_set, + struct luo_file_set_ser *file_set_ser); +void luo_file_set_init(struct luo_file_set *file_set); +void luo_file_set_destroy(struct luo_file_set *file_set); + +#endif /* _LINUX_LUO_INTERNAL_H */ diff --git a/kernel/liveupdate/luo_session.c b/kernel/liveupdate/luo_session.c new file mode 100644 index 000000000000..dbdbc3bd7929 --- /dev/null +++ b/kernel/liveupdate/luo_session.c @@ -0,0 +1,646 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin <pasha.tatashin@soleen.com> + */ + +/** + * DOC: LUO Sessions + * + * LUO Sessions provide the core mechanism for grouping and managing `struct + * file *` instances that need to be preserved across a kexec-based live + * update. Each session acts as a named container for a set of file objects, + * allowing a userspace agent to manage the lifecycle of resources critical to a + * workload. + * + * Core Concepts: + * + * - Named Containers: Sessions are identified by a unique, user-provided name, + * which is used for both creation in the current kernel and retrieval in the + * next kernel. + * + * - Userspace Interface: Session management is driven from userspace via + * ioctls on /dev/liveupdate. + * + * - Serialization: Session metadata is preserved using the KHO framework. When + * a live update is triggered via kexec, an array of `struct luo_session_ser` + * is populated and placed in a preserved memory region. An FDT node is also + * created, containing the count of sessions and the physical address of this + * array. + * + * Session Lifecycle: + * + * 1. Creation: A userspace agent calls `luo_session_create()` to create a + * new, empty session and receives a file descriptor for it. + * + * 2. Serialization: When the `reboot(LINUX_REBOOT_CMD_KEXEC)` syscall is + * made, `luo_session_serialize()` is called. It iterates through all + * active sessions and writes their metadata into a memory area preserved + * by KHO. 
+ * + * 3. Deserialization (in new kernel): After kexec, `luo_session_deserialize()` + * runs, reading the serialized data and creating a list of `struct + * luo_session` objects representing the preserved sessions. + * + * 4. Retrieval: A userspace agent in the new kernel can then call + * `luo_session_retrieve()` with a session name to get a new file + * descriptor and access the preserved state. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/anon_inodes.h> +#include <linux/cleanup.h> +#include <linux/err.h> +#include <linux/errno.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/io.h> +#include <linux/kexec_handover.h> +#include <linux/kho/abi/luo.h> +#include <linux/libfdt.h> +#include <linux/list.h> +#include <linux/liveupdate.h> +#include <linux/mutex.h> +#include <linux/rwsem.h> +#include <linux/slab.h> +#include <linux/unaligned.h> +#include <uapi/linux/liveupdate.h> +#include "luo_internal.h" + +/* 16 4K pages, give space for 744 sessions */ +#define LUO_SESSION_PGCNT 16ul +#define LUO_SESSION_MAX (((LUO_SESSION_PGCNT << PAGE_SHIFT) - \ + sizeof(struct luo_session_header_ser)) / \ + sizeof(struct luo_session_ser)) + +/** + * struct luo_session_header - Header struct for managing LUO sessions. + * @count: The number of sessions currently tracked in the @list. + * @list: The head of the linked list of `struct luo_session` instances. + * @rwsem: A read-write semaphore providing synchronized access to the + * session list and other fields in this structure. + * @header_ser: The header data of serialization array. + * @ser: The serialized session data (an array of + * `struct luo_session_ser`). + * @active: Set to true when first initialized. If previous kernel did not + * send session data, active stays false for incoming. 
+ */ +struct luo_session_header { + long count; + struct list_head list; + struct rw_semaphore rwsem; + struct luo_session_header_ser *header_ser; + struct luo_session_ser *ser; + bool active; +}; + +/** + * struct luo_session_global - Global container for managing LUO sessions. + * @incoming: The sessions passed from the previous kernel. + * @outgoing: The sessions that are going to be passed to the next kernel. + */ +struct luo_session_global { + struct luo_session_header incoming; + struct luo_session_header outgoing; +}; + +static struct luo_session_global luo_session_global = { + .incoming = { + .list = LIST_HEAD_INIT(luo_session_global.incoming.list), + .rwsem = __RWSEM_INITIALIZER(luo_session_global.incoming.rwsem), + }, + .outgoing = { + .list = LIST_HEAD_INIT(luo_session_global.outgoing.list), + .rwsem = __RWSEM_INITIALIZER(luo_session_global.outgoing.rwsem), + }, +}; + +static struct luo_session *luo_session_alloc(const char *name) +{ + struct luo_session *session = kzalloc(sizeof(*session), GFP_KERNEL); + + if (!session) + return ERR_PTR(-ENOMEM); + + strscpy(session->name, name, sizeof(session->name)); + INIT_LIST_HEAD(&session->file_set.files_list); + luo_file_set_init(&session->file_set); + INIT_LIST_HEAD(&session->list); + mutex_init(&session->mutex); + + return session; +} + +static void luo_session_free(struct luo_session *session) +{ + luo_file_set_destroy(&session->file_set); + mutex_destroy(&session->mutex); + kfree(session); +} + +static int luo_session_insert(struct luo_session_header *sh, + struct luo_session *session) +{ + struct luo_session *it; + + guard(rwsem_write)(&sh->rwsem); + + /* + * For outgoing we should make sure there is room in serialization array + * for new session. 
+ */ + if (sh == &luo_session_global.outgoing) { + if (sh->count == LUO_SESSION_MAX) + return -ENOMEM; + } + + /* + * For small number of sessions this loop won't hurt performance + * but if we ever start using a lot of sessions, this might + * become a bottle neck during deserialization time, as it would + * cause O(n*n) complexity. + */ + list_for_each_entry(it, &sh->list, list) { + if (!strncmp(it->name, session->name, sizeof(it->name))) + return -EEXIST; + } + list_add_tail(&session->list, &sh->list); + sh->count++; + + return 0; +} + +static void luo_session_remove(struct luo_session_header *sh, + struct luo_session *session) +{ + guard(rwsem_write)(&sh->rwsem); + list_del(&session->list); + sh->count--; +} + +static int luo_session_finish_one(struct luo_session *session) +{ + guard(mutex)(&session->mutex); + return luo_file_finish(&session->file_set); +} + +static void luo_session_unfreeze_one(struct luo_session *session, + struct luo_session_ser *ser) +{ + guard(mutex)(&session->mutex); + luo_file_unfreeze(&session->file_set, &ser->file_set_ser); +} + +static int luo_session_freeze_one(struct luo_session *session, + struct luo_session_ser *ser) +{ + guard(mutex)(&session->mutex); + return luo_file_freeze(&session->file_set, &ser->file_set_ser); +} + +static int luo_session_release(struct inode *inodep, struct file *filep) +{ + struct luo_session *session = filep->private_data; + struct luo_session_header *sh; + + /* If retrieved is set, it means this session is from incoming list */ + if (session->retrieved) { + int err = luo_session_finish_one(session); + + if (err) { + pr_warn("Unable to finish session [%s] on release\n", + session->name); + return err; + } + sh = &luo_session_global.incoming; + } else { + scoped_guard(mutex, &session->mutex) + luo_file_unpreserve_files(&session->file_set); + sh = &luo_session_global.outgoing; + } + + luo_session_remove(sh, session); + luo_session_free(session); + + return 0; +} + +static int luo_session_preserve_fd(struct 
luo_session *session, + struct luo_ucmd *ucmd) +{ + struct liveupdate_session_preserve_fd *argp = ucmd->cmd; + int err; + + guard(mutex)(&session->mutex); + err = luo_preserve_file(&session->file_set, argp->token, argp->fd); + if (err) + return err; + + err = luo_ucmd_respond(ucmd, sizeof(*argp)); + if (err) + pr_warn("The file was successfully preserved, but response to user failed\n"); + + return err; +} + +static int luo_session_retrieve_fd(struct luo_session *session, + struct luo_ucmd *ucmd) +{ + struct liveupdate_session_retrieve_fd *argp = ucmd->cmd; + struct file *file; + int err; + + argp->fd = get_unused_fd_flags(O_CLOEXEC); + if (argp->fd < 0) + return argp->fd; + + guard(mutex)(&session->mutex); + err = luo_retrieve_file(&session->file_set, argp->token, &file); + if (err < 0) + goto err_put_fd; + + err = luo_ucmd_respond(ucmd, sizeof(*argp)); + if (err) + goto err_put_file; + + fd_install(argp->fd, file); + + return 0; + +err_put_file: + fput(file); +err_put_fd: + put_unused_fd(argp->fd); + + return err; +} + +static int luo_session_finish(struct luo_session *session, + struct luo_ucmd *ucmd) +{ + struct liveupdate_session_finish *argp = ucmd->cmd; + int err = luo_session_finish_one(session); + + if (err) + return err; + + return luo_ucmd_respond(ucmd, sizeof(*argp)); +} + +union ucmd_buffer { + struct liveupdate_session_finish finish; + struct liveupdate_session_preserve_fd preserve; + struct liveupdate_session_retrieve_fd retrieve; +}; + +struct luo_ioctl_op { + unsigned int size; + unsigned int min_size; + unsigned int ioctl_num; + int (*execute)(struct luo_session *session, struct luo_ucmd *ucmd); +}; + +#define IOCTL_OP(_ioctl, _fn, _struct, _last) \ + [_IOC_NR(_ioctl) - LIVEUPDATE_CMD_SESSION_BASE] = { \ + .size = sizeof(_struct) + \ + BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \ + sizeof(_struct)), \ + .min_size = offsetofend(_struct, _last), \ + .ioctl_num = _ioctl, \ + .execute = _fn, \ + } + +static const struct luo_ioctl_op 
luo_session_ioctl_ops[] = { + IOCTL_OP(LIVEUPDATE_SESSION_FINISH, luo_session_finish, + struct liveupdate_session_finish, reserved), + IOCTL_OP(LIVEUPDATE_SESSION_PRESERVE_FD, luo_session_preserve_fd, + struct liveupdate_session_preserve_fd, token), + IOCTL_OP(LIVEUPDATE_SESSION_RETRIEVE_FD, luo_session_retrieve_fd, + struct liveupdate_session_retrieve_fd, token), +}; + +static long luo_session_ioctl(struct file *filep, unsigned int cmd, + unsigned long arg) +{ + struct luo_session *session = filep->private_data; + const struct luo_ioctl_op *op; + struct luo_ucmd ucmd = {}; + union ucmd_buffer buf; + unsigned int nr; + int ret; + + nr = _IOC_NR(cmd); + if (nr < LIVEUPDATE_CMD_SESSION_BASE || (nr - LIVEUPDATE_CMD_SESSION_BASE) >= + ARRAY_SIZE(luo_session_ioctl_ops)) { + return -EINVAL; + } + + ucmd.ubuffer = (void __user *)arg; + ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer); + if (ret) + return ret; + + op = &luo_session_ioctl_ops[nr - LIVEUPDATE_CMD_SESSION_BASE]; + if (op->ioctl_num != cmd) + return -ENOIOCTLCMD; + if (ucmd.user_size < op->min_size) + return -EINVAL; + + ucmd.cmd = &buf; + ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer, + ucmd.user_size); + if (ret) + return ret; + + return op->execute(session, &ucmd); +} + +static const struct file_operations luo_session_fops = { + .owner = THIS_MODULE, + .release = luo_session_release, + .unlocked_ioctl = luo_session_ioctl, +}; + +/* Create a "struct file" for session */ +static int luo_session_getfile(struct luo_session *session, struct file **filep) +{ + char name_buf[128]; + struct file *file; + + lockdep_assert_held(&session->mutex); + snprintf(name_buf, sizeof(name_buf), "[luo_session] %s", session->name); + file = anon_inode_getfile(name_buf, &luo_session_fops, session, O_RDWR); + if (IS_ERR(file)) + return PTR_ERR(file); + + *filep = file; + + return 0; +} + +int luo_session_create(const char *name, struct file **filep) +{ + struct luo_session *session; + int err; + + session = 
luo_session_alloc(name); + if (IS_ERR(session)) + return PTR_ERR(session); + + err = luo_session_insert(&luo_session_global.outgoing, session); + if (err) + goto err_free; + + scoped_guard(mutex, &session->mutex) + err = luo_session_getfile(session, filep); + if (err) + goto err_remove; + + return 0; + +err_remove: + luo_session_remove(&luo_session_global.outgoing, session); +err_free: + luo_session_free(session); + + return err; +} + +int luo_session_retrieve(const char *name, struct file **filep) +{ + struct luo_session_header *sh = &luo_session_global.incoming; + struct luo_session *session = NULL; + struct luo_session *it; + int err; + + scoped_guard(rwsem_read, &sh->rwsem) { + list_for_each_entry(it, &sh->list, list) { + if (!strncmp(it->name, name, sizeof(it->name))) { + session = it; + break; + } + } + } + + if (!session) + return -ENOENT; + + guard(mutex)(&session->mutex); + if (session->retrieved) + return -EINVAL; + + err = luo_session_getfile(session, filep); + if (!err) + session->retrieved = true; + + return err; +} + +int __init luo_session_setup_outgoing(void *fdt_out) +{ + struct luo_session_header_ser *header_ser; + u64 header_ser_pa; + int err; + + header_ser = kho_alloc_preserve(LUO_SESSION_PGCNT << PAGE_SHIFT); + if (IS_ERR(header_ser)) + return PTR_ERR(header_ser); + header_ser_pa = virt_to_phys(header_ser); + + err = fdt_begin_node(fdt_out, LUO_FDT_SESSION_NODE_NAME); + err |= fdt_property_string(fdt_out, "compatible", + LUO_FDT_SESSION_COMPATIBLE); + err |= fdt_property(fdt_out, LUO_FDT_SESSION_HEADER, &header_ser_pa, + sizeof(header_ser_pa)); + err |= fdt_end_node(fdt_out); + + if (err) + goto err_unpreserve; + + luo_session_global.outgoing.header_ser = header_ser; + luo_session_global.outgoing.ser = (void *)(header_ser + 1); + luo_session_global.outgoing.active = true; + + return 0; + +err_unpreserve: + kho_unpreserve_free(header_ser); + return err; +} + +int __init luo_session_setup_incoming(void *fdt_in) +{ + struct luo_session_header_ser 
*header_ser; + int err, header_size, offset; + u64 header_ser_pa; + const void *ptr; + + offset = fdt_subnode_offset(fdt_in, 0, LUO_FDT_SESSION_NODE_NAME); + if (offset < 0) { + pr_err("Unable to get session node: [%s]\n", + LUO_FDT_SESSION_NODE_NAME); + return -EINVAL; + } + + err = fdt_node_check_compatible(fdt_in, offset, + LUO_FDT_SESSION_COMPATIBLE); + if (err) { + pr_err("Session node incompatible [%s]\n", + LUO_FDT_SESSION_COMPATIBLE); + return -EINVAL; + } + + header_size = 0; + ptr = fdt_getprop(fdt_in, offset, LUO_FDT_SESSION_HEADER, &header_size); + if (!ptr || header_size != sizeof(u64)) { + pr_err("Unable to get session header '%s' [%d]\n", + LUO_FDT_SESSION_HEADER, header_size); + return -EINVAL; + } + + header_ser_pa = get_unaligned((u64 *)ptr); + header_ser = phys_to_virt(header_ser_pa); + + luo_session_global.incoming.header_ser = header_ser; + luo_session_global.incoming.ser = (void *)(header_ser + 1); + luo_session_global.incoming.active = true; + + return 0; +} + +int luo_session_deserialize(void) +{ + struct luo_session_header *sh = &luo_session_global.incoming; + static bool is_deserialized; + static int err; + + /* If has been deserialized, always return the same error code */ + if (is_deserialized) + return err; + + is_deserialized = true; + if (!sh->active) + return 0; + + /* + * Note on error handling: + * + * If deserialization fails (e.g., allocation failure or corrupt data), + * we intentionally skip cleanup of sessions that were already restored. + * + * A partial failure leaves the preserved state inconsistent. + * Implementing a safe "undo" to unwind complex dependencies (sessions, + * files, hardware state) is error-prone and provides little value, as + * the system is effectively in a broken state. + * + * We treat these resources as leaked. The expected recovery path is for + * userspace to detect the failure and trigger a reboot, which will + * reliably reset devices and reclaim memory. 
+ */ + for (int i = 0; i < sh->header_ser->count; i++) { + struct luo_session *session; + + session = luo_session_alloc(sh->ser[i].name); + if (IS_ERR(session)) { + pr_warn("Failed to allocate session [%s] during deserialization %pe\n", + sh->ser[i].name, session); + return PTR_ERR(session); + } + + err = luo_session_insert(sh, session); + if (err) { + pr_warn("Failed to insert session [%s] %pe\n", + session->name, ERR_PTR(err)); + luo_session_free(session); + return err; + } + + scoped_guard(mutex, &session->mutex) { + luo_file_deserialize(&session->file_set, + &sh->ser[i].file_set_ser); + } + } + + kho_restore_free(sh->header_ser); + sh->header_ser = NULL; + sh->ser = NULL; + + return 0; +} + +int luo_session_serialize(void) +{ + struct luo_session_header *sh = &luo_session_global.outgoing; + struct luo_session *session; + int i = 0; + int err; + + guard(rwsem_write)(&sh->rwsem); + list_for_each_entry(session, &sh->list, list) { + err = luo_session_freeze_one(session, &sh->ser[i]); + if (err) + goto err_undo; + + strscpy(sh->ser[i].name, session->name, + sizeof(sh->ser[i].name)); + i++; + } + sh->header_ser->count = sh->count; + + return 0; + +err_undo: + list_for_each_entry_continue_reverse(session, &sh->list, list) { + i--; + luo_session_unfreeze_one(session, &sh->ser[i]); + memset(sh->ser[i].name, 0, sizeof(sh->ser[i].name)); + } + + return err; +} + +/** + * luo_session_quiesce - Ensure no active sessions exist and lock session lists. + * + * Acquires exclusive write locks on both incoming and outgoing session lists. + * It then validates no sessions exist in either list. + * + * This mechanism is used during file handler un/registration to ensure that no + * sessions are currently using the handler, and no new sessions can be created + * while un/registration is in progress. + * + * This prevents registering new handlers while sessions are active or + * while deserialization is in progress. 
+ * + * Return: + * true - System is quiescent (0 sessions) and locked. + * false - Active sessions exist. The locks are released internally. + */ +bool luo_session_quiesce(void) +{ + down_write(&luo_session_global.incoming.rwsem); + down_write(&luo_session_global.outgoing.rwsem); + + if (luo_session_global.incoming.count || + luo_session_global.outgoing.count) { + up_write(&luo_session_global.outgoing.rwsem); + up_write(&luo_session_global.incoming.rwsem); + return false; + } + + return true; +} + +/** + * luo_session_resume - Unlock session lists and resume normal activity. + * + * Releases the exclusive locks acquired by a successful call to + * luo_session_quiesce(). + */ +void luo_session_resume(void) +{ + up_write(&luo_session_global.outgoing.rwsem); + up_write(&luo_session_global.incoming.rwsem); +} diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index ce0362f0a871..6567e5eeacc0 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -103,8 +103,8 @@ static const struct kernel_param_ops lt_bind_ops = { .get = param_get_cpumask, }; -module_param_cb(bind_readers, <_bind_ops, &bind_readers, 0644); -module_param_cb(bind_writers, <_bind_ops, &bind_writers, 0644); +module_param_cb(bind_readers, <_bind_ops, &bind_readers, 0444); +module_param_cb(bind_writers, <_bind_ops, &bind_writers, 0444); long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool dowarn); @@ -1211,6 +1211,10 @@ end: cxt.cur_ops->exit(); cxt.init_called = false; } + + free_cpumask_var(bind_readers); + free_cpumask_var(bind_writers); + torture_cleanup_end(); } diff --git a/kernel/module/main.c b/kernel/module/main.c index 7b3ec2fa6e7c..710ee30b3bea 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -954,7 +954,7 @@ size_t module_flags_taint(unsigned long taints, char *buf) int i; for (i = 0; i < TAINT_FLAGS_COUNT; i++) { - if (taint_flags[i].module && test_bit(i, &taints)) + if (test_bit(i, &taints)) buf[l++] = 
taint_flags[i].c_true; } diff --git a/kernel/panic.c b/kernel/panic.c index b2f2470af7e5..0d52210a9e2b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -401,7 +401,7 @@ static void panic_trigger_all_cpu_backtrace(void) */ static void panic_other_cpus_shutdown(bool crash_kexec) { - if (panic_print & SYS_INFO_ALL_CPU_BT) + if (panic_print & SYS_INFO_ALL_BT) panic_trigger_all_cpu_backtrace(); /* @@ -628,38 +628,40 @@ void panic(const char *fmt, ...) } EXPORT_SYMBOL(panic); -#define TAINT_FLAG(taint, _c_true, _c_false, _module) \ +#define TAINT_FLAG(taint, _c_true, _c_false) \ [ TAINT_##taint ] = { \ .c_true = _c_true, .c_false = _c_false, \ - .module = _module, \ .desc = #taint, \ } /* - * TAINT_FORCED_RMMOD could be a per-module flag but the module - * is being removed anyway. + * NOTE: if you modify the taint_flags or TAINT_FLAGS_COUNT, + * please also modify tools/debugging/kernel-chktaint and + * Documentation/admin-guide/tainted-kernels.rst, including its + * small shell script that prints the TAINT_FLAGS_COUNT bits of + * /proc/sys/kernel/tainted. 
*/ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { - TAINT_FLAG(PROPRIETARY_MODULE, 'P', 'G', true), - TAINT_FLAG(FORCED_MODULE, 'F', ' ', true), - TAINT_FLAG(CPU_OUT_OF_SPEC, 'S', ' ', false), - TAINT_FLAG(FORCED_RMMOD, 'R', ' ', false), - TAINT_FLAG(MACHINE_CHECK, 'M', ' ', false), - TAINT_FLAG(BAD_PAGE, 'B', ' ', false), - TAINT_FLAG(USER, 'U', ' ', false), - TAINT_FLAG(DIE, 'D', ' ', false), - TAINT_FLAG(OVERRIDDEN_ACPI_TABLE, 'A', ' ', false), - TAINT_FLAG(WARN, 'W', ' ', false), - TAINT_FLAG(CRAP, 'C', ' ', true), - TAINT_FLAG(FIRMWARE_WORKAROUND, 'I', ' ', false), - TAINT_FLAG(OOT_MODULE, 'O', ' ', true), - TAINT_FLAG(UNSIGNED_MODULE, 'E', ' ', true), - TAINT_FLAG(SOFTLOCKUP, 'L', ' ', false), - TAINT_FLAG(LIVEPATCH, 'K', ' ', true), - TAINT_FLAG(AUX, 'X', ' ', true), - TAINT_FLAG(RANDSTRUCT, 'T', ' ', true), - TAINT_FLAG(TEST, 'N', ' ', true), - TAINT_FLAG(FWCTL, 'J', ' ', true), + TAINT_FLAG(PROPRIETARY_MODULE, 'P', 'G'), + TAINT_FLAG(FORCED_MODULE, 'F', ' '), + TAINT_FLAG(CPU_OUT_OF_SPEC, 'S', ' '), + TAINT_FLAG(FORCED_RMMOD, 'R', ' '), + TAINT_FLAG(MACHINE_CHECK, 'M', ' '), + TAINT_FLAG(BAD_PAGE, 'B', ' '), + TAINT_FLAG(USER, 'U', ' '), + TAINT_FLAG(DIE, 'D', ' '), + TAINT_FLAG(OVERRIDDEN_ACPI_TABLE, 'A', ' '), + TAINT_FLAG(WARN, 'W', ' '), + TAINT_FLAG(CRAP, 'C', ' '), + TAINT_FLAG(FIRMWARE_WORKAROUND, 'I', ' '), + TAINT_FLAG(OOT_MODULE, 'O', ' '), + TAINT_FLAG(UNSIGNED_MODULE, 'E', ' '), + TAINT_FLAG(SOFTLOCKUP, 'L', ' '), + TAINT_FLAG(LIVEPATCH, 'K', ' '), + TAINT_FLAG(AUX, 'X', ' '), + TAINT_FLAG(RANDSTRUCT, 'T', ' '), + TAINT_FLAG(TEST, 'N', ' '), + TAINT_FLAG(FWCTL, 'J', ' '), }; #undef TAINT_FLAG diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index f72bbfa266d6..5f5f626f4279 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -3,7 +3,6 @@ * internal.h - printk internal definitions */ #include <linux/console.h> -#include <linux/percpu.h> #include <linux/types.h> #if defined(CONFIG_PRINTK) && 
defined(CONFIG_SYSCTL) @@ -112,47 +111,6 @@ bool nbcon_kthread_create(struct console *con); void nbcon_kthread_stop(struct console *con); void nbcon_kthreads_wake(void); -/* - * Check if the given console is currently capable and allowed to print - * records. Note that this function does not consider the current context, - * which can also play a role in deciding if @con can be used to print - * records. - */ -static inline bool console_is_usable(struct console *con, short flags, bool use_atomic) -{ - if (!(flags & CON_ENABLED)) - return false; - - if ((flags & CON_SUSPENDED)) - return false; - - if (flags & CON_NBCON) { - /* The write_atomic() callback is optional. */ - if (use_atomic && !con->write_atomic) - return false; - - /* - * For the !use_atomic case, @printk_kthreads_running is not - * checked because the write_thread() callback is also used - * via the legacy loop when the printer threads are not - * available. - */ - } else { - if (!con->write) - return false; - } - - /* - * Console drivers may assume that per-cpu resources have been - * allocated. So unless they're explicitly marked as being able to - * cope (CON_ANYTIME) don't call them until this CPU is officially up. - */ - if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME)) - return false; - - return true; -} - /** * nbcon_kthread_wake - Wake up a console printing thread * @con: Console to operate on @@ -204,9 +162,6 @@ static inline bool nbcon_legacy_emit_next_record(struct console *con, bool *hand static inline void nbcon_kthread_wake(struct console *con) { } static inline void nbcon_kthreads_wake(void) { } -static inline bool console_is_usable(struct console *con, short flags, - bool use_atomic) { return false; } - #endif /* CONFIG_PRINTK */ extern bool have_boot_console; @@ -230,6 +185,8 @@ struct console_flush_type { bool legacy_offload; }; +extern bool console_irqwork_blocked; + /* * Identify which console flushing methods should be used in the context of * the caller. 
@@ -241,7 +198,7 @@ static inline void printk_get_console_flush_type(struct console_flush_type *ft) switch (nbcon_get_default_prio()) { case NBCON_PRIO_NORMAL: if (have_nbcon_console && !have_boot_console) { - if (printk_kthreads_running) + if (printk_kthreads_running && !console_irqwork_blocked) ft->nbcon_offload = true; else ft->nbcon_atomic = true; @@ -251,7 +208,7 @@ static inline void printk_get_console_flush_type(struct console_flush_type *ft) if (have_legacy_console || have_boot_console) { if (!is_printk_legacy_deferred()) ft->legacy_direct = true; - else + else if (!console_irqwork_blocked) ft->legacy_offload = true; } break; @@ -264,7 +221,7 @@ static inline void printk_get_console_flush_type(struct console_flush_type *ft) if (have_legacy_console || have_boot_console) { if (!is_printk_legacy_deferred()) ft->legacy_direct = true; - else + else if (!console_irqwork_blocked) ft->legacy_offload = true; } break; diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index 558ef3177976..3fa403f9831f 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -10,6 +10,7 @@ #include <linux/export.h> #include <linux/init.h> #include <linux/irqflags.h> +#include <linux/kdb.h> #include <linux/kthread.h> #include <linux/minmax.h> #include <linux/panic.h> @@ -118,6 +119,9 @@ * from scratch. */ +/* Counter of active nbcon emergency contexts. */ +static atomic_t nbcon_cpu_emergency_cnt = ATOMIC_INIT(0); + /** * nbcon_state_set - Helper function to set the console state * @con: Console to update @@ -249,13 +253,16 @@ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, * since all non-panic CPUs are stopped during panic(), it * is safer to have them avoid gaining console ownership. * - * If this acquire is a reacquire (and an unsafe takeover + * One exception is when kdb has locked for printing on this CPU. 
+ * + * Second exception is a reacquire (and an unsafe takeover * has not previously occurred) then it is allowed to attempt * a direct acquire in panic. This gives console drivers an * opportunity to perform any necessary cleanup if they were * interrupted by the panic CPU while printing. */ if (panic_on_other_cpu() && + !kdb_printf_on_this_cpu() && (!is_reacquire || cur->unsafe_takeover)) { return -EPERM; } @@ -850,8 +857,8 @@ out: return nbcon_context_can_proceed(ctxt, &cur); } -static void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt, - char *buf, unsigned int len) +void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt, + char *buf, unsigned int len) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); struct console *con = ctxt->console; @@ -1163,6 +1170,17 @@ static bool nbcon_kthread_should_wakeup(struct console *con, struct nbcon_contex if (kthread_should_stop()) return true; + /* + * Block the kthread when the system is in an emergency or panic mode. + * It increases the chance that these contexts would be able to show + * the messages directly. And it reduces the risk of interrupted writes + * where the context with a higher priority takes over the nbcon console + * ownership in the middle of a message. + */ + if (unlikely(atomic_read(&nbcon_cpu_emergency_cnt)) || + unlikely(panic_in_progress())) + return false; + cookie = console_srcu_read_lock(); flags = console_srcu_read_flags(con); @@ -1214,6 +1232,14 @@ wait_for_event: if (kthread_should_stop()) return 0; + /* + * Block the kthread when the system is in an emergency or panic + * mode. See nbcon_kthread_should_wakeup() for more details. + */ + if (unlikely(atomic_read(&nbcon_cpu_emergency_cnt)) || + unlikely(panic_in_progress())) + goto wait_for_event; + backlog = false; /* @@ -1276,6 +1302,13 @@ void nbcon_kthreads_wake(void) if (!printk_kthreads_running) return; + /* + * It is not allowed to call this function when console irq_work + * is blocked. 
+ */ + if (WARN_ON_ONCE(console_irqwork_blocked)) + return; + cookie = console_srcu_read_lock(); for_each_console_srcu(con) { if (!(console_srcu_read_flags(con) & CON_NBCON)) @@ -1404,6 +1437,26 @@ enum nbcon_prio nbcon_get_default_prio(void) return NBCON_PRIO_NORMAL; } +/* + * Track if it is allowed to perform unsafe hostile takeovers of console + * ownership. When true, console drivers might perform unsafe actions while + * printing. It is externally available via nbcon_allow_unsafe_takeover(). + */ +static bool panic_nbcon_allow_unsafe_takeover; + +/** + * nbcon_allow_unsafe_takeover - Check if unsafe console takeovers are allowed + * + * Return: True, when it is permitted to perform unsafe console printing + * + * This is also used by console_is_usable() to determine if it is allowed to + * call write_atomic() callbacks flagged as unsafe (CON_NBCON_ATOMIC_UNSAFE). + */ +bool nbcon_allow_unsafe_takeover(void) +{ + return panic_on_this_cpu() && panic_nbcon_allow_unsafe_takeover; +} + /** * nbcon_legacy_emit_next_record - Print one record for an nbcon console * in legacy contexts @@ -1474,7 +1527,6 @@ bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, * write_atomic() callback * @con: The nbcon console to flush * @stop_seq: Flush up until this record - * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers * * Return: 0 if @con was flushed up to @stop_seq Otherwise, error code on * failure. @@ -1493,8 +1545,7 @@ bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, * returned, it cannot be expected that the unfinalized record will become * available. 
*/ -static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, - bool allow_unsafe_takeover) +static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq) { struct nbcon_write_context wctxt = { }; struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); @@ -1503,12 +1554,12 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, ctxt->console = con; ctxt->spinwait_max_us = 2000; ctxt->prio = nbcon_get_default_prio(); - ctxt->allow_unsafe_takeover = allow_unsafe_takeover; - - if (!nbcon_context_try_acquire(ctxt, false)) - return -EPERM; + ctxt->allow_unsafe_takeover = nbcon_allow_unsafe_takeover(); while (nbcon_seq_read(con) < stop_seq) { + if (!nbcon_context_try_acquire(ctxt, false)) + return -EPERM; + /* * nbcon_emit_next_record() returns false when the console was * handed over or taken over. In both cases the context is no @@ -1517,6 +1568,8 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, if (!nbcon_emit_next_record(&wctxt, true)) return -EAGAIN; + nbcon_context_release(ctxt); + if (!ctxt->backlog) { /* Are there reserved but not yet finalized records? */ if (nbcon_seq_read(con) < stop_seq) @@ -1525,7 +1578,6 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, } } - nbcon_context_release(ctxt); return err; } @@ -1534,15 +1586,13 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, * write_atomic() callback * @con: The nbcon console to flush * @stop_seq: Flush up until this record - * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers * * This will stop flushing before @stop_seq if another context has ownership. * That context is then responsible for the flushing. Likewise, if new records * are added while this context was flushing and there is no other context * to handle the printing, this context must also flush those records. 
*/ -static void nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, - bool allow_unsafe_takeover) +static void nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq) { struct console_flush_type ft; unsigned long flags; @@ -1557,7 +1607,7 @@ again: */ local_irq_save(flags); - err = __nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover); + err = __nbcon_atomic_flush_pending_con(con, stop_seq); local_irq_restore(flags); @@ -1589,9 +1639,8 @@ again: * __nbcon_atomic_flush_pending - Flush all nbcon consoles using their * write_atomic() callback * @stop_seq: Flush up until this record - * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers */ -static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeover) +static void __nbcon_atomic_flush_pending(u64 stop_seq) { struct console *con; int cookie; @@ -1609,7 +1658,7 @@ static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeove if (nbcon_seq_read(con) >= stop_seq) continue; - nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover); + nbcon_atomic_flush_pending_con(con, stop_seq); } console_srcu_read_unlock(cookie); } @@ -1625,7 +1674,7 @@ static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeove */ void nbcon_atomic_flush_pending(void) { - __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), false); + __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb)); } /** @@ -1637,7 +1686,9 @@ void nbcon_atomic_flush_pending(void) */ void nbcon_atomic_flush_unsafe(void) { - __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), true); + panic_nbcon_allow_unsafe_takeover = true; + __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb)); + panic_nbcon_allow_unsafe_takeover = false; } /** @@ -1655,6 +1706,8 @@ void nbcon_cpu_emergency_enter(void) preempt_disable(); + atomic_inc(&nbcon_cpu_emergency_cnt); + cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); (*cpu_emergency_nesting)++; } @@ 
-1669,10 +1722,24 @@ void nbcon_cpu_emergency_exit(void) unsigned int *cpu_emergency_nesting; cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); - if (!WARN_ON_ONCE(*cpu_emergency_nesting == 0)) (*cpu_emergency_nesting)--; + /* + * Wake up kthreads because there might be some pending messages + * added by other CPUs with normal priority since the last flush + * in the emergency context. + */ + if (!WARN_ON_ONCE(atomic_read(&nbcon_cpu_emergency_cnt) == 0)) { + if (atomic_dec_return(&nbcon_cpu_emergency_cnt) == 0) { + struct console_flush_type ft; + + printk_get_console_flush_type(&ft); + if (ft.nbcon_offload) + nbcon_kthreads_wake(); + } + } + preempt_enable(); } @@ -1844,14 +1911,75 @@ void nbcon_device_release(struct console *con) * using the legacy loop. */ if (ft.nbcon_atomic) { - __nbcon_atomic_flush_pending_con(con, prb_next_reserve_seq(prb), false); + __nbcon_atomic_flush_pending_con(con, prb_next_reserve_seq(prb)); } else if (ft.legacy_direct) { if (console_trylock()) console_unlock(); } else if (ft.legacy_offload) { - printk_trigger_flush(); + defer_console_output(); } } console_srcu_read_unlock(cookie); } EXPORT_SYMBOL_GPL(nbcon_device_release); + +/** + * nbcon_kdb_try_acquire - Try to acquire nbcon console and enter unsafe + * section + * @con: The nbcon console to acquire + * @wctxt: The nbcon write context to be used on success + * + * Context: Under console_srcu_read_lock() for emitting a single kdb message + * using the given con->write_atomic() callback. Can be called + * only when the console is usable at the moment. + * + * Return: True if the console was acquired. False otherwise. + * + * kdb emits messages on consoles registered for printk() without + * storing them into the ring buffer. It has to acquire the console + * ownerhip so that it could call con->write_atomic() callback a safe way. + * + * This function acquires the nbcon console using priority NBCON_PRIO_EMERGENCY + * and marks it unsafe for handover/takeover. 
+ */ +bool nbcon_kdb_try_acquire(struct console *con, + struct nbcon_write_context *wctxt) +{ + struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + + memset(ctxt, 0, sizeof(*ctxt)); + ctxt->console = con; + ctxt->prio = NBCON_PRIO_EMERGENCY; + + if (!nbcon_context_try_acquire(ctxt, false)) + return false; + + if (!nbcon_context_enter_unsafe(ctxt)) + return false; + + return true; +} + +/** + * nbcon_kdb_release - Exit unsafe section and release the nbcon console + * + * @wctxt: The nbcon write context initialized by a successful + * nbcon_kdb_try_acquire() + */ +void nbcon_kdb_release(struct nbcon_write_context *wctxt) +{ + struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + + if (!nbcon_context_exit_unsafe(ctxt)) + return; + + nbcon_context_release(ctxt); + + /* + * Flush any new printk() messages added when the console was blocked. + * Only the console used by the given write context was blocked. + * The console was locked only when the write_atomic() callback + * was usable. + */ + __nbcon_atomic_flush_pending_con(ctxt->console, prb_next_reserve_seq(prb)); +} diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5aee9ffb16b9..1d765ad242b8 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -462,6 +462,9 @@ bool have_boot_console; /* See printk_legacy_allow_panic_sync() for details. */ bool legacy_allow_panic_sync; +/* Avoid using irq_work when suspending. */ +bool console_irqwork_blocked; + #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); static DECLARE_WAIT_QUEUE_HEAD(legacy_wait); @@ -2390,7 +2393,7 @@ asmlinkage int vprintk_emit(int facility, int level, /* If called from the scheduler, we can not call up(). 
*/ if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; - ft.legacy_offload |= ft.legacy_direct; + ft.legacy_offload |= ft.legacy_direct && !console_irqwork_blocked; ft.legacy_direct = false; } @@ -2426,7 +2429,7 @@ asmlinkage int vprintk_emit(int facility, int level, if (ft.legacy_offload) defer_console_output(); - else + else if (!console_irqwork_blocked) wake_up_klogd(); return printed_len; @@ -2730,10 +2733,20 @@ void console_suspend_all(void) { struct console *con; + if (console_suspend_enabled) + pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); + + /* + * Flush any console backlog and then avoid queueing irq_work until + * console_resume_all(). Until then deferred printing is no longer + * triggered, NBCON consoles transition to atomic flushing, and + * any klogd waiters are not triggered. + */ + pr_flush(1000, true); + console_irqwork_blocked = true; + if (!console_suspend_enabled) return; - pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); - pr_flush(1000, true); console_list_lock(); for_each_console(con) @@ -2754,26 +2767,34 @@ void console_resume_all(void) struct console_flush_type ft; struct console *con; - if (!console_suspend_enabled) - return; - - console_list_lock(); - for_each_console(con) - console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED); - console_list_unlock(); - /* - * Ensure that all SRCU list walks have completed. All printing - * contexts must be able to see they are no longer suspended so - * that they are guaranteed to wake up and resume printing. + * Allow queueing irq_work. After restoring console state, deferred + * printing and any klogd waiters need to be triggered in case there + * is now a console backlog. 
*/ - synchronize_srcu(&console_srcu); + console_irqwork_blocked = false; + + if (console_suspend_enabled) { + console_list_lock(); + for_each_console(con) + console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED); + console_list_unlock(); + + /* + * Ensure that all SRCU list walks have completed. All printing + * contexts must be able to see they are no longer suspended so + * that they are guaranteed to wake up and resume printing. + */ + synchronize_srcu(&console_srcu); + } printk_get_console_flush_type(&ft); if (ft.nbcon_offload) nbcon_kthreads_wake(); if (ft.legacy_offload) defer_console_output(); + else + wake_up_klogd(); pr_flush(1000, true); } @@ -3002,21 +3023,18 @@ out: } /* - * Legacy console printing from printk() caller context does not respect - * raw_spinlock/spinlock nesting. For !PREEMPT_RT the lockdep warning is a - * false positive. For PREEMPT_RT the false positive condition does not - * occur. - * - * This map is used to temporarily establish LD_WAIT_SLEEP context for the - * console write() callback when legacy printing to avoid false positive - * lockdep complaints, thus allowing lockdep to continue to function for - * real issues. + * The legacy console always acquires a spinlock_t from its printing + * callback. This violates lock nesting if the caller acquired an always + * spinning lock (raw_spinlock_t) while invoking printk(). This is not a + * problem on PREEMPT_RT because legacy consoles print always from a + * dedicated thread and never from within printk(). Therefore we tell + * lockdep that a sleeping spin lock (spinlock_t) is valid here. 
*/ #ifdef CONFIG_PREEMPT_RT static inline void printk_legacy_allow_spinlock_enter(void) { } static inline void printk_legacy_allow_spinlock_exit(void) { } #else -static DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_SLEEP); +static DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_CONFIG); static inline void printk_legacy_allow_spinlock_enter(void) { @@ -3134,104 +3152,147 @@ static inline void printk_kthreads_check_locked(void) { } #endif /* CONFIG_PRINTK */ + /* - * Print out all remaining records to all consoles. + * Print out one record for each console. * * @do_cond_resched is set by the caller. It can be true only in schedulable * context. * * @next_seq is set to the sequence number after the last available record. - * The value is valid only when this function returns true. It means that all - * usable consoles are completely flushed. + * The value is valid only when all usable consoles were flushed. It is + * when the function returns true (can do the job) and @try_again parameter + * is set to false, see below. * * @handover will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding the * console_lock. Otherwise it is set to false. * - * Returns true when there was at least one usable console and all messages - * were flushed to all usable consoles. A returned false informs the caller - * that everything was not flushed (either there were no usable consoles or - * another context has taken over printing or it is a panic situation and this - * is not the panic CPU). Regardless the reason, the caller should assume it - * is not useful to immediately try again. + * @try_again will be set to true when it still makes sense to call this + * function again. The function could do the job, see the return value. + * And some consoles still make progress. + * + * Returns true when the function could do the job. Some consoles are usable, + * and there was no takeover and no panic_on_other_cpu(). 
* * Requires the console_lock. */ -static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover) +static bool console_flush_one_record(bool do_cond_resched, u64 *next_seq, bool *handover, + bool *try_again) { struct console_flush_type ft; bool any_usable = false; struct console *con; - bool any_progress; int cookie; - *next_seq = 0; - *handover = false; + *try_again = false; - do { - any_progress = false; + printk_get_console_flush_type(&ft); - printk_get_console_flush_type(&ft); + cookie = console_srcu_read_lock(); + for_each_console_srcu(con) { + short flags = console_srcu_read_flags(con); + u64 printk_seq; + bool progress; - cookie = console_srcu_read_lock(); - for_each_console_srcu(con) { - short flags = console_srcu_read_flags(con); - u64 printk_seq; - bool progress; + /* + * console_flush_one_record() is only responsible for + * nbcon consoles when the nbcon consoles cannot print via + * their atomic or threaded flushing. + */ + if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload)) + continue; - /* - * console_flush_all() is only responsible for nbcon - * consoles when the nbcon consoles cannot print via - * their atomic or threaded flushing. 
- */ - if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload)) - continue; + if (!console_is_usable(con, flags, !do_cond_resched)) + continue; + any_usable = true; - if (!console_is_usable(con, flags, !do_cond_resched)) - continue; - any_usable = true; + if (flags & CON_NBCON) { + progress = nbcon_legacy_emit_next_record(con, handover, cookie, + !do_cond_resched); + printk_seq = nbcon_seq_read(con); + } else { + progress = console_emit_next_record(con, handover, cookie); + printk_seq = con->seq; + } - if (flags & CON_NBCON) { - progress = nbcon_legacy_emit_next_record(con, handover, cookie, - !do_cond_resched); - printk_seq = nbcon_seq_read(con); - } else { - progress = console_emit_next_record(con, handover, cookie); - printk_seq = con->seq; - } + /* + * If a handover has occurred, the SRCU read lock + * is already released. + */ + if (*handover) + goto fail; - /* - * If a handover has occurred, the SRCU read lock - * is already released. - */ - if (*handover) - return false; + /* Track the next of the highest seq flushed. */ + if (printk_seq > *next_seq) + *next_seq = printk_seq; - /* Track the next of the highest seq flushed. */ - if (printk_seq > *next_seq) - *next_seq = printk_seq; + if (!progress) + continue; - if (!progress) - continue; - any_progress = true; + /* + * An usable console made a progress. There might still be + * pending messages. + */ + *try_again = true; - /* Allow panic_cpu to take over the consoles safely. */ - if (panic_on_other_cpu()) - goto abandon; + /* Allow panic_cpu to take over the consoles safely. */ + if (panic_on_other_cpu()) + goto fail_srcu; - if (do_cond_resched) - cond_resched(); - } - console_srcu_read_unlock(cookie); - } while (any_progress); + if (do_cond_resched) + cond_resched(); + } + console_srcu_read_unlock(cookie); return any_usable; -abandon: +fail_srcu: console_srcu_read_unlock(cookie); +fail: + *try_again = false; return false; } +/* + * Print out all remaining records to all consoles. 
+ * + * @do_cond_resched is set by the caller. It can be true only in schedulable + * context. + * + * @next_seq is set to the sequence number after the last available record. + * The value is valid only when this function returns true. It means that all + * usable consoles are completely flushed. + * + * @handover will be set to true if a printk waiter has taken over the + * console_lock, in which case the caller is no longer holding the + * console_lock. Otherwise it is set to false. + * + * Returns true when there was at least one usable console and all messages + * were flushed to all usable consoles. A returned false informs the caller + * that everything was not flushed (either there were no usable consoles or + * another context has taken over printing or it is a panic situation and this + * is not the panic CPU). Regardless the reason, the caller should assume it + * is not useful to immediately try again. + * + * Requires the console_lock. + */ +static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover) +{ + bool try_again; + bool ret; + + *next_seq = 0; + *handover = false; + + do { + ret = console_flush_one_record(do_cond_resched, next_seq, + handover, &try_again); + } while (try_again); + + return ret; +} + static void __console_flush_and_unlock(void) { bool do_cond_resched; @@ -3331,12 +3392,10 @@ void console_unblank(void) */ cookie = console_srcu_read_lock(); for_each_console_srcu(c) { - short flags = console_srcu_read_flags(c); - - if (flags & CON_SUSPENDED) + if (!console_is_usable(c, console_srcu_read_flags(c), true)) continue; - if ((flags & CON_ENABLED) && c->unblank) { + if (c->unblank) { found_unblank = true; break; } @@ -3373,12 +3432,10 @@ void console_unblank(void) cookie = console_srcu_read_lock(); for_each_console_srcu(c) { - short flags = console_srcu_read_flags(c); - - if (flags & CON_SUSPENDED) + if (!console_is_usable(c, console_srcu_read_flags(c), true)) continue; - if ((flags & CON_ENABLED) && c->unblank) + 
if (c->unblank) c->unblank(); } console_srcu_read_unlock(cookie); @@ -3601,17 +3658,26 @@ static bool legacy_kthread_should_wakeup(void) static int legacy_kthread_func(void *unused) { - for (;;) { - wait_event_interruptible(legacy_wait, legacy_kthread_should_wakeup()); + bool try_again; + +wait_for_event: + wait_event_interruptible(legacy_wait, legacy_kthread_should_wakeup()); + + do { + bool handover = false; + u64 next_seq = 0; if (kthread_should_stop()) - break; + return 0; console_lock(); - __console_flush_and_unlock(); - } + console_flush_one_record(true, &next_seq, &handover, &try_again); + if (!handover) + __console_unlock(); - return 0; + } while (try_again); + + goto wait_for_event; } static bool legacy_kthread_create(void) @@ -3639,12 +3705,13 @@ static bool legacy_kthread_create(void) /** * printk_kthreads_shutdown - shutdown all threaded printers + * @data: syscore context * * On system shutdown all threaded printers are stopped. This allows printk * to transition back to atomic printing, thus providing a robust mechanism * for the final shutdown/reboot messages to be output. */ -static void printk_kthreads_shutdown(void) +static void printk_kthreads_shutdown(void *data) { struct console *con; @@ -3666,10 +3733,14 @@ static void printk_kthreads_shutdown(void) console_list_unlock(); } -static struct syscore_ops printk_syscore_ops = { +static const struct syscore_ops printk_syscore_ops = { .shutdown = printk_kthreads_shutdown, }; +static struct syscore printk_syscore = { + .ops = &printk_syscore_ops, +}; + /* * If appropriate, start nbcon kthreads and set @printk_kthreads_running. * If any kthreads fail to start, those consoles are unregistered. 
@@ -3737,7 +3808,7 @@ static void printk_kthreads_check_locked(void) static int __init printk_set_kthreads_ready(void) { - register_syscore_ops(&printk_syscore_ops); + register_syscore(&printk_syscore); console_list_lock(); printk_kthreads_ready = true; @@ -4511,6 +4582,13 @@ static void __wake_up_klogd(int val) if (!printk_percpu_data_ready()) return; + /* + * It is not allowed to call this function when console irq_work + * is blocked. + */ + if (WARN_ON_ONCE(console_irqwork_blocked)) + return; + preempt_disable(); /* * Guarantee any new records can be seen by tasks preparing to wait @@ -4567,9 +4645,30 @@ void defer_console_output(void) __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT); } +/** + * printk_trigger_flush - Attempt to flush printk buffer to consoles. + * + * If possible, flush the printk buffer to all consoles in the caller's + * context. If offloading is available, trigger deferred printing. + * + * This is best effort. Depending on the system state, console states, + * and caller context, no actual flushing may result from this call. + */ void printk_trigger_flush(void) { - defer_console_output(); + struct console_flush_type ft; + + printk_get_console_flush_type(&ft); + if (ft.nbcon_atomic) + nbcon_atomic_flush_pending(); + if (ft.nbcon_offload) + nbcon_kthreads_wake(); + if (ft.legacy_direct) { + if (console_trylock()) + console_unlock(); + } + if (ft.legacy_offload) + defer_console_output(); } int vprintk_deferred(const char *fmt, va_list args) diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 40198bffb7d0..56c8e3d031f4 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -411,6 +411,23 @@ static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size) return to_blk_size(size) <= DATA_SIZE(data_ring) / 2; } +/* + * Compare the current and requested logical position and decide + * whether more space is needed. 
+ * + * Return false when @lpos_current is already at or beyond @lpos_target. + * + * Also return false when the difference between the positions is bigger + * than the size of the data buffer. It might happen only when the caller + * raced with another CPU(s) which already made and used the space. + */ +static bool need_more_space(struct prb_data_ring *data_ring, + unsigned long lpos_current, + unsigned long lpos_target) +{ + return lpos_target - lpos_current - 1 < DATA_SIZE(data_ring); +} + /* Query the state of a descriptor. */ static enum desc_state get_desc_state(unsigned long id, unsigned long state_val) @@ -577,7 +594,7 @@ static bool data_make_reusable(struct printk_ringbuffer *rb, unsigned long id; /* Loop until @lpos_begin has advanced to or beyond @lpos_end. */ - while ((lpos_end - lpos_begin) - 1 < DATA_SIZE(data_ring)) { + while (need_more_space(data_ring, lpos_begin, lpos_end)) { blk = to_block(data_ring, lpos_begin); /* @@ -668,7 +685,7 @@ static bool data_push_tail(struct printk_ringbuffer *rb, unsigned long lpos) * sees the new tail lpos, any descriptor states that transitioned to * the reusable state must already be visible. */ - while ((lpos - tail_lpos) - 1 < DATA_SIZE(data_ring)) { + while (need_more_space(data_ring, tail_lpos, lpos)) { /* * Make all descriptors reusable that are associated with * data blocks before @lpos. @@ -999,6 +1016,17 @@ static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out) return true; } +static bool is_blk_wrapped(struct prb_data_ring *data_ring, + unsigned long begin_lpos, unsigned long next_lpos) +{ + /* + * Subtract one from next_lpos since it's not actually part of this data + * block. This allows perfectly fitting records to not wrap. + */ + return DATA_WRAPS(data_ring, begin_lpos) != + DATA_WRAPS(data_ring, next_lpos - 1); +} + /* Determine the end of a data block. 
*/ static unsigned long get_next_lpos(struct prb_data_ring *data_ring, unsigned long lpos, unsigned int size) @@ -1010,7 +1038,7 @@ static unsigned long get_next_lpos(struct prb_data_ring *data_ring, next_lpos = lpos + size; /* First check if the data block does not wrap. */ - if (DATA_WRAPS(data_ring, begin_lpos) == DATA_WRAPS(data_ring, next_lpos)) + if (!is_blk_wrapped(data_ring, begin_lpos, next_lpos)) return next_lpos; /* Wrapping data blocks store their data at the beginning. */ @@ -1087,7 +1115,7 @@ static char *data_alloc(struct printk_ringbuffer *rb, unsigned int size, blk = to_block(data_ring, begin_lpos); blk->id = id; /* LMM(data_alloc:B) */ - if (DATA_WRAPS(data_ring, begin_lpos) != DATA_WRAPS(data_ring, next_lpos)) { + if (is_blk_wrapped(data_ring, begin_lpos, next_lpos)) { /* Wrapping data blocks store their data at the beginning. */ blk = to_block(data_ring, 0); @@ -1131,14 +1159,21 @@ static char *data_realloc(struct printk_ringbuffer *rb, unsigned int size, return NULL; /* Keep track if @blk_lpos was a wrapping data block. */ - wrapped = (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, blk_lpos->next)); + wrapped = is_blk_wrapped(data_ring, blk_lpos->begin, blk_lpos->next); size = to_blk_size(size); next_lpos = get_next_lpos(data_ring, blk_lpos->begin, size); - /* If the data block does not increase, there is nothing to do. */ - if (head_lpos - next_lpos < DATA_SIZE(data_ring)) { + /* + * Use the current data block when the size does not increase, i.e. + * when @head_lpos is already able to accommodate the new @next_lpos. + * + * Note that need_more_space() could never return false here because + * the difference between the positions was bigger than the data + * buffer size. The data block is reopened and can't get reused. 
+ */ + if (!need_more_space(data_ring, head_lpos, next_lpos)) { if (wrapped) blk = to_block(data_ring, 0); else @@ -1167,7 +1202,7 @@ static char *data_realloc(struct printk_ringbuffer *rb, unsigned int size, blk = to_block(data_ring, blk_lpos->begin); - if (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, next_lpos)) { + if (is_blk_wrapped(data_ring, blk_lpos->begin, next_lpos)) { struct prb_data_block *old_blk = blk; /* Wrapping data blocks store their data at the beginning. */ @@ -1203,7 +1238,7 @@ static unsigned int space_used(struct prb_data_ring *data_ring, if (BLK_DATALESS(blk_lpos)) return 0; - if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next)) { + if (!is_blk_wrapped(data_ring, blk_lpos->begin, blk_lpos->next)) { /* Data block does not wrap. */ return (DATA_INDEX(data_ring, blk_lpos->next) - DATA_INDEX(data_ring, blk_lpos->begin)); @@ -1249,15 +1284,15 @@ static const char *get_data(struct prb_data_ring *data_ring, return NULL; } - /* Regular data block: @begin less than @next and in same wrap. */ - if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next) && - blk_lpos->begin < blk_lpos->next) { + /* Regular data block: @begin and @next in the same wrap. */ + if (!is_blk_wrapped(data_ring, blk_lpos->begin, blk_lpos->next)) { db = to_block(data_ring, blk_lpos->begin); *data_size = blk_lpos->next - blk_lpos->begin; /* Wrapping data block: @begin is one wrap behind @next. */ - } else if (DATA_WRAPS(data_ring, blk_lpos->begin + DATA_SIZE(data_ring)) == - DATA_WRAPS(data_ring, blk_lpos->next)) { + } else if (!is_blk_wrapped(data_ring, + blk_lpos->begin + DATA_SIZE(data_ring), + blk_lpos->next)) { db = to_block(data_ring, 0); *data_size = DATA_INDEX(data_ring, blk_lpos->next); @@ -1267,6 +1302,10 @@ static const char *get_data(struct prb_data_ring *data_ring, return NULL; } + /* Sanity check. Data-less blocks were handled earlier. 
*/ + if (WARN_ON_ONCE(!data_check_size(data_ring, *data_size) || !*data_size)) + return NULL; + /* A valid data block will always be aligned to the ID size. */ if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))) || WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id)))) { diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 12e4c64ebae1..625d75392647 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -213,4 +213,19 @@ config RCU_STRICT_GRACE_PERIOD when looking for certain types of RCU usage bugs, for example, too-short RCU read-side critical sections. + +config RCU_DYNTICKS_TORTURE + bool "Minimize RCU dynticks counter size" + depends on RCU_EXPERT && !COMPILE_TEST + default n + help + This option sets the width of the dynticks counter to its + minimum usable value. This minimum width greatly increases + the probability of flushing out bugs involving counter wrap, + but it also increases the probability of extending grace period + durations. This Kconfig option should therefore be avoided in + production due to the consequent increased probability of OOMs. + + This has no value for production and is only for testing. 
+ endmenu # "RCU Debugging" diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 29fe3c01312f..07e51974b06b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -389,6 +389,7 @@ struct rcu_torture_ops { void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); + void (*exp_current)(void); unsigned long (*get_gp_state_exp)(void); unsigned long (*start_gp_poll_exp)(void); void (*start_gp_poll_exp_full)(struct rcu_gp_oldstate *rgosp); @@ -691,10 +692,29 @@ static struct rcu_torture_ops rcu_busted_ops = { */ DEFINE_STATIC_SRCU(srcu_ctl); +DEFINE_STATIC_SRCU_FAST(srcu_ctlf); +DEFINE_STATIC_SRCU_FAST_UPDOWN(srcu_ctlfud); static struct srcu_struct srcu_ctld; static struct srcu_struct *srcu_ctlp = &srcu_ctl; static struct rcu_torture_ops srcud_ops; +static void srcu_torture_init(void) +{ + rcu_sync_torture_init(); + if (!reader_flavor || (reader_flavor & SRCU_READ_FLAVOR_NORMAL)) + VERBOSE_TOROUT_STRING("srcu_torture_init normal SRCU"); + if (reader_flavor & SRCU_READ_FLAVOR_NMI) + VERBOSE_TOROUT_STRING("srcu_torture_init NMI-safe SRCU"); + if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + srcu_ctlp = &srcu_ctlf; + VERBOSE_TOROUT_STRING("srcu_torture_init fast SRCU"); + } + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) { + srcu_ctlp = &srcu_ctlfud; + VERBOSE_TOROUT_STRING("srcu_torture_init fast-up/down SRCU"); + } +} + static void srcu_get_gp_data(int *flags, unsigned long *gp_seq) { srcutorture_get_gp_data(srcu_ctlp, flags, gp_seq); @@ -722,6 +742,12 @@ static int srcu_torture_read_lock(void) scp = srcu_read_lock_fast(srcu_ctlp); idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); WARN_ON_ONCE(idx & ~0x1); + ret += idx << 2; + } + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) { + scp = srcu_read_lock_fast_updown(srcu_ctlp); + idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); + WARN_ON_ONCE(idx & ~0x1); ret += idx << 3; } return ret; @@ -749,8 +775,11 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg 
*rtrsp) static void srcu_torture_read_unlock(int idx) { WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) + srcu_read_unlock_fast_updown(srcu_ctlp, + __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); if (reader_flavor & SRCU_READ_FLAVOR_FAST) - srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); + srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x4) >> 2)); if (reader_flavor & SRCU_READ_FLAVOR_NMI) srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1); if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) @@ -784,7 +813,7 @@ static int srcu_torture_down_read(void) WARN_ON_ONCE(idx & ~0x1); return idx; } - if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) { scp = srcu_down_read_fast(srcu_ctlp); idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); WARN_ON_ONCE(idx & ~0x1); @@ -797,7 +826,7 @@ static int srcu_torture_down_read(void) static void srcu_torture_up_read(int idx) { WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); - if (reader_flavor & SRCU_READ_FLAVOR_FAST) + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) srcu_up_read_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); else if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) @@ -857,9 +886,14 @@ static void srcu_torture_synchronize_expedited(void) synchronize_srcu_expedited(srcu_ctlp); } +static void srcu_torture_expedite_current(void) +{ + srcu_expedite_current(srcu_ctlp); +} + static struct rcu_torture_ops srcu_ops = { .ttype = SRCU_FLAVOR, - .init = rcu_sync_torture_init, + .init = srcu_torture_init, .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, @@ -871,6 +905,7 @@ static struct rcu_torture_ops srcu_ops = { .deferred_free = 
srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, + .exp_current = srcu_torture_expedite_current, .same_gp_state = same_state_synchronize_srcu, .get_comp_state = get_completed_synchronize_srcu, .get_gp_state = srcu_torture_get_gp_state, @@ -886,14 +921,28 @@ static struct rcu_torture_ops srcu_ops = { .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .debug_objects = 1, .have_up_down = IS_ENABLED(CONFIG_TINY_SRCU) - ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST, + ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST_UPDOWN, .name = "srcu" }; -static void srcu_torture_init(void) +static void srcud_torture_init(void) { rcu_sync_torture_init(); - WARN_ON(init_srcu_struct(&srcu_ctld)); + if (!reader_flavor || (reader_flavor & SRCU_READ_FLAVOR_NORMAL)) { + WARN_ON(init_srcu_struct(&srcu_ctld)); + VERBOSE_TOROUT_STRING("srcud_torture_init normal SRCU"); + } else if (reader_flavor & SRCU_READ_FLAVOR_NMI) { + WARN_ON(init_srcu_struct(&srcu_ctld)); + VERBOSE_TOROUT_STRING("srcud_torture_init NMI-safe SRCU"); + } else if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + WARN_ON(init_srcu_struct_fast(&srcu_ctld)); + VERBOSE_TOROUT_STRING("srcud_torture_init fast SRCU"); + } else if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) { + WARN_ON(init_srcu_struct_fast_updown(&srcu_ctld)); + VERBOSE_TOROUT_STRING("srcud_torture_init fast-up/down SRCU"); + } else { + WARN_ON(init_srcu_struct(&srcu_ctld)); + } srcu_ctlp = &srcu_ctld; } @@ -906,7 +955,7 @@ static void srcu_torture_cleanup(void) /* As above, but dynamically allocated. 
*/ static struct rcu_torture_ops srcud_ops = { .ttype = SRCU_FLAVOR, - .init = srcu_torture_init, + .init = srcud_torture_init, .cleanup = srcu_torture_cleanup, .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, @@ -919,6 +968,7 @@ static struct rcu_torture_ops srcud_ops = { .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, + .exp_current = srcu_torture_expedite_current, .same_gp_state = same_state_synchronize_srcu, .get_comp_state = get_completed_synchronize_srcu, .get_gp_state = srcu_torture_get_gp_state, @@ -934,7 +984,7 @@ static struct rcu_torture_ops srcud_ops = { .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .debug_objects = 1, .have_up_down = IS_ENABLED(CONFIG_TINY_SRCU) - ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST, + ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST_UPDOWN, .name = "srcud" }; @@ -1700,6 +1750,8 @@ rcu_torture_writer(void *arg) ulo[i] = cur_ops->get_comp_state(); gp_snap = cur_ops->start_gp_poll(); rcu_torture_writer_state = RTWS_POLL_WAIT; + if (cur_ops->exp_current && !(torture_random(&rand) % 0xff)) + cur_ops->exp_current(); while (!cur_ops->poll_gp_state(gp_snap)) { gp_snap1 = cur_ops->get_gp_state(); for (i = 0; i < ulo_size; i++) @@ -1720,6 +1772,8 @@ rcu_torture_writer(void *arg) cur_ops->get_comp_state_full(&rgo[i]); cur_ops->start_gp_poll_full(&gp_snap_full); rcu_torture_writer_state = RTWS_POLL_WAIT_FULL; + if (cur_ops->exp_current && !(torture_random(&rand) % 0xff)) + cur_ops->exp_current(); while (!cur_ops->poll_gp_state_full(&gp_snap_full)) { cur_ops->get_gp_state_full(&gp_snap1_full); for (i = 0; i < rgo_size; i++) @@ -2384,10 +2438,8 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) newstate = rcutorture_extend_mask(rtors.readstate, trsp); WARN_ON_ONCE(newstate & RCUTORTURE_RDR_UPDOWN); rcutorture_one_extend(&rtors.readstate, newstate, trsp, rtors.rtrsp++); - if
(!rcu_torture_one_read_start(&rtors, trsp, myid)) { - rcutorture_one_extend(&rtors.readstate, 0, trsp, rtors.rtrsp); + if (!rcu_torture_one_read_start(&rtors, trsp, myid)) return false; - } rtors.rtrsp = rcutorture_loop_extend(&rtors.readstate, trsp, rtors.rtrsp); rcu_torture_one_read_end(&rtors, trsp); return true; diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 19841704d8f5..07a313782dfd 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -136,6 +136,7 @@ struct ref_scale_ops { void (*cleanup)(void); void (*readsection)(const int nloops); void (*delaysection)(const int nloops, const int udl, const int ndl); + bool enable_irqs; const char *name; }; @@ -184,6 +185,8 @@ static const struct ref_scale_ops rcu_ops = { // Definitions for SRCU ref scale testing. DEFINE_STATIC_SRCU(srcu_refctl_scale); +DEFINE_STATIC_SRCU_FAST(srcu_fast_refctl_scale); +DEFINE_STATIC_SRCU_FAST_UPDOWN(srcu_fast_updown_refctl_scale); static struct srcu_struct *srcu_ctlp = &srcu_refctl_scale; static void srcu_ref_scale_read_section(const int nloops) @@ -216,6 +219,12 @@ static const struct ref_scale_ops srcu_ops = { .name = "srcu" }; +static bool srcu_fast_sync_scale_init(void) +{ + srcu_ctlp = &srcu_fast_refctl_scale; + return true; +} + static void srcu_fast_ref_scale_read_section(const int nloops) { int i; @@ -240,12 +249,48 @@ static void srcu_fast_ref_scale_delay_section(const int nloops, const int udl, c } static const struct ref_scale_ops srcu_fast_ops = { - .init = rcu_sync_scale_init, + .init = srcu_fast_sync_scale_init, .readsection = srcu_fast_ref_scale_read_section, .delaysection = srcu_fast_ref_scale_delay_section, .name = "srcu-fast" }; +static bool srcu_fast_updown_sync_scale_init(void) +{ + srcu_ctlp = &srcu_fast_updown_refctl_scale; + return true; +} + +static void srcu_fast_updown_ref_scale_read_section(const int nloops) +{ + int i; + struct srcu_ctr __percpu *scp; + + for (i = nloops; i >= 0; i--) { + scp = 
srcu_read_lock_fast_updown(srcu_ctlp); + srcu_read_unlock_fast_updown(srcu_ctlp, scp); + } +} + +static void srcu_fast_updown_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + struct srcu_ctr __percpu *scp; + + for (i = nloops; i >= 0; i--) { + scp = srcu_read_lock_fast_updown(srcu_ctlp); + un_delay(udl, ndl); + srcu_read_unlock_fast_updown(srcu_ctlp, scp); + } +} + +static const struct ref_scale_ops srcu_fast_updown_ops = { + .init = srcu_fast_updown_sync_scale_init, + .readsection = srcu_fast_updown_ref_scale_read_section, + .delaysection = srcu_fast_updown_ref_scale_delay_section, + .name = "srcu-fast-updown" +}; + #ifdef CONFIG_TASKS_RCU // Definitions for RCU Tasks ref scale testing: Empty read markers. @@ -323,6 +368,9 @@ static const struct ref_scale_ops rcu_trace_ops = { // Definitions for reference count static atomic_t refcnt; +// Definitions acquire-release. +static DEFINE_PER_CPU(unsigned long, test_acqrel); + static void ref_refcnt_section(const int nloops) { int i; @@ -351,6 +399,184 @@ static const struct ref_scale_ops refcnt_ops = { .name = "refcnt" }; +static void ref_percpuinc_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + this_cpu_inc(test_acqrel); + this_cpu_dec(test_acqrel); + } +} + +static void ref_percpuinc_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + this_cpu_inc(test_acqrel); + un_delay(udl, ndl); + this_cpu_dec(test_acqrel); + } +} + +static const struct ref_scale_ops percpuinc_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_percpuinc_section, + .delaysection = ref_percpuinc_delay_section, + .name = "percpuinc" +}; + +// Note that this can lose counts in preemptible kernels. 
+static void ref_incpercpu_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap = this_cpu_ptr(&test_acqrel); + + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + } +} + +static void ref_incpercpu_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap = this_cpu_ptr(&test_acqrel); + + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + un_delay(udl, ndl); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + } +} + +static const struct ref_scale_ops incpercpu_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_incpercpu_section, + .delaysection = ref_incpercpu_delay_section, + .name = "incpercpu" +}; + +static void ref_incpercpupreempt_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + preempt_disable(); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + preempt_enable(); + } +} + +static void ref_incpercpupreempt_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + preempt_disable(); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + un_delay(udl, ndl); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + preempt_enable(); + } +} + +static const struct ref_scale_ops incpercpupreempt_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_incpercpupreempt_section, + .delaysection = ref_incpercpupreempt_delay_section, + .name = "incpercpupreempt" +}; + +static void ref_incpercpubh_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + local_bh_disable(); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + local_bh_enable(); + } +} + +static void ref_incpercpubh_delay_section(const int nloops, const int 
udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + local_bh_disable(); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + un_delay(udl, ndl); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + local_bh_enable(); + } +} + +static const struct ref_scale_ops incpercpubh_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_incpercpubh_section, + .delaysection = ref_incpercpubh_delay_section, + .enable_irqs = true, + .name = "incpercpubh" +}; + +static void ref_incpercpuirqsave_section(const int nloops) +{ + int i; + unsigned long flags; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + local_irq_save(flags); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + local_irq_restore(flags); + } +} + +static void ref_incpercpuirqsave_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + unsigned long flags; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + local_irq_save(flags); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + un_delay(udl, ndl); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + local_irq_restore(flags); + } +} + +static const struct ref_scale_ops incpercpuirqsave_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_incpercpuirqsave_section, + .delaysection = ref_incpercpuirqsave_delay_section, + .name = "incpercpuirqsave" +}; + // Definitions for rwlock static rwlock_t test_rwlock; @@ -494,9 +720,6 @@ static const struct ref_scale_ops lock_irq_ops = { .name = "lock-irq" }; -// Definitions acquire-release. 
-static DEFINE_PER_CPU(unsigned long, test_acqrel); - static void ref_acqrel_section(const int nloops) { unsigned long x; @@ -629,6 +852,133 @@ static const struct ref_scale_ops jiffies_ops = { .name = "jiffies" }; +static void ref_preempt_section(const int nloops) +{ + int i; + + migrate_disable(); + for (i = nloops; i >= 0; i--) { + preempt_disable(); + preempt_enable(); + } + migrate_enable(); +} + +static void ref_preempt_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + migrate_disable(); + for (i = nloops; i >= 0; i--) { + preempt_disable(); + un_delay(udl, ndl); + preempt_enable(); + } + migrate_enable(); +} + +static const struct ref_scale_ops preempt_ops = { + .readsection = ref_preempt_section, + .delaysection = ref_preempt_delay_section, + .name = "preempt" +}; + +static void ref_bh_section(const int nloops) +{ + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_bh_disable(); + local_bh_enable(); + } + preempt_enable(); +} + +static void ref_bh_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_bh_disable(); + un_delay(udl, ndl); + local_bh_enable(); + } + preempt_enable(); +} + +static const struct ref_scale_ops bh_ops = { + .readsection = ref_bh_section, + .delaysection = ref_bh_delay_section, + .enable_irqs = true, + .name = "bh" +}; + +static void ref_irq_section(const int nloops) +{ + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_irq_disable(); + local_irq_enable(); + } + preempt_enable(); +} + +static void ref_irq_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_irq_disable(); + un_delay(udl, ndl); + local_irq_enable(); + } + preempt_enable(); +} + +static const struct ref_scale_ops irq_ops = { + .readsection = ref_irq_section, + .delaysection = ref_irq_delay_section, + .name = "irq" +}; + +static 
void ref_irqsave_section(const int nloops) +{ + unsigned long flags; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_irq_save(flags); + local_irq_restore(flags); + } + preempt_enable(); +} + +static void ref_irqsave_delay_section(const int nloops, const int udl, const int ndl) +{ + unsigned long flags; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_irq_save(flags); + un_delay(udl, ndl); + local_irq_restore(flags); + } + preempt_enable(); +} + +static const struct ref_scale_ops irqsave_ops = { + .readsection = ref_irqsave_section, + .delaysection = ref_irqsave_delay_section, + .name = "irqsave" +}; + //////////////////////////////////////////////////////////////////////// // // Methods leveraging SLAB_TYPESAFE_BY_RCU. @@ -924,15 +1274,18 @@ repeat: if (!atomic_dec_return(&n_warmedup)) while (atomic_read_acquire(&n_warmedup)) rcu_scale_one_reader(); - // Also keep interrupts disabled. This also has the effect - // of preventing entries into slow path for rcu_read_unlock(). - local_irq_save(flags); + // Also keep interrupts disabled when it is safe to do so, which + // it is not for local_bh_enable(). This also has the effect of + // preventing entries into slow path for rcu_read_unlock(). + if (!cur_ops->enable_irqs) + local_irq_save(flags); start = ktime_get_mono_fast_ns(); rcu_scale_one_reader(); duration = ktime_get_mono_fast_ns() - start; - local_irq_restore(flags); + if (!cur_ops->enable_irqs) + local_irq_restore(flags); rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 
0 : duration; // To reduce runtime-skew noise, do maintain-load invocations until @@ -1163,9 +1516,13 @@ ref_scale_init(void) long i; int firsterr = 0; static const struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcu_fast_ops, RCU_TRACE_OPS RCU_TASKS_OPS - &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, - &acqrel_ops, &sched_clock_ops, &clock_ops, &jiffies_ops, + &rcu_ops, &srcu_ops, &srcu_fast_ops, &srcu_fast_updown_ops, + RCU_TRACE_OPS RCU_TASKS_OPS + &refcnt_ops, &percpuinc_ops, &incpercpu_ops, &incpercpupreempt_ops, + &incpercpubh_ops, &incpercpuirqsave_ops, + &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, + &sched_clock_ops, &clock_ops, &jiffies_ops, + &preempt_ops, &bh_ops, &irq_ops, &irqsave_ops, &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, }; diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index e3b64a5e0ec7..3450c3751ef7 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -106,15 +106,15 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval); preempt_enable(); - if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task()) + if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task() && !irqs_disabled()) swake_up_one(&ssp->srcu_wq); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); /* * Workqueue handler to drive one grace period and invoke any callbacks - * that become ready as a result. Single-CPU and !PREEMPTION operation - * means that we get away with murder on synchronization. ;-) + * that become ready as a result. Single-CPU operation and preemption + * disabling mean that we get away with murder on synchronization. ;-) */ void srcu_drive_gp(struct work_struct *wp) { @@ -141,7 +141,12 @@ void srcu_drive_gp(struct work_struct *wp) WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! 
*/ preempt_enable(); - swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx])); + do { + // Deadlock issues prevent __srcu_read_unlock() from + // doing an unconditional wakeup, so polling is required. + swait_event_timeout_exclusive(ssp->srcu_wq, + !READ_ONCE(ssp->srcu_lock_nesting[idx]), HZ / 10); + } while (READ_ONCE(ssp->srcu_lock_nesting[idx])); preempt_disable(); // Needed for PREEMPT_LAZY WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 1ff94b76d91f..ea3f128de06f 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -286,32 +286,92 @@ err_free_sup: #ifdef CONFIG_DEBUG_LOCK_ALLOC -int __init_srcu_struct(struct srcu_struct *ssp, const char *name, - struct lock_class_key *key) +static int +__init_srcu_struct_common(struct srcu_struct *ssp, const char *name, struct lock_class_key *key) { /* Don't re-initialize a lock while it is held. 
*/ debug_check_no_locks_freed((void *)ssp, sizeof(*ssp)); lockdep_init_map(&ssp->dep_map, name, key, 0); return init_srcu_struct_fields(ssp, false); } + +int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key) +{ + ssp->srcu_reader_flavor = 0; + return __init_srcu_struct_common(ssp, name, key); +} EXPORT_SYMBOL_GPL(__init_srcu_struct); +int __init_srcu_struct_fast(struct srcu_struct *ssp, const char *name, struct lock_class_key *key) +{ + ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST; + return __init_srcu_struct_common(ssp, name, key); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct_fast); + +int __init_srcu_struct_fast_updown(struct srcu_struct *ssp, const char *name, + struct lock_class_key *key) +{ + ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST_UPDOWN; + return __init_srcu_struct_common(ssp, name, key); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct_fast_updown); + #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /** * init_srcu_struct - initialize a sleep-RCU structure * @ssp: structure to initialize. * - * Must invoke this on a given srcu_struct before passing that srcu_struct + * Use this in place of DEFINE_SRCU() and DEFINE_STATIC_SRCU() + * for non-static srcu_struct structures that are to be passed to + * srcu_read_lock(), srcu_read_lock_nmisafe(), and friends. It is necessary + * to invoke this on a given srcu_struct before passing that srcu_struct * to any other function. Each srcu_struct represents a separate domain * of SRCU protection. */ int init_srcu_struct(struct srcu_struct *ssp) { + ssp->srcu_reader_flavor = 0; return init_srcu_struct_fields(ssp, false); } EXPORT_SYMBOL_GPL(init_srcu_struct); +/** + * init_srcu_struct_fast - initialize a fast-reader sleep-RCU structure + * @ssp: structure to initialize. + * + * Use this in place of DEFINE_SRCU_FAST() and DEFINE_STATIC_SRCU_FAST() + * for non-static srcu_struct structures that are to be passed to + * srcu_read_lock_fast() and friends. 
It is necessary to invoke this on a + * given srcu_struct before passing that srcu_struct to any other function. + * Each srcu_struct represents a separate domain of SRCU protection. + */ +int init_srcu_struct_fast(struct srcu_struct *ssp) +{ + ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST; + return init_srcu_struct_fields(ssp, false); +} +EXPORT_SYMBOL_GPL(init_srcu_struct_fast); + +/** + * init_srcu_struct_fast_updown - initialize a fast-reader up/down sleep-RCU structure + * @ssp: structure to initialize. + * + * Use this function in place of DEFINE_SRCU_FAST_UPDOWN() and + * DEFINE_STATIC_SRCU_FAST_UPDOWN() for non-static srcu_struct + * structures that are to be passed to srcu_read_lock_fast_updown(), + * srcu_down_read_fast(), and friends. It is necessary to invoke this on a + * given srcu_struct before passing that srcu_struct to any other function. + * Each srcu_struct represents a separate domain of SRCU protection. + */ +int init_srcu_struct_fast_updown(struct srcu_struct *ssp) +{ + ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST_UPDOWN; + return init_srcu_struct_fields(ssp, false); +} +EXPORT_SYMBOL_GPL(init_srcu_struct_fast_updown); + #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* @@ -461,7 +521,7 @@ static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, uns static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, unsigned long *rdm) { int cpu; - unsigned long mask = 0; + unsigned long mask = ssp->srcu_reader_flavor; unsigned long sum = 0; for_each_possible_cpu(cpu) { @@ -734,6 +794,10 @@ void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) sdp = raw_cpu_ptr(ssp->sda); old_read_flavor = READ_ONCE(sdp->srcu_reader_flavor); + WARN_ON_ONCE(ssp->srcu_reader_flavor && read_flavor != ssp->srcu_reader_flavor); + WARN_ON_ONCE(old_read_flavor && ssp->srcu_reader_flavor && + old_read_flavor != ssp->srcu_reader_flavor); + WARN_ON_ONCE(read_flavor == SRCU_READ_FLAVOR_FAST && 
!ssp->srcu_reader_flavor); if (!old_read_flavor) { old_read_flavor = cmpxchg(&sdp->srcu_reader_flavor, 0, read_flavor); if (!old_read_flavor) @@ -1688,6 +1752,64 @@ void srcu_barrier(struct srcu_struct *ssp) } EXPORT_SYMBOL_GPL(srcu_barrier); +/* Callback for srcu_expedite_current() usage. */ +static void srcu_expedite_current_cb(struct rcu_head *rhp) +{ + unsigned long flags; + bool needcb = false; + struct srcu_data *sdp = container_of(rhp, struct srcu_data, srcu_ec_head); + + spin_lock_irqsave_sdp_contention(sdp, &flags); + if (sdp->srcu_ec_state == SRCU_EC_IDLE) { + WARN_ON_ONCE(1); + } else if (sdp->srcu_ec_state == SRCU_EC_PENDING) { + sdp->srcu_ec_state = SRCU_EC_IDLE; + } else { + WARN_ON_ONCE(sdp->srcu_ec_state != SRCU_EC_REPOST); + sdp->srcu_ec_state = SRCU_EC_PENDING; + needcb = true; + } + spin_unlock_irqrestore_rcu_node(sdp, flags); + // If needed, requeue ourselves as an expedited SRCU callback. + if (needcb) + __call_srcu(sdp->ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false); +} + +/** + * srcu_expedite_current - Expedite the current SRCU grace period + * @ssp: srcu_struct to expedite. + * + * Cause the current SRCU grace period to become expedited. The grace + * period following the current one might also be expedited. If there is + * no current grace period, one might be created. If the current grace + * period is currently sleeping, that sleep will complete before expediting + * will take effect. 
+ */ +void srcu_expedite_current(struct srcu_struct *ssp) +{ + unsigned long flags; + bool needcb = false; + struct srcu_data *sdp; + + migrate_disable(); + sdp = this_cpu_ptr(ssp->sda); + spin_lock_irqsave_sdp_contention(sdp, &flags); + if (sdp->srcu_ec_state == SRCU_EC_IDLE) { + sdp->srcu_ec_state = SRCU_EC_PENDING; + needcb = true; + } else if (sdp->srcu_ec_state == SRCU_EC_PENDING) { + sdp->srcu_ec_state = SRCU_EC_REPOST; + } else { + WARN_ON_ONCE(sdp->srcu_ec_state != SRCU_EC_REPOST); + } + spin_unlock_irqrestore_rcu_node(sdp, flags); + // If needed, queue an expedited SRCU callback. + if (needcb) + __call_srcu(ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false); + migrate_enable(); +} +EXPORT_SYMBOL_GPL(srcu_expedite_current); + /** * srcu_batches_completed - return batches completed. * @ssp: srcu_struct on which to report batch completion. diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 85b82a7007b9..293bbd9ac3f4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4017,7 +4017,7 @@ bool rcu_cpu_online(int cpu) * RCU on an offline processor during initial boot, hence the check for * rcu_scheduler_fully_active. */ -bool rcu_lockdep_current_cpu_online(void) +bool notrace rcu_lockdep_current_cpu_online(void) { struct rcu_data *rdp; bool ret = false; diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index c912b594ba98..dfeba9b35395 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -117,7 +117,7 @@ static bool rcu_read_lock_held_common(bool *ret) return false; } -int rcu_read_lock_sched_held(void) +int notrace rcu_read_lock_sched_held(void) { bool ret; @@ -342,7 +342,7 @@ EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); * Note that rcu_read_lock() is disallowed if the CPU is either idle or * offline from an RCU perspective, so check for those as well. 
*/ -int rcu_read_lock_held(void) +int notrace rcu_read_lock_held(void) { bool ret; @@ -367,7 +367,7 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_held); * Note that rcu_read_lock_bh() is disallowed if the CPU is either idle or * offline from an RCU perspective, so check for those as well. */ -int rcu_read_lock_bh_held(void) +int notrace rcu_read_lock_bh_held(void) { bool ret; @@ -377,7 +377,7 @@ int rcu_read_lock_bh_held(void) } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); -int rcu_read_lock_any_held(void) +int notrace rcu_read_lock_any_held(void) { bool ret; diff --git a/kernel/relay.c b/kernel/relay.c index 8d915fe98198..e36f6b926f7f 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -72,17 +72,18 @@ static void relay_free_page_array(struct page **array) } /** - * relay_mmap_buf: - mmap channel buffer to process address space - * @buf: relay channel buffer - * @vma: vm_area_struct describing memory to be mapped + * relay_mmap_prepare_buf: - mmap channel buffer to process address space + * @buf: the relay channel buffer + * @desc: describing what to map * * Returns 0 if ok, negative on error * * Caller should already have grabbed mmap_lock. 
*/ -static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) +static int relay_mmap_prepare_buf(struct rchan_buf *buf, + struct vm_area_desc *desc) { - unsigned long length = vma->vm_end - vma->vm_start; + unsigned long length = vma_desc_size(desc); if (!buf) return -EBADF; @@ -90,9 +91,9 @@ static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) if (length != (unsigned long)buf->chan->alloc_size) return -EINVAL; - vma->vm_ops = &relay_file_mmap_ops; - vm_flags_set(vma, VM_DONTEXPAND); - vma->vm_private_data = buf; + desc->vm_ops = &relay_file_mmap_ops; + desc->vm_flags |= VM_DONTEXPAND; + desc->private_data = buf; return 0; } @@ -749,16 +750,16 @@ static int relay_file_open(struct inode *inode, struct file *filp) } /** - * relay_file_mmap - mmap file op for relay files - * @filp: the file - * @vma: the vma describing what to map + * relay_file_mmap_prepare - mmap file op for relay files + * @desc: describing what to map * - * Calls upon relay_mmap_buf() to map the file into user space. + * Calls upon relay_mmap_prepare_buf() to map the file into user space. 
*/ -static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma) +static int relay_file_mmap_prepare(struct vm_area_desc *desc) { - struct rchan_buf *buf = filp->private_data; - return relay_mmap_buf(buf, vma); + struct rchan_buf *buf = desc->file->private_data; + + return relay_mmap_prepare_buf(buf, desc); } /** @@ -1006,7 +1007,7 @@ static ssize_t relay_file_read(struct file *filp, const struct file_operations relay_file_operations = { .open = relay_file_open, .poll = relay_file_poll, - .mmap = relay_file_mmap, + .mmap_prepare = relay_file_mmap_prepare, .read = relay_file_read, .release = relay_file_release, }; diff --git a/kernel/resource.c b/kernel/resource.c index b9fa2a4ce089..e4e9bac12e6e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -341,6 +341,8 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, unsigned long flags, unsigned long desc, struct resource *res) { + /* Skip children until we find a top level range that matches */ + bool skip_children = true; struct resource *p; if (!res) @@ -351,7 +353,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, read_lock(&resource_lock); - for_each_resource(&iomem_resource, p, false) { + for_each_resource(&iomem_resource, p, skip_children) { /* If we passed the resource we are looking for, stop */ if (p->start > end) { p = NULL; @@ -362,6 +364,12 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, if (p->end < start) continue; + /* + * We found a top level range that matches what we are looking + * for. Time to start checking children too. 
+ */ + skip_children = false; + /* Found a match, break */ if (is_type_match(p, flags, desc)) break; diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index cdea931aae30..954137775f38 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -178,8 +178,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) * this process can already run with task_group() == prev->tg or we can * race with cgroup code which can read autogroup = prev under rq->lock. * In the latter case for_each_thread() can not miss a migrating thread, - * cpu_cgroup_attach() must not be possible after cgroup_exit() and it - * can't be removed from thread list, we hold ->siglock. + * cpu_cgroup_attach() must not be possible after cgroup_task_exit() + * and it can't be removed from thread list, we hold ->siglock. * * If an exiting thread was already removed from thread list we rely on * sched_autogroup_exit_task(). diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fc358c1b6ca9..41ba0be16911 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -878,7 +878,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) rq_lock(rq, &rf); update_rq_clock(rq); - rq->donor->sched_class->task_tick(rq, rq->curr, 1); + rq->donor->sched_class->task_tick(rq, rq->donor, 1); rq_unlock(rq, &rf); return HRTIMER_NORESTART; @@ -5143,6 +5143,14 @@ static struct rq *finish_task_switch(struct task_struct *prev) if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); + /* + * sched_ext_dead() must come before cgroup_task_dead() to + * prevent cgroups from being removed while its member tasks are + * visible to SCX schedulers. + */ + sched_ext_dead(prev); + cgroup_task_dead(prev); + /* Task is done with its stack. 
*/ put_task_stack(prev); @@ -7352,15 +7360,12 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) p->prio = prio; } out_unlock: - /* Avoid rq from going away on us: */ - preempt_disable(); + /* Caller holds task_struct::pi_lock, IRQs are still disabled */ rq_unpin_lock(rq, &rf); __balance_callbacks(rq); rq_repin_lock(rq, &rf); __task_rq_unlock(rq, p, &rf); - - preempt_enable(); } #endif /* CONFIG_RT_MUTEXES */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 67f540c23717..319439fe1870 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2675,6 +2675,7 @@ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu return NULL; } +/* Access rule: must be called on local CPU with preemption disabled */ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl); static int find_later_rq(struct task_struct *task) @@ -3117,11 +3118,43 @@ void __init init_sched_dl_class(void) GFP_KERNEL, cpu_to_node(i)); } +/* + * This function always returns a non-empty bitmap in @cpus. This is because + * if a root domain has reserved bandwidth for DL tasks, the DL bandwidth + * check will prevent CPU hotplug from deactivating all CPUs in that domain. + */ +static void dl_get_task_effective_cpus(struct task_struct *p, struct cpumask *cpus) +{ + const struct cpumask *hk_msk; + + hk_msk = housekeeping_cpumask(HK_TYPE_DOMAIN); + if (housekeeping_enabled(HK_TYPE_DOMAIN)) { + if (!cpumask_intersects(p->cpus_ptr, hk_msk)) { + /* + * CPUs isolated by "isolcpus=domain" always belong to + * def_root_domain. + */ + cpumask_andnot(cpus, cpu_active_mask, hk_msk); + return; + } + } + + /* + * If a root domain holds a DL task, it must have active CPUs. So + * active CPUs can always be found by walking up the task's cpuset + * hierarchy up to the partition root. 
+ */ + cpuset_cpus_allowed_locked(p, cpus); +} + +/* The caller should hold cpuset_mutex */ void dl_add_task_root_domain(struct task_struct *p) { struct rq_flags rf; struct rq *rq; struct dl_bw *dl_b; + unsigned int cpu; + struct cpumask *msk = this_cpu_cpumask_var_ptr(local_cpu_mask_dl); raw_spin_lock_irqsave(&p->pi_lock, rf.flags); if (!dl_task(p) || dl_entity_is_special(&p->dl)) { @@ -3129,16 +3162,25 @@ void dl_add_task_root_domain(struct task_struct *p) return; } - rq = __task_rq_lock(p, &rf); - + /* + * Get an active rq, whose rq->rd traces the correct root + * domain. + * Ideally this would be under cpuset reader lock until rq->rd is + * fetched. However, sleepable locks cannot nest inside pi_lock, so we + * rely on the caller of dl_add_task_root_domain() holding 'cpuset_mutex' + * to guarantee the CPU stays in the cpuset. + */ + dl_get_task_effective_cpus(p, msk); + cpu = cpumask_first_and(cpu_active_mask, msk); + BUG_ON(cpu >= nr_cpu_ids); + rq = cpu_rq(cpu); dl_b = &rq->rd->dl_bw; - raw_spin_lock(&dl_b->lock); + /* End of fetching rd */ + raw_spin_lock(&dl_b->lock); __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span)); - raw_spin_unlock(&dl_b->lock); - - task_rq_unlock(rq, p, &rf); + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); } void dl_clear_root_domain(struct root_domain *rd) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 6827689a0966..05f5a49e9649 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -33,9 +33,10 @@ static DEFINE_MUTEX(scx_enable_mutex); DEFINE_STATIC_KEY_FALSE(__scx_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED); -static unsigned long scx_in_softlockup; -static atomic_t scx_breather_depth = ATOMIC_INIT(0); static int scx_bypass_depth; +static cpumask_var_t scx_bypass_lb_donee_cpumask; +static cpumask_var_t scx_bypass_lb_resched_cpumask; +static bool scx_aborting; static bool scx_init_task_enabled; static bool scx_switching_all; 
DEFINE_STATIC_KEY_FALSE(__scx_switched_all); @@ -68,18 +69,18 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; static struct delayed_work scx_watchdog_work; /* - * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of pick_task sequence + * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of kick_sync sequence * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated * lazily when enabling and freed when disabling to avoid waste when sched_ext * isn't active. */ -struct scx_kick_pseqs { +struct scx_kick_syncs { struct rcu_head rcu; - unsigned long seqs[]; + unsigned long syncs[]; }; -static DEFINE_PER_CPU(struct scx_kick_pseqs __rcu *, scx_kick_pseqs); +static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs); /* * Direct dispatch marker. @@ -143,26 +144,70 @@ static struct scx_dump_data scx_dump_data = { /* /sys/kernel/sched_ext interface */ static struct kset *scx_kset; +/* + * Parameters that can be adjusted through /sys/module/sched_ext/parameters. + * There usually is no reason to modify these as normal scheduler operation + * shouldn't be affected by them. The knobs are primarily for debugging. 
+ */ +static u64 scx_slice_dfl = SCX_SLICE_DFL; +static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC; +static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US; + +static int set_slice_us(const char *val, const struct kernel_param *kp) +{ + return param_set_uint_minmax(val, kp, 100, 100 * USEC_PER_MSEC); +} + +static const struct kernel_param_ops slice_us_param_ops = { + .set = set_slice_us, + .get = param_get_uint, +}; + +static int set_bypass_lb_intv_us(const char *val, const struct kernel_param *kp) +{ + return param_set_uint_minmax(val, kp, 0, 10 * USEC_PER_SEC); +} + +static const struct kernel_param_ops bypass_lb_intv_us_param_ops = { + .set = set_bypass_lb_intv_us, + .get = param_get_uint, +}; + +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "sched_ext." + +module_param_cb(slice_bypass_us, &slice_us_param_ops, &scx_slice_bypass_us, 0600); +MODULE_PARM_DESC(slice_bypass_us, "bypass slice in microseconds, applied on [un]load (100us to 100ms)"); +module_param_cb(bypass_lb_intv_us, &bypass_lb_intv_us_param_ops, &scx_bypass_lb_intv_us, 0600); +MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microseconds (0 (disable) to 10s)"); + +#undef MODULE_PARAM_PREFIX + #define CREATE_TRACE_POINTS #include <trace/events/sched_ext.h> static void process_ddsp_deferred_locals(struct rq *rq); +static u32 reenq_local(struct rq *rq); static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags); -static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, +static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, s64 exit_code, const char *fmt, va_list args); -static __printf(4, 5) void scx_exit(struct scx_sched *sch, +static __printf(4, 5) bool scx_exit(struct scx_sched *sch, enum scx_exit_kind kind, s64 exit_code, const char *fmt, ...) 
{ va_list args; + bool ret; va_start(args, fmt); - scx_vexit(sch, kind, exit_code, fmt, args); + ret = scx_vexit(sch, kind, exit_code, fmt, args); va_end(args); + + return ret; } #define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args) +#define scx_verror(sch, fmt, args) scx_vexit((sch), SCX_EXIT_ERROR, 0, fmt, args) #define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op) @@ -200,7 +245,15 @@ static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id) { - return rhashtable_lookup_fast(&sch->dsq_hash, &dsq_id, dsq_hash_params); + return rhashtable_lookup(&sch->dsq_hash, &dsq_id, dsq_hash_params); +} + +static const struct sched_class *scx_setscheduler_class(struct task_struct *p) +{ + if (p->sched_class == &stop_sched_class) + return &stop_sched_class; + + return __setscheduler_class(p->policy, p->prio); } /* @@ -469,19 +522,16 @@ struct scx_task_iter { * RCU read lock or obtaining a reference count. * * All tasks which existed when the iteration started are guaranteed to be - * visited as long as they still exist. + * visited as long as they are not dead. 
*/ static void scx_task_iter_start(struct scx_task_iter *iter) { - BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & - ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); + memset(iter, 0, sizeof(*iter)); raw_spin_lock_irq(&scx_tasks_lock); iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; list_add(&iter->cursor.tasks_node, &scx_tasks); - iter->locked_task = NULL; - iter->cnt = 0; iter->list_locked = true; } @@ -547,14 +597,13 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) struct list_head *cursor = &iter->cursor.tasks_node; struct sched_ext_entity *pos; - __scx_task_iter_maybe_relock(iter); - if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) { scx_task_iter_unlock(iter); cond_resched(); - __scx_task_iter_maybe_relock(iter); } + __scx_task_iter_maybe_relock(iter); + list_for_each_entry(pos, cursor, tasks_node) { if (&pos->tasks_node == &scx_tasks) return NULL; @@ -755,6 +804,11 @@ static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err static void run_deferred(struct rq *rq) { process_ddsp_deferred_locals(rq); + + if (local_read(&rq->scx.reenq_local_deferred)) { + local_set(&rq->scx.reenq_local_deferred, 0); + reenq_local(rq); + } } static void deferred_bal_cb_workfn(struct rq *rq) @@ -775,12 +829,28 @@ static void deferred_irq_workfn(struct irq_work *irq_work) * schedule_deferred - Schedule execution of deferred actions on an rq * @rq: target rq * - * Schedule execution of deferred actions on @rq. Must be called with @rq - * locked. Deferred actions are executed with @rq locked but unpinned, and thus - * can unlock @rq to e.g. migrate tasks to other rqs. + * Schedule execution of deferred actions on @rq. Deferred actions are executed + * with @rq locked but unpinned, and thus can unlock @rq to e.g. migrate tasks + * to other rqs. */ static void schedule_deferred(struct rq *rq) { + /* + * Queue an irq work. 
They are executed on IRQ re-enable which may take + * a bit longer than the scheduler hook in schedule_deferred_locked(). + */ + irq_work_queue(&rq->scx.deferred_irq_work); +} + +/** + * schedule_deferred_locked - Schedule execution of deferred actions on an rq + * @rq: target rq + * + * Schedule execution of deferred actions on @rq. Equivalent to + * schedule_deferred() but requires @rq to be locked and can be more efficient. + */ +static void schedule_deferred_locked(struct rq *rq) +{ lockdep_assert_rq_held(rq); /* @@ -812,12 +882,11 @@ static void schedule_deferred(struct rq *rq) } /* - * No scheduler hooks available. Queue an irq work. They are executed on - * IRQ re-enable which may take a bit longer than the scheduler hooks. - * The above WAKEUP and BALANCE paths should cover most of the cases and - * the time to IRQ re-enable shouldn't be long. + * No scheduler hooks available. Use the generic irq_work path. The + * above WAKEUP and BALANCE paths should cover most of the cases and the + * time to IRQ re-enable shouldn't be long. */ - irq_work_queue(&rq->scx.deferred_irq_work); + schedule_deferred(rq); } /** @@ -902,7 +971,7 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) { - p->scx.slice = SCX_SLICE_DFL; + p->scx.slice = READ_ONCE(scx_slice_dfl); __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1); } @@ -916,7 +985,9 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, !RB_EMPTY_NODE(&p->scx.dsq_priq)); if (!is_local) { - raw_spin_lock(&dsq->lock); + raw_spin_lock_nested(&dsq->lock, + (enq_flags & SCX_ENQ_NESTED) ? 
SINGLE_DEPTH_NESTING : 0); + if (unlikely(dsq->id == SCX_DSQ_INVALID)) { scx_error(sch, "attempting to dispatch to a destroyed dsq"); /* fall back to the global dsq */ @@ -965,8 +1036,11 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, container_of(rbp, struct task_struct, scx.dsq_priq); list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node); + /* first task unchanged - no update needed */ } else { list_add(&p->scx.dsq_list.node, &dsq->list); + /* not builtin and new task is at head - use fastpath */ + rcu_assign_pointer(dsq->first_task, p); } } else { /* a FIFO DSQ shouldn't be using PRIQ enqueuing */ @@ -974,10 +1048,19 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks", dsq->id); - if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) + if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) { list_add(&p->scx.dsq_list.node, &dsq->list); - else + /* new task inserted at head - use fastpath */ + if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) + rcu_assign_pointer(dsq->first_task, p); + } else { + bool was_empty; + + was_empty = list_empty(&dsq->list); list_add_tail(&p->scx.dsq_list.node, &dsq->list); + if (was_empty && !(dsq->id & SCX_DSQ_FLAG_BUILTIN)) + rcu_assign_pointer(dsq->first_task, p); + } } /* seq records the order tasks are queued, used by BPF DSQ iterator */ @@ -1034,6 +1117,13 @@ static void task_unlink_from_dsq(struct task_struct *p, list_del_init(&p->scx.dsq_list.node); dsq_mod_nr(dsq, -1); + + if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) { + struct task_struct *first_task; + + first_task = nldsq_next_task(dsq, NULL, false); + rcu_assign_pointer(dsq->first_task, first_task); + } } static void dispatch_dequeue(struct rq *rq, struct task_struct *p) @@ -1041,6 +1131,8 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p) struct scx_dispatch_q *dsq = p->scx.dsq; bool is_local = dsq == &rq->scx.local_dsq; 
+ lockdep_assert_rq_held(rq); + if (!dsq) { /* * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals. @@ -1087,6 +1179,20 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p) raw_spin_unlock(&dsq->lock); } +/* + * Abbreviated version of dispatch_dequeue() that can be used when both @p's rq + * and dsq are locked. + */ +static void dispatch_dequeue_locked(struct task_struct *p, + struct scx_dispatch_q *dsq) +{ + lockdep_assert_rq_held(task_rq(p)); + lockdep_assert_held(&dsq->lock); + + task_unlink_from_dsq(p, dsq); + p->scx.dsq = NULL; +} + static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, struct rq *rq, u64 dsq_id, struct task_struct *p) @@ -1192,7 +1298,7 @@ static void direct_dispatch(struct scx_sched *sch, struct task_struct *p, WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); list_add_tail(&p->scx.dsq_list.node, &rq->scx.ddsp_deferred_locals); - schedule_deferred(rq); + schedule_deferred_locked(rq); return; } @@ -1217,6 +1323,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, { struct scx_sched *sch = scx_root; struct task_struct **ddsp_taskp; + struct scx_dispatch_q *dsq; unsigned long qseq; WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); @@ -1235,7 +1342,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (scx_rq_bypassing(rq)) { __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); - goto global; + goto bypass; } if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) @@ -1284,8 +1391,20 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, direct: direct_dispatch(sch, p, enq_flags); return; - +local_norefill: + dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags); + return; local: + dsq = &rq->scx.local_dsq; + goto enqueue; +global: + dsq = find_global_dsq(sch, p); + goto enqueue; +bypass: + dsq = &task_rq(p)->scx.bypass_dsq; + goto enqueue; + +enqueue: /* * For task-ordering, slice refill must be treated as 
implying the end * of the current slice. Otherwise, the longer @p stays on the CPU, the @@ -1293,14 +1412,7 @@ local: */ touch_core_sched(rq, p); refill_task_slice_dfl(sch, p); -local_norefill: - dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags); - return; - -global: - touch_core_sched(rq, p); /* see the comment in local: */ - refill_task_slice_dfl(sch, p); - dispatch_enqueue(sch, find_global_dsq(sch, p), p, enq_flags); + dispatch_enqueue(sch, dsq, p, enq_flags); } static bool task_runnable(const struct task_struct *p) @@ -1741,8 +1853,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch, * @p is going from a non-local DSQ to a non-local DSQ. As * $src_dsq is already locked, do an abbreviated dequeue. */ - task_unlink_from_dsq(p, src_dsq); - p->scx.dsq = NULL; + dispatch_dequeue_locked(p, src_dsq); raw_spin_unlock(&src_dsq->lock); dispatch_enqueue(sch, dst_dsq, p, enq_flags); @@ -1751,49 +1862,12 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch, return dst_rq; } -/* - * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly - * banging on the same DSQ on a large NUMA system to the point where switching - * to the bypass mode can take a long time. Inject artificial delays while the - * bypass mode is switching to guarantee timely completion. 
- */ -static void scx_breather(struct rq *rq) -{ - u64 until; - - lockdep_assert_rq_held(rq); - - if (likely(!atomic_read(&scx_breather_depth))) - return; - - raw_spin_rq_unlock(rq); - - until = ktime_get_ns() + NSEC_PER_MSEC; - - do { - int cnt = 1024; - while (atomic_read(&scx_breather_depth) && --cnt) - cpu_relax(); - } while (atomic_read(&scx_breather_depth) && - time_before64(ktime_get_ns(), until)); - - raw_spin_rq_lock(rq); -} - static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq, struct scx_dispatch_q *dsq) { struct task_struct *p; retry: /* - * This retry loop can repeatedly race against scx_bypass() dequeueing - * tasks from @dsq trying to put the system into the bypass mode. On - * some multi-socket machines (e.g. 2x Intel 8480c), this can live-lock - * the machine into soft lockups. Give a breather. - */ - scx_breather(rq); - - /* * The caller can't expect to successfully consume a task if the task's * addition to @dsq isn't guaranteed to be visible somehow. Test * @dsq->list without locking and skip if it seems empty. @@ -1806,6 +1880,17 @@ retry: nldsq_for_each_task(p, dsq) { struct rq *task_rq = task_rq(p); + /* + * This loop can lead to multiple lockup scenarios, e.g. the BPF + * scheduler can put an enormous number of affinitized tasks into + * a contended DSQ, or the outer retry loop can repeatedly race + * against scx_bypass() dequeueing tasks from @dsq trying to put + * the system into the bypass mode. This can easily live-lock the + * machine. If aborting, exit from all non-bypass DSQs. 
+ */ + if (unlikely(READ_ONCE(scx_aborting)) && dsq->id != SCX_DSQ_BYPASS) + break; + if (rq == task_rq) { task_unlink_from_dsq(p, dsq); move_local_task_to_local_dsq(p, 0, dsq, rq); @@ -2089,8 +2174,14 @@ static int balance_one(struct rq *rq, struct task_struct *prev) if (consume_global_dsq(sch, rq)) goto has_tasks; - if (unlikely(!SCX_HAS_OP(sch, dispatch)) || - scx_rq_bypassing(rq) || !scx_rq_online(rq)) + if (scx_rq_bypassing(rq)) { + if (consume_dispatch_q(sch, rq, &rq->scx.bypass_dsq)) + goto has_tasks; + else + goto no_tasks; + } + + if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq)) goto no_tasks; dspc->rq = rq; @@ -2241,12 +2332,6 @@ static void switch_class(struct rq *rq, struct task_struct *next) struct scx_sched *sch = scx_root; const struct sched_class *next_class = next->sched_class; - /* - * Pairs with the smp_load_acquire() issued by a CPU in - * kick_cpus_irq_workfn() who is waiting for this CPU to perform a - * resched. - */ - smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT)) return; @@ -2286,6 +2371,10 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, struct task_struct *next) { struct scx_sched *sch = scx_root; + + /* see kick_cpus_irq_workfn() */ + smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); + update_curr_scx(rq); /* see dequeue_task_scx() on why we skip when !QUEUED */ @@ -2332,18 +2421,32 @@ static struct task_struct *first_local_task(struct rq *rq) struct task_struct, scx.dsq_list.node); } -static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) +static struct task_struct * +do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) { struct task_struct *prev = rq->curr; bool keep_prev, kick_idle = false; struct task_struct *p; + /* see kick_cpus_irq_workfn() */ + smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); + rq_modified_clear(rq); + rq_unpin_lock(rq, rf); balance_one(rq, prev); 
rq_repin_lock(rq, rf); maybe_queue_balance_callback(rq); - if (rq_modified_above(rq, &ext_sched_class)) + + /* + * If any higher-priority sched class enqueued a runnable task on + * this rq during balance_one(), abort and return RETRY_TASK, so + * that the scheduler loop can restart. + * + * If @force_scx is true, always try to pick a SCHED_EXT task, + * regardless of any higher-priority sched classes activity. + */ + if (!force_scx && rq_modified_above(rq, &ext_sched_class)) return RETRY_TASK; keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; @@ -2386,6 +2489,11 @@ static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) return p; } +static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) +{ + return do_pick_task_scx(rq, rf, false); +} + #ifdef CONFIG_SCHED_CORE /** * scx_prio_less - Task ordering for core-sched @@ -2842,7 +2950,7 @@ void init_scx_entity(struct sched_ext_entity *scx) INIT_LIST_HEAD(&scx->runnable_node); scx->runnable_at = jiffies; scx->ddsp_dsq_id = SCX_DSQ_INVALID; - scx->slice = SCX_SLICE_DFL; + scx->slice = READ_ONCE(scx_slice_dfl); } void scx_pre_fork(struct task_struct *p) @@ -2908,7 +3016,7 @@ void scx_cancel_fork(struct task_struct *p) percpu_up_read(&scx_fork_rwsem); } -void sched_ext_free(struct task_struct *p) +void sched_ext_dead(struct task_struct *p) { unsigned long flags; @@ -3012,6 +3120,7 @@ void scx_tg_init(struct task_group *tg) tg->scx.weight = CGROUP_WEIGHT_DFL; tg->scx.bw_period_us = default_bw_period_us(); tg->scx.bw_quota_us = RUNTIME_INF; + tg->scx.idle = false; } int scx_tg_online(struct task_group *tg) @@ -3160,7 +3269,18 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) void scx_group_set_idle(struct task_group *tg, bool idle) { - /* TODO: Implement ops->cgroup_set_idle() */ + struct scx_sched *sch = scx_root; + + percpu_down_read(&scx_cgroup_ops_rwsem); + + if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle)) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, 
cgroup_set_idle, NULL, + tg_cgrp(tg), idle); + + /* Update the task group's idle state */ + tg->scx.idle = idle; + + percpu_up_read(&scx_cgroup_ops_rwsem); } void scx_group_set_bandwidth(struct task_group *tg, @@ -3575,38 +3695,55 @@ bool scx_allow_ttwu_queue(const struct task_struct *p) } /** - * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler + * handle_lockup - sched_ext common lockup handler + * @fmt: format string * - * While there are various reasons why RCU CPU stalls can occur on a system - * that may not be caused by the current BPF scheduler, try kicking out the - * current scheduler in an attempt to recover the system to a good state before - * issuing panics. + * Called on system stall or lockup condition and initiates abort of sched_ext + * if enabled, which may resolve the reported lockup. + * + * Returns %true if sched_ext is enabled and abort was initiated, which may + * resolve the lockup. %false if sched_ext is not enabled or abort was already + * initiated by someone else. */ -bool scx_rcu_cpu_stall(void) +static __printf(1, 2) bool handle_lockup(const char *fmt, ...) { struct scx_sched *sch; + va_list args; + bool ret; - rcu_read_lock(); + guard(rcu)(); sch = rcu_dereference(scx_root); - if (unlikely(!sch)) { - rcu_read_unlock(); + if (unlikely(!sch)) return false; - } switch (scx_enable_state()) { case SCX_ENABLING: case SCX_ENABLED: - break; + va_start(args, fmt); + ret = scx_verror(sch, fmt, args); + va_end(args); + return ret; default: - rcu_read_unlock(); return false; } +} - scx_error(sch, "RCU CPU stall detected!"); - rcu_read_unlock(); - - return true; +/** + * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler + * + * While there are various reasons why RCU CPU stalls can occur on a system + * that may not be caused by the current BPF scheduler, try kicking out the + * current scheduler in an attempt to recover the system to a good state before + * issuing panics. 
+ * + * Returns %true if sched_ext is enabled and abort was initiated, which may + * resolve the reported RCU stall. %false if sched_ext is not enabled or someone + * else already initiated abort. + */ +bool scx_rcu_cpu_stall(void) +{ + return handle_lockup("RCU CPU stall detected!"); } /** @@ -3617,50 +3754,240 @@ bool scx_rcu_cpu_stall(void) * live-lock the system by making many CPUs target the same DSQ to the point * where soft-lockup detection triggers. This function is called from * soft-lockup watchdog when the triggering point is close and tries to unjam - * the system by enabling the breather and aborting the BPF scheduler. + * the system by aborting the BPF scheduler. */ void scx_softlockup(u32 dur_s) { - struct scx_sched *sch; + if (!handle_lockup("soft lockup - CPU %d stuck for %us", smp_processor_id(), dur_s)) + return; - rcu_read_lock(); + printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU %d stuck for %us, disabling BPF scheduler\n", + smp_processor_id(), dur_s); +} - sch = rcu_dereference(scx_root); - if (unlikely(!sch)) - goto out_unlock; +/** + * scx_hardlockup - sched_ext hardlockup handler + * + * A poorly behaving BPF scheduler can trigger hard lockup by e.g. putting + * numerous affinitized tasks in a single queue and directing all CPUs at it. + * Try kicking out the current scheduler in an attempt to recover the system to + * a good state before taking more drastic actions. + * + * Returns %true if sched_ext is enabled and abort was initiated, which may + * resolve the reported hard lockup. %false if sched_ext is not enabled or + * someone else already initiated abort. 
+ */ +bool scx_hardlockup(int cpu) +{ + if (!handle_lockup("hard lockup - CPU %d", cpu)) + return false; - switch (scx_enable_state()) { - case SCX_ENABLING: - case SCX_ENABLED: - break; - default: - goto out_unlock; + printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n", + cpu); + return true; +} + +static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq, + struct cpumask *donee_mask, struct cpumask *resched_mask, + u32 nr_donor_target, u32 nr_donee_target) +{ + struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq; + struct task_struct *p, *n; + struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, 0, 0); + s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target; + u32 nr_balanced = 0, min_delta_us; + + /* + * All we want to guarantee is reasonable forward progress. No reason to + * fine tune. Assuming every task on @donor_dsq runs their full slice, + * consider offloading iff the total queued duration is over the + * threshold. + */ + min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV; + if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us)) + return 0; + + raw_spin_rq_lock_irq(rq); + raw_spin_lock(&donor_dsq->lock); + list_add(&cursor.node, &donor_dsq->list); +resume: + n = container_of(&cursor, struct task_struct, scx.dsq_list); + n = nldsq_next_task(donor_dsq, n, false); + + while ((p = n)) { + struct rq *donee_rq; + struct scx_dispatch_q *donee_dsq; + int donee; + + n = nldsq_next_task(donor_dsq, n, false); + + if (donor_dsq->nr <= nr_donor_target) + break; + + if (cpumask_empty(donee_mask)) + break; + + donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr); + if (donee >= nr_cpu_ids) + continue; + + donee_rq = cpu_rq(donee); + donee_dsq = &donee_rq->scx.bypass_dsq; + + /* + * $p's rq is not locked but $p's DSQ lock protects its + * scheduling properties making this test safe. 
+ */ + if (!task_can_run_on_remote_rq(sch, p, donee_rq, false)) + continue; + + /* + * Moving $p from one non-local DSQ to another. The source rq + * and DSQ are already locked. Do an abbreviated dequeue and + * then perform enqueue without unlocking $donor_dsq. + * + * We don't want to drop and reacquire the lock on each + * iteration as @donor_dsq can be very long and potentially + * highly contended. Donee DSQs are less likely to be contended. + * The nested locking is safe as only this LB moves tasks + * between bypass DSQs. + */ + dispatch_dequeue_locked(p, donor_dsq); + dispatch_enqueue(sch, donee_dsq, p, SCX_ENQ_NESTED); + + /* + * $donee might have been idle and need to be woken up. No need + * to be clever. Kick every CPU that receives tasks. + */ + cpumask_set_cpu(donee, resched_mask); + + if (READ_ONCE(donee_dsq->nr) >= nr_donee_target) + cpumask_clear_cpu(donee, donee_mask); + + nr_balanced++; + if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) { + list_move_tail(&cursor.node, &n->scx.dsq_list.node); + raw_spin_unlock(&donor_dsq->lock); + raw_spin_rq_unlock_irq(rq); + cpu_relax(); + raw_spin_rq_lock_irq(rq); + raw_spin_lock(&donor_dsq->lock); + goto resume; + } } - /* allow only one instance, cleared at the end of scx_bypass() */ - if (test_and_set_bit(0, &scx_in_softlockup)) - goto out_unlock; + list_del_init(&cursor.node); + raw_spin_unlock(&donor_dsq->lock); + raw_spin_rq_unlock_irq(rq); - printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n", - smp_processor_id(), dur_s, scx_root->ops.name); + return nr_balanced; +} + +static void bypass_lb_node(struct scx_sched *sch, int node) +{ + const struct cpumask *node_mask = cpumask_of_node(node); + struct cpumask *donee_mask = scx_bypass_lb_donee_cpumask; + struct cpumask *resched_mask = scx_bypass_lb_resched_cpumask; + u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0; + u32 nr_target, nr_donor_target; + u32 before_min = U32_MAX, before_max = 0; + u32 after_min = U32_MAX, 
after_max = 0; + int cpu; + + /* count the target tasks and CPUs */ + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { + u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr); + + nr_tasks += nr; + nr_cpus++; + + before_min = min(nr, before_min); + before_max = max(nr, before_max); + } + + if (!nr_cpus) + return; /* - * Some CPUs may be trapped in the dispatch paths. Enable breather - * immediately; otherwise, we might even be able to get to scx_bypass(). + * We don't want CPUs to have more than $nr_donor_target tasks and + * balancing to fill donee CPUs upto $nr_target. Once targets are + * calculated, find the donee CPUs. */ - atomic_inc(&scx_breather_depth); + nr_target = DIV_ROUND_UP(nr_tasks, nr_cpus); + nr_donor_target = DIV_ROUND_UP(nr_target * SCX_BYPASS_LB_DONOR_PCT, 100); - scx_error(sch, "soft lockup - CPU#%d stuck for %us", smp_processor_id(), dur_s); -out_unlock: - rcu_read_unlock(); + cpumask_clear(donee_mask); + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { + if (READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr) < nr_target) + cpumask_set_cpu(cpu, donee_mask); + } + + /* iterate !donee CPUs and see if they should be offloaded */ + cpumask_clear(resched_mask); + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { + struct rq *rq = cpu_rq(cpu); + struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq; + + if (cpumask_empty(donee_mask)) + break; + if (cpumask_test_cpu(cpu, donee_mask)) + continue; + if (READ_ONCE(donor_dsq->nr) <= nr_donor_target) + continue; + + nr_balanced += bypass_lb_cpu(sch, rq, donee_mask, resched_mask, + nr_donor_target, nr_target); + } + + for_each_cpu(cpu, resched_mask) { + struct rq *rq = cpu_rq(cpu); + + raw_spin_rq_lock_irq(rq); + resched_curr(rq); + raw_spin_rq_unlock_irq(rq); + } + + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { + u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr); + + after_min = min(nr, after_min); + after_max = max(nr, after_max); + + } + + trace_sched_ext_bypass_lb(node, nr_cpus, nr_tasks, 
nr_balanced, + before_min, before_max, after_min, after_max); } -static void scx_clear_softlockup(void) +/* + * In bypass mode, all tasks are put on the per-CPU bypass DSQs. If the machine + * is over-saturated and the BPF scheduler skewed tasks into few CPUs, some + * bypass DSQs can be overloaded. If there are enough tasks to saturate other + * lightly loaded CPUs, such imbalance can lead to very high execution latency + * on the overloaded CPUs and thus to hung tasks and RCU stalls. To avoid such + * outcomes, a simple load balancing mechanism is implemented by the following + * timer which runs periodically while bypass mode is in effect. + */ +static void scx_bypass_lb_timerfn(struct timer_list *timer) { - if (test_and_clear_bit(0, &scx_in_softlockup)) - atomic_dec(&scx_breather_depth); + struct scx_sched *sch; + int node; + u32 intv_us; + + sch = rcu_dereference_all(scx_root); + if (unlikely(!sch) || !READ_ONCE(scx_bypass_depth)) + return; + + for_each_node_with_cpus(node) + bypass_lb_node(sch, node); + + intv_us = READ_ONCE(scx_bypass_lb_intv_us); + if (intv_us) + mod_timer(timer, jiffies + usecs_to_jiffies(intv_us)); } +static DEFINE_TIMER(scx_bypass_lb_timer, scx_bypass_lb_timerfn); + /** * scx_bypass - [Un]bypass scx_ops and guarantee forward progress * @bypass: true for bypass, false for unbypass @@ -3704,25 +4031,34 @@ static void scx_bypass(bool bypass) sch = rcu_dereference_bh(scx_root); if (bypass) { - scx_bypass_depth++; + u32 intv_us; + + WRITE_ONCE(scx_bypass_depth, scx_bypass_depth + 1); WARN_ON_ONCE(scx_bypass_depth <= 0); if (scx_bypass_depth != 1) goto unlock; + WRITE_ONCE(scx_slice_dfl, scx_slice_bypass_us * NSEC_PER_USEC); bypass_timestamp = ktime_get_ns(); if (sch) scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1); + + intv_us = READ_ONCE(scx_bypass_lb_intv_us); + if (intv_us && !timer_pending(&scx_bypass_lb_timer)) { + scx_bypass_lb_timer.expires = + jiffies + usecs_to_jiffies(intv_us); + add_timer_global(&scx_bypass_lb_timer); + } } else { - 
scx_bypass_depth--; + WRITE_ONCE(scx_bypass_depth, scx_bypass_depth - 1); WARN_ON_ONCE(scx_bypass_depth < 0); if (scx_bypass_depth != 0) goto unlock; + WRITE_ONCE(scx_slice_dfl, SCX_SLICE_DFL); if (sch) scx_add_event(sch, SCX_EV_BYPASS_DURATION, ktime_get_ns() - bypass_timestamp); } - atomic_inc(&scx_breather_depth); - /* * No task property is changing. We just need to make sure all currently * queued tasks are re-queued according to the new scx_rq_bypassing() @@ -3778,10 +4114,8 @@ static void scx_bypass(bool bypass) raw_spin_rq_unlock(rq); } - atomic_dec(&scx_breather_depth); unlock: raw_spin_unlock_irqrestore(&bypass_lock, flags); - scx_clear_softlockup(); } static void free_exit_info(struct scx_exit_info *ei) @@ -3834,24 +4168,17 @@ static const char *scx_exit_reason(enum scx_exit_kind kind) } } -static void free_kick_pseqs_rcu(struct rcu_head *rcu) -{ - struct scx_kick_pseqs *pseqs = container_of(rcu, struct scx_kick_pseqs, rcu); - - kvfree(pseqs); -} - -static void free_kick_pseqs(void) +static void free_kick_syncs(void) { int cpu; for_each_possible_cpu(cpu) { - struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu); - struct scx_kick_pseqs *to_free; + struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu); + struct scx_kick_syncs *to_free; - to_free = rcu_replace_pointer(*pseqs, NULL, true); + to_free = rcu_replace_pointer(*ksyncs, NULL, true); if (to_free) - call_rcu(&to_free->rcu, free_kick_pseqs_rcu); + kvfree_rcu(to_free, rcu); } } @@ -3876,6 +4203,7 @@ static void scx_disable_workfn(struct kthread_work *work) /* guarantee forward progress by bypassing scx_ops */ scx_bypass(true); + WRITE_ONCE(scx_aborting, false); switch (scx_set_enable_state(SCX_DISABLING)) { case SCX_DISABLING: @@ -3920,8 +4248,7 @@ static void scx_disable_workfn(struct kthread_work *work) while ((p = scx_task_iter_next_locked(&sti))) { unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *old_class = p->sched_class; 
- const struct sched_class *new_class = - __setscheduler_class(p->policy, p->prio); + const struct sched_class *new_class = scx_setscheduler_class(p); update_rq_clock(task_rq(p)); @@ -3989,7 +4316,7 @@ static void scx_disable_workfn(struct kthread_work *work) free_percpu(scx_dsp_ctx); scx_dsp_ctx = NULL; scx_dsp_max_batch = 0; - free_kick_pseqs(); + free_kick_syncs(); mutex_unlock(&scx_enable_mutex); @@ -3998,9 +4325,24 @@ done: scx_bypass(false); } -static void scx_disable(enum scx_exit_kind kind) +static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind) { int none = SCX_EXIT_NONE; + + if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind)) + return false; + + /* + * Some CPUs may be trapped in the dispatch paths. Set the aborting + * flag to break potential live-lock scenarios, ensuring we can + * successfully reach scx_bypass(). + */ + WRITE_ONCE(scx_aborting, true); + return true; +} + +static void scx_disable(enum scx_exit_kind kind) +{ struct scx_sched *sch; if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) @@ -4009,7 +4351,7 @@ static void scx_disable(enum scx_exit_kind kind) rcu_read_lock(); sch = rcu_dereference(scx_root); if (sch) { - atomic_try_cmpxchg(&sch->exit_kind, &none, kind); + scx_claim_exit(sch, kind); kthread_queue_work(sch->helper, &sch->disable_work); } rcu_read_unlock(); @@ -4238,10 +4580,10 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) seq_buf_init(&ns, buf, avail); dump_newline(&ns); - dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu", + dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu ksync=%lu", cpu, rq->scx.nr_running, rq->scx.flags, rq->scx.cpu_released, rq->scx.ops_qseq, - rq->scx.pnt_seq); + rq->scx.kick_sync); dump_line(&ns, " curr=%s[%d] class=%ps", rq->curr->comm, rq->curr->pid, rq->curr->sched_class); @@ -4325,15 +4667,14 @@ static void scx_error_irq_workfn(struct irq_work *irq_work) kthread_queue_work(sch->helper, 
&sch->disable_work); } -static void scx_vexit(struct scx_sched *sch, +static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, s64 exit_code, const char *fmt, va_list args) { struct scx_exit_info *ei = sch->exit_info; - int none = SCX_EXIT_NONE; - if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind)) - return; + if (!scx_claim_exit(sch, kind)) + return false; ei->exit_code = exit_code; #ifdef CONFIG_STACKTRACE @@ -4350,9 +4691,10 @@ static void scx_vexit(struct scx_sched *sch, ei->reason = scx_exit_reason(ei->kind); irq_work_queue(&sch->error_irq_work); + return true; } -static int alloc_kick_pseqs(void) +static int alloc_kick_syncs(void) { int cpu; @@ -4361,19 +4703,19 @@ static int alloc_kick_pseqs(void) * can exceed percpu allocator limits on large machines. */ for_each_possible_cpu(cpu) { - struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu); - struct scx_kick_pseqs *new_pseqs; + struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu); + struct scx_kick_syncs *new_ksyncs; - WARN_ON_ONCE(rcu_access_pointer(*pseqs)); + WARN_ON_ONCE(rcu_access_pointer(*ksyncs)); - new_pseqs = kvzalloc_node(struct_size(new_pseqs, seqs, nr_cpu_ids), - GFP_KERNEL, cpu_to_node(cpu)); - if (!new_pseqs) { - free_kick_pseqs(); + new_ksyncs = kvzalloc_node(struct_size(new_ksyncs, syncs, nr_cpu_ids), + GFP_KERNEL, cpu_to_node(cpu)); + if (!new_ksyncs) { + free_kick_syncs(); return -ENOMEM; } - rcu_assign_pointer(*pseqs, new_pseqs); + rcu_assign_pointer(*ksyncs, new_ksyncs); } return 0; @@ -4460,7 +4802,7 @@ err_free_sch: return ERR_PTR(ret); } -static void check_hotplug_seq(struct scx_sched *sch, +static int check_hotplug_seq(struct scx_sched *sch, const struct sched_ext_ops *ops) { unsigned long long global_hotplug_seq; @@ -4477,8 +4819,11 @@ static void check_hotplug_seq(struct scx_sched *sch, SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, "expected hotplug seq %llu did not match actual %llu", ops->hotplug_seq, global_hotplug_seq); + return -EBUSY; 
} } + + return 0; } static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops) @@ -4505,6 +4850,9 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops) if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT) pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n"); + if (ops->cpu_acquire || ops->cpu_release) + pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n"); + return 0; } @@ -4529,14 +4877,14 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) goto err_unlock; } - ret = alloc_kick_pseqs(); + ret = alloc_kick_syncs(); if (ret) goto err_unlock; sch = scx_alloc_and_add_sched(ops); if (IS_ERR(sch)) { ret = PTR_ERR(sch); - goto err_free_pseqs; + goto err_free_ksyncs; } /* @@ -4545,6 +4893,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) */ WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED); WARN_ON_ONCE(scx_root); + if (WARN_ON_ONCE(READ_ONCE(scx_aborting))) + WRITE_ONCE(scx_aborting, false); atomic_long_set(&scx_nr_rejected, 0); @@ -4580,7 +4930,11 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (((void (**)(void))ops)[i]) set_bit(i, sch->has_op); - check_hotplug_seq(sch, ops); + ret = check_hotplug_seq(sch, ops); + if (ret) { + cpus_read_unlock(); + goto err_disable; + } scx_idle_update_selcpu_topology(ops); cpus_read_unlock(); @@ -4697,21 +5051,18 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) while ((p = scx_task_iter_next_locked(&sti))) { unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; const struct sched_class *old_class = p->sched_class; - const struct sched_class *new_class = - __setscheduler_class(p->policy, p->prio); + const struct sched_class *new_class = scx_setscheduler_class(p); - if (!tryget_task_struct(p)) + if (scx_get_task_state(p) != SCX_TASK_READY) continue; if (old_class != new_class) queue_flags |= DEQUEUE_CLASS; scoped_guard (sched_change, 
p, queue_flags) { - p->scx.slice = SCX_SLICE_DFL; + p->scx.slice = READ_ONCE(scx_slice_dfl); p->sched_class = new_class; } - - put_task_struct(p); } scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); @@ -4735,8 +5086,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) return 0; -err_free_pseqs: - free_kick_pseqs(); +err_free_ksyncs: + free_kick_syncs(); err_unlock: mutex_unlock(&scx_enable_mutex); return ret; @@ -4953,6 +5304,7 @@ static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *fro static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {} static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {} +static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {} #endif static void sched_ext_ops__cpu_online(s32 cpu) {} static void sched_ext_ops__cpu_offline(s32 cpu) {} @@ -4991,6 +5343,7 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = { .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move, .cgroup_set_weight = sched_ext_ops__cgroup_set_weight, .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth, + .cgroup_set_idle = sched_ext_ops__cgroup_set_idle, #endif .cpu_online = sched_ext_ops__cpu_online, .cpu_offline = sched_ext_ops__cpu_offline, @@ -5064,29 +5417,38 @@ static bool can_skip_idle_kick(struct rq *rq) return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE); } -static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *pseqs) +static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs) { struct rq *rq = cpu_rq(cpu); struct scx_rq *this_scx = &this_rq->scx; + const struct sched_class *cur_class; bool should_wait = false; unsigned long flags; raw_spin_rq_lock_irqsave(rq, flags); + cur_class = rq->curr->sched_class; /* * During CPU hotplug, a 
CPU may depend on kicking itself to make - * forward progress. Allow kicking self regardless of online state. + * forward progress. Allow kicking self regardless of online state. If + * @cpu is running a higher class task, we have no control over @cpu. + * Skip kicking. */ - if (cpu_online(cpu) || cpu == cpu_of(this_rq)) { + if ((cpu_online(cpu) || cpu == cpu_of(this_rq)) && + !sched_class_above(cur_class, &ext_sched_class)) { if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) { - if (rq->curr->sched_class == &ext_sched_class) + if (cur_class == &ext_sched_class) rq->curr->scx.slice = 0; cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); } if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { - pseqs[cpu] = rq->scx.pnt_seq; - should_wait = true; + if (cur_class == &ext_sched_class) { + ksyncs[cpu] = rq->scx.kick_sync; + should_wait = true; + } else { + cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); + } } resched_curr(rq); @@ -5118,20 +5480,20 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) { struct rq *this_rq = this_rq(); struct scx_rq *this_scx = &this_rq->scx; - struct scx_kick_pseqs __rcu *pseqs_pcpu = __this_cpu_read(scx_kick_pseqs); + struct scx_kick_syncs __rcu *ksyncs_pcpu = __this_cpu_read(scx_kick_syncs); bool should_wait = false; - unsigned long *pseqs; + unsigned long *ksyncs; s32 cpu; - if (unlikely(!pseqs_pcpu)) { - pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_pseqs"); + if (unlikely(!ksyncs_pcpu)) { + pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_syncs"); return; } - pseqs = rcu_dereference_bh(pseqs_pcpu)->seqs; + ksyncs = rcu_dereference_bh(ksyncs_pcpu)->syncs; for_each_cpu(cpu, this_scx->cpus_to_kick) { - should_wait |= kick_one_cpu(cpu, this_rq, pseqs); + should_wait |= kick_one_cpu(cpu, this_rq, ksyncs); cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); } @@ -5145,20 +5507,21 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) 
return; for_each_cpu(cpu, this_scx->cpus_to_wait) { - unsigned long *wait_pnt_seq = &cpu_rq(cpu)->scx.pnt_seq; + unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync; - if (cpu != cpu_of(this_rq)) { - /* - * Pairs with smp_store_release() issued by this CPU in - * switch_class() on the resched path. - * - * We busy-wait here to guarantee that no other task can - * be scheduled on our core before the target CPU has - * entered the resched path. - */ - while (smp_load_acquire(wait_pnt_seq) == pseqs[cpu]) - cpu_relax(); - } + /* + * Busy-wait until the task running at the time of kicking is no + * longer running. This can be used to implement e.g. core + * scheduling. + * + * smp_cond_load_acquire() pairs with store_releases in + * pick_task_scx() and put_prev_task_scx(). The former breaks + * the wait if SCX's scheduling path is entered even if the same + * task is picked subsequently. The latter is necessary to break + * the wait when $cpu is taken by a higher sched class. + */ + if (cpu != cpu_of(this_rq)) + smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]); cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); } @@ -5257,6 +5620,7 @@ void __init init_sched_ext_class(void) int n = cpu_to_node(cpu); init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); + init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS); INIT_LIST_HEAD(&rq->scx.runnable_list); INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); @@ -5362,19 +5726,23 @@ __bpf_kfunc_start_defs(); * exhaustion. If zero, the current residual slice is maintained. If * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with * scx_bpf_kick_cpu() to trigger scheduling. + * + * Returns %true on successful insertion, %false on failure. On the root + * scheduler, %false return triggers scheduler abort and the caller doesn't need + * to check the return value. 
*/ -__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, - u64 enq_flags) +__bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id, + u64 slice, u64 enq_flags) { struct scx_sched *sch; guard(rcu)(); sch = rcu_dereference(scx_root); if (unlikely(!sch)) - return; + return false; if (!scx_dsq_insert_preamble(sch, p, enq_flags)) - return; + return false; if (slice) p->scx.slice = slice; @@ -5382,56 +5750,114 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice p->scx.slice = p->scx.slice ?: 1; scx_dsq_insert_commit(sch, p, dsq_id, enq_flags); + + return true; +} + +/* + * COMPAT: Will be removed in v6.23 along with the ___v2 suffix. + */ +__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, + u64 slice, u64 enq_flags) +{ + scx_bpf_dsq_insert___v2(p, dsq_id, slice, enq_flags); +} + +static bool scx_dsq_insert_vtime(struct scx_sched *sch, struct task_struct *p, + u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) +{ + if (!scx_dsq_insert_preamble(sch, p, enq_flags)) + return false; + + if (slice) + p->scx.slice = slice; + else + p->scx.slice = p->scx.slice ?: 1; + + p->scx.dsq_vtime = vtime; + + scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); + + return true; } +struct scx_bpf_dsq_insert_vtime_args { + /* @p can't be packed together as KF_RCU is not transitive */ + u64 dsq_id; + u64 slice; + u64 vtime; + u64 enq_flags; +}; + /** - * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ + * __scx_bpf_dsq_insert_vtime - Arg-wrapped vtime DSQ insertion * @p: task_struct to insert - * @dsq_id: DSQ to insert into - * @slice: duration @p can run for in nsecs, 0 to keep the current value - * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ - * @enq_flags: SCX_ENQ_* + * @args: struct containing the rest of the arguments + * @args->dsq_id: DSQ to insert into + * @args->slice: duration @p can run for in nsecs, 0 to keep 
the current value + * @args->vtime: @p's ordering inside the vtime-sorted queue of the target DSQ + * @args->enq_flags: SCX_ENQ_* + * + * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument + * limit. BPF programs should use scx_bpf_dsq_insert_vtime() which is provided + * as an inline wrapper in common.bpf.h. * - * Insert @p into the vtime priority queue of the DSQ identified by @dsq_id. - * Tasks queued into the priority queue are ordered by @vtime. All other aspects - * are identical to scx_bpf_dsq_insert(). + * Insert @p into the vtime priority queue of the DSQ identified by + * @args->dsq_id. Tasks queued into the priority queue are ordered by + * @args->vtime. All other aspects are identical to scx_bpf_dsq_insert(). * - * @vtime ordering is according to time_before64() which considers wrapping. A - * numerically larger vtime may indicate an earlier position in the ordering and - * vice-versa. + * @args->vtime ordering is according to time_before64() which considers + * wrapping. A numerically larger vtime may indicate an earlier position in the + * ordering and vice-versa. * * A DSQ can only be used as a FIFO or priority queue at any given time and this * function must not be called on a DSQ which already has one or more FIFO tasks * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and * SCX_DSQ_GLOBAL) cannot be used as priority queues. + * + * Returns %true on successful insertion, %false on failure. On the root + * scheduler, %false return triggers scheduler abort and the caller doesn't need + * to check the return value. 
*/ -__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, - u64 slice, u64 vtime, u64 enq_flags) +__bpf_kfunc bool +__scx_bpf_dsq_insert_vtime(struct task_struct *p, + struct scx_bpf_dsq_insert_vtime_args *args) { struct scx_sched *sch; guard(rcu)(); + sch = rcu_dereference(scx_root); if (unlikely(!sch)) - return; + return false; - if (!scx_dsq_insert_preamble(sch, p, enq_flags)) - return; + return scx_dsq_insert_vtime(sch, p, args->dsq_id, args->slice, + args->vtime, args->enq_flags); +} - if (slice) - p->scx.slice = slice; - else - p->scx.slice = p->scx.slice ?: 1; +/* + * COMPAT: Will be removed in v6.23. + */ +__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, + u64 slice, u64 vtime, u64 enq_flags) +{ + struct scx_sched *sch; - p->scx.dsq_vtime = vtime; + guard(rcu)(); - scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return; + + scx_dsq_insert_vtime(sch, p, dsq_id, slice, vtime, enq_flags); } __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_insert___v2, KF_RCU) +BTF_ID_FLAGS(func, __scx_bpf_dsq_insert_vtime, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) @@ -5455,6 +5881,13 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, return false; /* + * If the BPF scheduler keeps calling this function repeatedly, it can + * cause similar live-lock conditions as consume_dispatch_q(). + */ + if (unlikely(READ_ONCE(scx_aborting))) + return false; + + /* * Can be called from either ops.dispatch() locking this_rq() or any * context where no rq lock is held. If latter, lock @p's task_rq which * we'll likely need anyway. 
@@ -5474,13 +5907,6 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, raw_spin_rq_lock(src_rq); } - /* - * If the BPF scheduler keeps calling this function repeatedly, it can - * cause similar live-lock conditions as consume_dispatch_q(). Insert a - * breather if necessary. - */ - scx_breather(src_rq); - locked_rq = src_rq; raw_spin_lock(&src_dsq->lock); @@ -5685,8 +6111,9 @@ __bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq * lock (e.g. BPF timers or SYSCALL programs). * - * Returns %true if @p has been consumed, %false if @p had already been consumed - * or dequeued. + * Returns %true if @p has been consumed, %false if @p had already been + * consumed, dequeued, or, for sub-scheds, @dsq_id points to a disallowed local + * DSQ. */ __bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, @@ -5738,32 +6165,12 @@ static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { .set = &scx_kfunc_ids_dispatch, }; -__bpf_kfunc_start_defs(); - -/** - * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ - * - * Iterate over all of the tasks currently enqueued on the local DSQ of the - * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of - * processed tasks. Can only be called from ops.cpu_release(). 
- */ -__bpf_kfunc u32 scx_bpf_reenqueue_local(void) +static u32 reenq_local(struct rq *rq) { - struct scx_sched *sch; LIST_HEAD(tasks); u32 nr_enqueued = 0; - struct rq *rq; struct task_struct *p, *n; - guard(rcu)(); - sch = rcu_dereference(scx_root); - if (unlikely(!sch)) - return 0; - - if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE)) - return 0; - - rq = cpu_rq(smp_processor_id()); lockdep_assert_rq_held(rq); /* @@ -5800,6 +6207,37 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void) return nr_enqueued; } +__bpf_kfunc_start_defs(); + +/** + * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ + * + * Iterate over all of the tasks currently enqueued on the local DSQ of the + * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of + * processed tasks. Can only be called from ops.cpu_release(). + * + * COMPAT: Will be removed in v6.23 along with the ___v2 suffix on the void + * returning variant that can be called from anywhere. + */ +__bpf_kfunc u32 scx_bpf_reenqueue_local(void) +{ + struct scx_sched *sch; + struct rq *rq; + + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return 0; + + if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE)) + return 0; + + rq = cpu_rq(smp_processor_id()); + lockdep_assert_rq_held(rq); + + return reenq_local(rq); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_cpu_release) @@ -5872,6 +6310,34 @@ static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { __bpf_kfunc_start_defs(); +/** + * scx_bpf_task_set_slice - Set task's time slice + * @p: task of interest + * @slice: time slice to set in nsecs + * + * Set @p's time slice to @slice. Returns %true on success, %false if the + * calling scheduler doesn't have authority over @p. 
+ */ +__bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice) +{ + p->scx.slice = slice; + return true; +} + +/** + * scx_bpf_task_set_dsq_vtime - Set task's virtual time for DSQ ordering + * @p: task of interest + * @vtime: virtual time to set + * + * Set @p's virtual time to @vtime. Returns %true on success, %false if the + * calling scheduler doesn't have authority over @p. + */ +__bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime) +{ + p->scx.dsq_vtime = vtime; + return true; +} + static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags) { struct rq *this_rq; @@ -6029,6 +6495,8 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, sizeof(struct bpf_iter_scx_dsq)); BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != __alignof__(struct bpf_iter_scx_dsq)); + BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & + ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); /* * next() and destroy() will be called regardless of the return value. @@ -6047,9 +6515,8 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, if (!kit->dsq) return -ENOENT; - INIT_LIST_HEAD(&kit->cursor.node); - kit->cursor.flags = SCX_DSQ_LNODE_ITER_CURSOR | flags; - kit->cursor.priv = READ_ONCE(kit->dsq->seq); + kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, flags, + READ_ONCE(kit->dsq->seq)); return 0; } @@ -6123,6 +6590,40 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) kit->dsq = NULL; } +/** + * scx_bpf_dsq_peek - Lockless peek at the first element. + * @dsq_id: DSQ to examine. + * + * Read the first element in the DSQ. This is semantically equivalent to using + * the DSQ iterator, but is lockfree. Of course, like any lockless operation, + * this provides only a point-in-time snapshot, and the contents may change + * by the time any subsequent locking operation reads the queue. + * + * Returns the pointer, or NULL indicates an empty queue OR internal error. 
+ */ +__bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) +{ + struct scx_sched *sch; + struct scx_dispatch_q *dsq; + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return NULL; + + if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) { + scx_error(sch, "peek disallowed on builtin DSQ 0x%llx", dsq_id); + return NULL; + } + + dsq = find_user_dsq(sch, dsq_id); + if (unlikely(!dsq)) { + scx_error(sch, "peek on non-existent DSQ 0x%llx", dsq_id); + return NULL; + } + + return rcu_dereference(dsq->first_task); +} + __bpf_kfunc_end_defs(); static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, @@ -6277,6 +6778,24 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, } /** + * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ + * + * Iterate over all of the tasks currently enqueued on the local DSQ of the + * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from + * anywhere. + */ +__bpf_kfunc void scx_bpf_reenqueue_local___v2(void) +{ + struct rq *rq; + + guard(preempt)(); + + rq = this_rq(); + local_set(&rq->scx.reenq_local_deferred, 1); + schedule_deferred(rq); +} + +/** * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU * @cpu: CPU of interest * @@ -6677,15 +7196,19 @@ __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_any) +BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_RCU); +BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_RCU); BTF_ID_FLAGS(func, scx_bpf_kick_cpu) BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) +BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, 
scx_bpf_error_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) BTF_ID_FLAGS(func, scx_bpf_cpuperf_set) @@ -6776,6 +7299,12 @@ static int __init scx_init(void) return ret; } + if (!alloc_cpumask_var(&scx_bypass_lb_donee_cpumask, GFP_KERNEL) || + !alloc_cpumask_var(&scx_bypass_lb_resched_cpumask, GFP_KERNEL)) { + pr_err("sched_ext: Failed to allocate cpumasks\n"); + return -ENOMEM; + } + return 0; } __initcall(scx_init); diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index d2434c954848..3d9d404d5cd2 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -995,26 +995,56 @@ __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, return prev_cpu; } +struct scx_bpf_select_cpu_and_args { + /* @p and @cpus_allowed can't be packed together as KF_RCU is not transitive */ + s32 prev_cpu; + u64 wake_flags; + u64 flags; +}; + /** - * scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p, - * prioritizing those in @cpus_allowed + * __scx_bpf_select_cpu_and - Arg-wrapped CPU selection with cpumask * @p: task_struct to select a CPU for - * @prev_cpu: CPU @p was on previously - * @wake_flags: %SCX_WAKE_* flags * @cpus_allowed: cpumask of allowed CPUs - * @flags: %SCX_PICK_IDLE* flags + * @args: struct containing the rest of the arguments + * @args->prev_cpu: CPU @p was on previously + * @args->wake_flags: %SCX_WAKE_* flags + * @args->flags: %SCX_PICK_IDLE* flags + * + * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument + * limit. BPF programs should use scx_bpf_select_cpu_and() which is provided + * as an inline wrapper in common.bpf.h. 
* * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked * context such as a BPF test_run() call, as long as built-in CPU selection * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE * is set. * - * @p, @prev_cpu and @wake_flags match ops.select_cpu(). + * @p, @args->prev_cpu and @args->wake_flags match ops.select_cpu(). * * Returns the selected idle CPU, which will be automatically awakened upon * returning from ops.select_cpu() and can be used for direct dispatch, or * a negative value if no idle CPU is available. */ +__bpf_kfunc s32 +__scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed, + struct scx_bpf_select_cpu_and_args *args) +{ + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + + return select_cpu_from_kfunc(sch, p, args->prev_cpu, args->wake_flags, + cpus_allowed, args->flags); +} + +/* + * COMPAT: Will be removed in v6.22. + */ __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags, const struct cpumask *cpus_allowed, u64 flags) { @@ -1383,6 +1413,7 @@ BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) +BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_idle) diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index b3617abed510..386c677e4c9a 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -23,6 +23,11 @@ enum scx_consts { * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. 
*/ SCX_TASK_ITER_BATCH = 32, + + SCX_BYPASS_LB_DFL_INTV_US = 500 * USEC_PER_MSEC, + SCX_BYPASS_LB_DONOR_PCT = 125, + SCX_BYPASS_LB_MIN_DELTA_DIV = 4, + SCX_BYPASS_LB_BATCH = 256, }; enum scx_exit_kind { @@ -697,12 +702,23 @@ struct sched_ext_ops { * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be * interpreted in the same fashion and specifies how much @cgrp can * burst temporarily. The specific control mechanism and thus the - * interpretation of @period_us and burstiness is upto to the BPF + * interpretation of @period_us and burstiness is up to the BPF * scheduler. */ void (*cgroup_set_bandwidth)(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us); + /** + * @cgroup_set_idle: A cgroup's idle state is being changed + * @cgrp: cgroup whose idle state is being updated + * @idle: whether the cgroup is entering or exiting idle state + * + * Update @cgrp's idle state to @idle. This callback is invoked when + * a cgroup transitions between idle and non-idle states, allowing the + * BPF scheduler to adjust its behavior accordingly. + */ + void (*cgroup_set_idle)(struct cgroup *cgrp, bool idle); + #endif /* CONFIG_EXT_GROUP_SCHED */ /* @@ -884,6 +900,10 @@ struct scx_sched { struct scx_dispatch_q **global_dsqs; struct scx_sched_pcpu __percpu *pcpu; + /* + * Updates to the following warned bitfields can race causing RMW issues + * but it doesn't really matter. + */ bool warned_zero_slice:1; bool warned_deprecated_rq:1; @@ -948,6 +968,7 @@ enum scx_enq_flags { SCX_ENQ_CLEAR_OPSS = 1LLU << 56, SCX_ENQ_DSQ_PRIQ = 1LLU << 57, + SCX_ENQ_NESTED = 1LLU << 58, }; enum scx_deq_flags { @@ -986,8 +1007,10 @@ enum scx_kick_flags { SCX_KICK_PREEMPT = 1LLU << 1, /* - * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will - * return after the target CPU finishes picking the next task. + * The scx_bpf_kick_cpu() call will return after the current SCX task of + * the target CPU switches out. This can be used to implement e.g. core + * scheduling. 
This has no effect if the current task on the target CPU + * is not on SCX. */ SCX_KICK_WAIT = 1LLU << 2, }; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 769d7b7990df..da46c3164537 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4034,6 +4034,9 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) if (child_cfs_rq_on_list(cfs_rq)) return false; + if (cfs_rq->tg_load_avg_contrib) + return false; + return true; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8590113e4a60..d30cca6870f5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -803,10 +803,12 @@ struct scx_rq { cpumask_var_t cpus_to_kick_if_idle; cpumask_var_t cpus_to_preempt; cpumask_var_t cpus_to_wait; - unsigned long pnt_seq; + unsigned long kick_sync; + local_t reenq_local_deferred; struct balance_callback deferred_bal_cb; struct irq_work deferred_irq_work; struct irq_work kick_cpus_irq_work; + struct scx_dispatch_q bypass_dsq; }; #endif /* CONFIG_SCHED_CLASS_EXT */ @@ -1165,7 +1167,7 @@ struct rq { * one CPU and if it got migrated afterwards it may decrease * it on another CPU. Always updated under the runqueue lock: */ - unsigned long nr_uninterruptible; + unsigned long nr_uninterruptible; #ifdef CONFIG_SCHED_PROXY_EXEC struct task_struct __rcu *donor; /* Scheduling context */ diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index cbf7206b3f9d..c903f1a42891 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -180,8 +180,13 @@ static inline void psi_dequeue(struct task_struct *p, int flags) * avoid walking all ancestors twice, psi_task_switch() handles * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU. * Do nothing here. + * + * In the SCHED_PROXY_EXECUTION case we may do sleeping + * dequeues that are not followed by a task switch, so check + * TSK_ONCPU is set to ensure the task switch is imminent. + * Otherwise clear the flags as usual. 
*/ - if (flags & DEQUEUE_SLEEP) + if ((flags & DEQUEUE_SLEEP) && (p->psi_flags & TSK_ONCPU)) return; /* diff --git a/kernel/scs.c b/kernel/scs.c index d7809affe740..772488afd5b9 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -135,7 +135,7 @@ static void scs_check_usage(struct task_struct *tsk) if (!IS_ENABLED(CONFIG_DEBUG_STACK_USAGE)) return; - for (p = task_scs(tsk); p < __scs_magic(tsk); ++p) { + for (p = task_scs(tsk); p < __scs_magic(task_scs(tsk)); ++p) { if (!READ_ONCE_NOCHECK(*p)) break; used += sizeof(*p); diff --git a/kernel/smp.c b/kernel/smp.c index 02f52291fae4..f349960f79ca 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -1088,6 +1088,28 @@ void wake_up_all_idle_cpus(void) EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); /** + * cpus_peek_for_pending_ipi - Check for pending IPI for CPUs + * @mask: The CPU mask for the CPUs to check. + * + * This function walks through the @mask to check if there are any pending IPIs + * scheduled, for any of the CPUs in the @mask. It does not guarantee + * correctness as it only provides a racy snapshot. + * + * Returns true if there is a pending IPI scheduled and false otherwise. + */ +bool cpus_peek_for_pending_ipi(const struct cpumask *mask) +{ + unsigned int cpu; + + for_each_cpu(cpu, mask) { + if (!llist_empty(per_cpu_ptr(&call_single_queue, cpu))) + return true; + } + + return false; +} + +/** * struct smp_call_on_cpu_struct - Call a function on a specific CPU * @work: &work_struct * @done: &completion to signal diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cb6196e3fa99..2cd767b9680e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -13,7 +13,6 @@ #include <linux/highuid.h> #include <linux/writeback.h> #include <linux/initrd.h> -#include <linux/times.h> #include <linux/limits.h> #include <linux/syscalls.h> #include <linux/capability.h> @@ -55,7 +54,8 @@ static const int cap_last_cap = CAP_LAST_CAP; * to the buffer. 
* * These write modes control how current file position affects the behavior of - * updating sysctl values through the proc interface on each write. + * updating internal kernel (SYSCTL_USER_TO_KERN) sysctl values through the proc + * interface on each write. */ enum sysctl_writes_mode { SYSCTL_WRITES_LEGACY = -1, @@ -73,7 +73,7 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; #ifdef CONFIG_PROC_SYSCTL -static int _proc_do_string(char *data, int maxlen, int write, +static int _proc_do_string(char *data, int maxlen, int dir, char *buffer, size_t *lenp, loff_t *ppos) { size_t len; @@ -84,7 +84,7 @@ static int _proc_do_string(char *data, int maxlen, int write, return 0; } - if (write) { + if (SYSCTL_USER_TO_KERN(dir)) { if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) { /* Only continue writes not past the end of buffer. */ len = strlen(data); @@ -172,7 +172,7 @@ static bool proc_first_pos_non_zero_ignore(loff_t *ppos, /** * proc_dostring - read a string sysctl * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file + * @dir: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -186,13 +186,13 @@ static bool proc_first_pos_non_zero_ignore(loff_t *ppos, * * Returns 0 on success. 
*/ -int proc_dostring(const struct ctl_table *table, int write, +int proc_dostring(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { - if (write) + if (SYSCTL_USER_TO_KERN(dir)) proc_first_pos_non_zero_ignore(ppos, table); - return _proc_do_string(table->data, table->maxlen, write, buffer, lenp, + return _proc_do_string(table->data, table->maxlen, dir, buffer, lenp, ppos); } @@ -354,74 +354,55 @@ static void proc_put_char(void **buf, size_t *size, char c) } } -static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - if (write) { - if (*negp) { - if (*lvalp > (unsigned long) INT_MAX + 1) - return -EINVAL; - WRITE_ONCE(*valp, -*lvalp); - } else { - if (*lvalp > (unsigned long) INT_MAX) - return -EINVAL; - WRITE_ONCE(*valp, *lvalp); - } - } else { - int val = READ_ONCE(*valp); - if (val < 0) { - *negp = true; - *lvalp = -(unsigned long)val; - } else { - *negp = false; - *lvalp = (unsigned long)val; - } - } - return 0; -} +static SYSCTL_USER_TO_KERN_INT_CONV(, SYSCTL_CONV_IDENTITY) +static SYSCTL_KERN_TO_USER_INT_CONV(, SYSCTL_CONV_IDENTITY) + +static SYSCTL_INT_CONV_CUSTOM(, sysctl_user_to_kern_int_conv, + sysctl_kern_to_user_int_conv, false) +static SYSCTL_INT_CONV_CUSTOM(_minmax, sysctl_user_to_kern_int_conv, + sysctl_kern_to_user_int_conv, true) + -static int do_proc_douintvec_conv(unsigned long *lvalp, - unsigned int *valp, - int write, void *data) +static SYSCTL_USER_TO_KERN_UINT_CONV(, SYSCTL_CONV_IDENTITY) + +int sysctl_kern_to_user_uint_conv(unsigned long *u_ptr, + const unsigned int *k_ptr) { - if (write) { - if (*lvalp > UINT_MAX) - return -EINVAL; - WRITE_ONCE(*valp, *lvalp); - } else { - unsigned int val = READ_ONCE(*valp); - *lvalp = (unsigned long)val; - } + unsigned int val = READ_ONCE(*k_ptr); + *u_ptr = (unsigned long)val; return 0; } +static SYSCTL_UINT_CONV_CUSTOM(, sysctl_user_to_kern_uint_conv, + sysctl_kern_to_user_uint_conv, false) +static 
SYSCTL_UINT_CONV_CUSTOM(_minmax, sysctl_user_to_kern_uint_conv, + sysctl_kern_to_user_uint_conv, true) + static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; -static int __do_proc_dointvec(void *tbl_data, const struct ctl_table *table, - int write, void *buffer, - size_t *lenp, loff_t *ppos, - int (*conv)(bool *negp, unsigned long *lvalp, int *valp, - int write, void *data), - void *data) +static int do_proc_dointvec(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(bool *negp, unsigned long *u_ptr, int *k_ptr, + int dir, const struct ctl_table *table)) { int *i, vleft, first = 1, err = 0; size_t left; char *p; - if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { + if (!table->data || !table->maxlen || !*lenp || + (*ppos && SYSCTL_KERN_TO_USER(dir))) { *lenp = 0; return 0; } - i = (int *) tbl_data; + i = (int *) table->data; vleft = table->maxlen / sizeof(*i); left = *lenp; if (!conv) - conv = do_proc_dointvec_conv; + conv = do_proc_int_conv; - if (write) { + if (SYSCTL_USER_TO_KERN(dir)) { if (proc_first_pos_non_zero_ignore(ppos, table)) goto out; @@ -434,7 +415,7 @@ static int __do_proc_dointvec(void *tbl_data, const struct ctl_table *table, unsigned long lval; bool neg; - if (write) { + if (SYSCTL_USER_TO_KERN(dir)) { proc_skip_spaces(&p, &left); if (!left) @@ -444,12 +425,12 @@ static int __do_proc_dointvec(void *tbl_data, const struct ctl_table *table, sizeof(proc_wspace_sep), NULL); if (err) break; - if (conv(&neg, &lval, i, 1, data)) { + if (conv(&neg, &lval, i, 1, table)) { err = -EINVAL; break; } } else { - if (conv(&neg, &lval, i, 0, data)) { + if (conv(&neg, &lval, i, 0, table)) { err = -EINVAL; break; } @@ -459,11 +440,11 @@ static int __do_proc_dointvec(void *tbl_data, const struct ctl_table *table, } } - if (!write && !first && left && !err) + if (SYSCTL_KERN_TO_USER(dir) && !first && left && !err) proc_put_char(&buffer, &left, '\n'); - if (write && !err && left) + if 
(SYSCTL_USER_TO_KERN(dir) && !err && left) proc_skip_spaces(&p, &left); - if (write && first) + if (SYSCTL_USER_TO_KERN(dir) && first) return err ? : -EINVAL; *lenp -= left; out: @@ -471,24 +452,11 @@ out: return err; } -static int do_proc_dointvec(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(bool *negp, unsigned long *lvalp, int *valp, - int write, void *data), - void *data) -{ - return __do_proc_dointvec(table->data, table, write, - buffer, lenp, ppos, conv, data); -} - -static int do_proc_douintvec_w(unsigned int *tbl_data, - const struct ctl_table *table, - void *buffer, +static int do_proc_douintvec_w(const struct ctl_table *table, void *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(unsigned long *lvalp, - unsigned int *valp, - int write, void *data), - void *data) + int (*conv)(unsigned long *u_ptr, + unsigned int *k_ptr, int dir, + const struct ctl_table *table)) { unsigned long lval; int err = 0; @@ -518,7 +486,7 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, goto out_free; } - if (conv(&lval, tbl_data, 1, data)) { + if (conv(&lval, (unsigned int *) table->data, 1, table)) { err = -EINVAL; goto out_free; } @@ -532,18 +500,16 @@ out_free: return 0; - /* This is in keeping with old __do_proc_dointvec() */ bail_early: *ppos += *lenp; return err; } -static int do_proc_douintvec_r(unsigned int *tbl_data, void *buffer, +static int do_proc_douintvec_r(const struct ctl_table *table, void *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(unsigned long *lvalp, - unsigned int *valp, - int write, void *data), - void *data) + int (*conv)(unsigned long *u_ptr, + unsigned int *k_ptr, int dir, + const struct ctl_table *table)) { unsigned long lval; int err = 0; @@ -551,7 +517,7 @@ static int do_proc_douintvec_r(unsigned int *tbl_data, void *buffer, left = *lenp; - if (conv(&lval, tbl_data, 0, data)) { + if (conv(&lval, (unsigned int *) table->data, 0, table)) { err = -EINVAL; goto out; } @@ -569,23 
+535,21 @@ out: return err; } -static int __do_proc_douintvec(void *tbl_data, const struct ctl_table *table, - int write, void *buffer, - size_t *lenp, loff_t *ppos, - int (*conv)(unsigned long *lvalp, - unsigned int *valp, - int write, void *data), - void *data) +static int do_proc_douintvec(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *u_ptr, + unsigned int *k_ptr, int dir, + const struct ctl_table *table)) { - unsigned int *i, vleft; + unsigned int vleft; - if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { + if (!table->data || !table->maxlen || !*lenp || + (*ppos && SYSCTL_KERN_TO_USER(dir))) { *lenp = 0; return 0; } - i = (unsigned int *) tbl_data; - vleft = table->maxlen / sizeof(*i); + vleft = table->maxlen / sizeof(unsigned int); /* * Arrays are not supported, keep this simple. *Do not* add @@ -597,29 +561,26 @@ static int __do_proc_douintvec(void *tbl_data, const struct ctl_table *table, } if (!conv) - conv = do_proc_douintvec_conv; + conv = do_proc_uint_conv; - if (write) - return do_proc_douintvec_w(i, table, buffer, lenp, ppos, - conv, data); - return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data); + if (SYSCTL_USER_TO_KERN(dir)) + return do_proc_douintvec_w(table, buffer, lenp, ppos, conv); + return do_proc_douintvec_r(table, buffer, lenp, ppos, conv); } -int do_proc_douintvec(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(unsigned long *lvalp, - unsigned int *valp, - int write, void *data), - void *data) +int proc_douintvec_conv(const struct ctl_table *table, int dir, void *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *u_ptr, unsigned int *k_ptr, + int dir, const struct ctl_table *table)) { - return __do_proc_douintvec(table->data, table, write, - buffer, lenp, ppos, conv, data); + return do_proc_douintvec(table, dir, buffer, lenp, ppos, conv); } + /** * proc_dobool - read/write a bool * 
@table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file + * @dir: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -632,7 +593,7 @@ int do_proc_douintvec(const struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_dobool(const struct ctl_table *table, int write, void *buffer, +int proc_dobool(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tmp; @@ -648,10 +609,10 @@ int proc_dobool(const struct ctl_table *table, int write, void *buffer, tmp.data = &val; val = READ_ONCE(*data); - res = proc_dointvec(&tmp, write, buffer, lenp, ppos); + res = proc_dointvec(&tmp, dir, buffer, lenp, ppos); if (res) return res; - if (write) + if (SYSCTL_USER_TO_KERN(dir)) WRITE_ONCE(*data, val); return 0; } @@ -659,7 +620,7 @@ int proc_dobool(const struct ctl_table *table, int write, void *buffer, /** * proc_dointvec - read a vector of integers * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file + * @dir: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -669,16 +630,16 @@ int proc_dobool(const struct ctl_table *table, int write, void *buffer, * * Returns 0 on success. 
*/ -int proc_dointvec(const struct ctl_table *table, int write, void *buffer, +int proc_dointvec(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); + return do_proc_dointvec(table, dir, buffer, lenp, ppos, NULL); } /** * proc_douintvec - read a vector of unsigned integers * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file + * @dir: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -688,57 +649,17 @@ int proc_dointvec(const struct ctl_table *table, int write, void *buffer, * * Returns 0 on success. */ -int proc_douintvec(const struct ctl_table *table, int write, void *buffer, +int proc_douintvec(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_douintvec(table, write, buffer, lenp, ppos, - do_proc_douintvec_conv, NULL); -} - -/** - * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure - * @min: pointer to minimum allowable value - * @max: pointer to maximum allowable value - * - * The do_proc_dointvec_minmax_conv_param structure provides the - * minimum and maximum values for doing range checking for those sysctl - * parameters that use the proc_dointvec_minmax() handler. - */ -struct do_proc_dointvec_minmax_conv_param { - int *min; - int *max; -}; - -static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - int tmp, ret; - struct do_proc_dointvec_minmax_conv_param *param = data; - /* - * If writing, first do so via a temporary local int so we can - * bounds-check it before touching *valp. - */ - int *ip = write ? 
&tmp : valp; - - ret = do_proc_dointvec_conv(negp, lvalp, ip, write, data); - if (ret) - return ret; - - if (write) { - if ((param->min && *param->min > tmp) || - (param->max && *param->max < tmp)) - return -EINVAL; - WRITE_ONCE(*valp, tmp); - } - - return 0; + return do_proc_douintvec(table, dir, buffer, lenp, ppos, + do_proc_uint_conv); } /** * proc_dointvec_minmax - read a vector of integers with min/max values * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file + * @dir: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -749,62 +670,20 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, * This routine will ensure the values are within the range specified by * table->extra1 (min) and table->extra2 (max). * - * Returns 0 on success or -EINVAL on write when the range check fails. + * Returns 0 on success or -EINVAL when the range check fails and + * SYSCTL_USER_TO_KERN(dir) == true */ -int proc_dointvec_minmax(const struct ctl_table *table, int write, +int proc_dointvec_minmax(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { - struct do_proc_dointvec_minmax_conv_param param = { - .min = (int *) table->extra1, - .max = (int *) table->extra2, - }; - return do_proc_dointvec(table, write, buffer, lenp, ppos, - do_proc_dointvec_minmax_conv, &param); -} - -/** - * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure - * @min: pointer to minimum allowable value - * @max: pointer to maximum allowable value - * - * The do_proc_douintvec_minmax_conv_param structure provides the - * minimum and maximum values for doing range checking for those sysctl - * parameters that use the proc_douintvec_minmax() handler.
- */ -struct do_proc_douintvec_minmax_conv_param { - unsigned int *min; - unsigned int *max; -}; - -static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, - unsigned int *valp, - int write, void *data) -{ - int ret; - unsigned int tmp; - struct do_proc_douintvec_minmax_conv_param *param = data; - /* write via temporary local uint for bounds-checking */ - unsigned int *up = write ? &tmp : valp; - - ret = do_proc_douintvec_conv(lvalp, up, write, data); - if (ret) - return ret; - - if (write) { - if ((param->min && *param->min > tmp) || - (param->max && *param->max < tmp)) - return -ERANGE; - - WRITE_ONCE(*valp, tmp); - } - - return 0; + return do_proc_dointvec(table, dir, buffer, lenp, ppos, + do_proc_int_conv_minmax); } /** * proc_douintvec_minmax - read a vector of unsigned ints with min/max values * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file + * @dir: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -818,23 +697,20 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, * check for UINT_MAX to avoid having to support wrap around uses from * userspace. * - * Returns 0 on success or -ERANGE on write when the range check fails. 
+ * Returns 0 on success or -ERANGE when range check fails and + * SYSCTL_USER_TO_KERN(dir) == true */ -int proc_douintvec_minmax(const struct ctl_table *table, int write, +int proc_douintvec_minmax(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { - struct do_proc_douintvec_minmax_conv_param param = { - .min = (unsigned int *) table->extra1, - .max = (unsigned int *) table->extra2, - }; - return do_proc_douintvec(table, write, buffer, lenp, ppos, - do_proc_douintvec_minmax_conv, &param); + return do_proc_douintvec(table, dir, buffer, lenp, ppos, + do_proc_uint_conv_minmax); } /** * proc_dou8vec_minmax - read a vector of unsigned chars with min/max values * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file + * @dir: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -846,66 +722,64 @@ int proc_douintvec_minmax(const struct ctl_table *table, int write, * This routine will ensure the values are within the range specified by * table->extra1 (min) and table->extra2 (max). * - * Returns 0 on success or an error on write when the range check fails. + * Returns 0 on success or an error on SYSCTL_USER_TO_KERN(dir) == true + * and the range check fails. */ -int proc_dou8vec_minmax(const struct ctl_table *table, int write, +int proc_dou8vec_minmax(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tmp; unsigned int min = 0, max = 255U, val; u8 *data = table->data; - struct do_proc_douintvec_minmax_conv_param param = { - .min = &min, - .max = &max, - }; int res; /* Do not support arrays yet.
*/ if (table->maxlen != sizeof(u8)) return -EINVAL; - if (table->extra1) - min = *(unsigned int *) table->extra1; - if (table->extra2) - max = *(unsigned int *) table->extra2; - tmp = *table; tmp.maxlen = sizeof(val); tmp.data = &val; + if (!tmp.extra1) + tmp.extra1 = (unsigned int *) &min; + if (!tmp.extra2) + tmp.extra2 = (unsigned int *) &max; + val = READ_ONCE(*data); - res = do_proc_douintvec(&tmp, write, buffer, lenp, ppos, - do_proc_douintvec_minmax_conv, &param); + res = do_proc_douintvec(&tmp, dir, buffer, lenp, ppos, + do_proc_uint_conv_minmax); if (res) return res; - if (write) + if (SYSCTL_USER_TO_KERN(dir)) WRITE_ONCE(*data, val); return 0; } EXPORT_SYMBOL_GPL(proc_dou8vec_minmax); -static int __do_proc_doulongvec_minmax(void *data, - const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos, - unsigned long convmul, unsigned long convdiv) +static int do_proc_doulongvec_minmax(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos, + unsigned long convmul, + unsigned long convdiv) { unsigned long *i, *min, *max; int vleft, first = 1, err = 0; size_t left; char *p; - if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { + if (!table->data || !table->maxlen || !*lenp || + (*ppos && SYSCTL_KERN_TO_USER(dir))) { *lenp = 0; return 0; } - i = data; + i = table->data; min = table->extra1; max = table->extra2; vleft = table->maxlen / sizeof(unsigned long); left = *lenp; - if (write) { + if (SYSCTL_USER_TO_KERN(dir)) { if (proc_first_pos_non_zero_ignore(ppos, table)) goto out; @@ -917,7 +791,7 @@ static int __do_proc_doulongvec_minmax(void *data, for (; left && vleft--; i++, first = 0) { unsigned long val; - if (write) { + if (SYSCTL_USER_TO_KERN(dir)) { bool neg; proc_skip_spaces(&p, &left); @@ -946,11 +820,11 @@ static int __do_proc_doulongvec_minmax(void *data, } } - if (!write && !first && left && !err) + if (SYSCTL_KERN_TO_USER(dir) && !first && left && !err) proc_put_char(&buffer, &left, '\n'); -
if (write && !err) + if (SYSCTL_USER_TO_KERN(dir) && !err) proc_skip_spaces(&p, &left); - if (write && first) + if (SYSCTL_USER_TO_KERN(dir) && first) return err ? : -EINVAL; *lenp -= left; out: @@ -958,18 +832,18 @@ out: return err; } -static int do_proc_doulongvec_minmax(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, - unsigned long convdiv) +int proc_doulongvec_minmax_conv(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos, + unsigned long convmul, unsigned long convdiv) { - return __do_proc_doulongvec_minmax(table->data, table, write, - buffer, lenp, ppos, convmul, convdiv); + return do_proc_doulongvec_minmax(table, dir, buffer, lenp, ppos, + convmul, convdiv); } /** * proc_doulongvec_minmax - read a vector of long integers with min/max values * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file + * @dir: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -982,216 +856,24 @@ static int do_proc_doulongvec_minmax(const struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_doulongvec_minmax(const struct ctl_table *table, int write, +int proc_doulongvec_minmax(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); -} - -/** - * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long - * values from/to the user buffer, treated as an ASCII string. The values - * are treated as milliseconds, and converted to jiffies when they are stored. 
- * - * This routine will ensure the values are within the range specified by - * table->extra1 (min) and table->extra2 (max). - * - * Returns 0 on success. - */ -int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_doulongvec_minmax(table, write, buffer, - lenp, ppos, HZ, 1000l); + return proc_doulongvec_minmax_conv(table, dir, buffer, lenp, ppos, 1l, 1l); } - -static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) +int proc_dointvec_conv(const struct ctl_table *table, int dir, void *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(bool *negp, unsigned long *u_ptr, int *k_ptr, + int dir, const struct ctl_table *table)) { - if (write) { - if (*lvalp > INT_MAX / HZ) - return 1; - if (*negp) - WRITE_ONCE(*valp, -*lvalp * HZ); - else - WRITE_ONCE(*valp, *lvalp * HZ); - } else { - int val = READ_ONCE(*valp); - unsigned long lval; - if (val < 0) { - *negp = true; - lval = -(unsigned long)val; - } else { - *negp = false; - lval = (unsigned long)val; - } - *lvalp = lval / HZ; - } - return 0; -} - -static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - if (write) { - if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ) - return 1; - *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp); - } else { - int val = *valp; - unsigned long lval; - if (val < 0) { - *negp = true; - lval = -(unsigned long)val; - } else { - *negp = false; - lval = (unsigned long)val; - } - *lvalp = jiffies_to_clock_t(lval); - } - return 0; -} - -static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - if (write) { - unsigned long jif = msecs_to_jiffies(*negp ? 
-*lvalp : *lvalp); - - if (jif > INT_MAX) - return 1; - WRITE_ONCE(*valp, (int)jif); - } else { - int val = READ_ONCE(*valp); - unsigned long lval; - if (val < 0) { - *negp = true; - lval = -(unsigned long)val; - } else { - *negp = false; - lval = (unsigned long)val; - } - *lvalp = jiffies_to_msecs(lval); - } - return 0; -} - -static int do_proc_dointvec_ms_jiffies_minmax_conv(bool *negp, unsigned long *lvalp, - int *valp, int write, void *data) -{ - int tmp, ret; - struct do_proc_dointvec_minmax_conv_param *param = data; - /* - * If writing, first do so via a temporary local int so we can - * bounds-check it before touching *valp. - */ - int *ip = write ? &tmp : valp; - - ret = do_proc_dointvec_ms_jiffies_conv(negp, lvalp, ip, write, data); - if (ret) - return ret; - - if (write) { - if ((param->min && *param->min > tmp) || - (param->max && *param->max < tmp)) - return -EINVAL; - *valp = tmp; - } - return 0; -} - -/** - * proc_dointvec_jiffies - read a vector of integers as seconds - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * The values read are assumed to be in seconds, and are converted into - * jiffies. - * - * Returns 0 on success. 
- */ -int proc_dointvec_jiffies(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_dointvec(table,write,buffer,lenp,ppos, - do_proc_dointvec_jiffies_conv,NULL); -} - -int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - struct do_proc_dointvec_minmax_conv_param param = { - .min = (int *) table->extra1, - .max = (int *) table->extra2, - }; - return do_proc_dointvec(table, write, buffer, lenp, ppos, - do_proc_dointvec_ms_jiffies_minmax_conv, ¶m); -} - -/** - * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: pointer to the file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * The values read are assumed to be in 1/USER_HZ seconds, and - * are converted into jiffies. - * - * Returns 0 on success. - */ -int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_dointvec(table, write, buffer, lenp, ppos, - do_proc_dointvec_userhz_jiffies_conv, NULL); -} - -/** - * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: the current position in the file - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * The values read are assumed to be in 1/1000 seconds, and - * are converted into jiffies. - * - * Returns 0 on success. 
- */ -int proc_dointvec_ms_jiffies(const struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos) -{ - return do_proc_dointvec(table, write, buffer, lenp, ppos, - do_proc_dointvec_ms_jiffies_conv, NULL); + return do_proc_dointvec(table, dir, buffer, lenp, ppos, conv); } /** * proc_do_large_bitmap - read/write from/to a large bitmap * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file + * @dir: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -1205,7 +887,7 @@ int proc_dointvec_ms_jiffies(const struct ctl_table *table, int write, void *buf * * Returns 0 on success. */ -int proc_do_large_bitmap(const struct ctl_table *table, int write, +int proc_do_large_bitmap(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { int err = 0; @@ -1215,12 +897,12 @@ int proc_do_large_bitmap(const struct ctl_table *table, int write, unsigned long *tmp_bitmap = NULL; char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; - if (!bitmap || !bitmap_len || !left || (*ppos && !write)) { + if (!bitmap || !bitmap_len || !left || (*ppos && SYSCTL_KERN_TO_USER(dir))) { *lenp = 0; return 0; } - if (write) { + if (SYSCTL_USER_TO_KERN(dir)) { char *p = buffer; size_t skipped = 0; @@ -1321,7 +1003,7 @@ int proc_do_large_bitmap(const struct ctl_table *table, int write, } if (!err) { - if (write) { + if (SYSCTL_USER_TO_KERN(dir)) { if (*ppos) bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); else @@ -1337,85 +1019,70 @@ int proc_do_large_bitmap(const struct ctl_table *table, int write, #else /* CONFIG_PROC_SYSCTL */ -int proc_dostring(const struct ctl_table *table, int write, +int proc_dostring(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dobool(const struct ctl_table *table, int write, +int proc_dobool(const struct ctl_table *table, int dir, void *buffer, 
size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec(const struct ctl_table *table, int write, +int proc_dointvec(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_douintvec(const struct ctl_table *table, int write, +int proc_douintvec(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_minmax(const struct ctl_table *table, int write, +int proc_dointvec_minmax(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_douintvec_minmax(const struct ctl_table *table, int write, +int proc_douintvec_minmax(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dou8vec_minmax(const struct ctl_table *table, int write, +int proc_dou8vec_minmax(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_jiffies(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int write, +int proc_doulongvec_minmax(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_ms_jiffies(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_doulongvec_minmax(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +int proc_doulongvec_minmax_conv(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos, + unsigned long convmul, unsigned long convdiv) { return -ENOSYS; } -int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table 
*table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +int proc_dointvec_conv(const struct ctl_table *table, int dir, void *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(bool *negp, unsigned long *u_ptr, int *k_ptr, + int dir, const struct ctl_table *table)) { return -ENOSYS; } -int proc_do_large_bitmap(const struct ctl_table *table, int write, +int proc_do_large_bitmap(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; @@ -1424,7 +1091,7 @@ int proc_do_large_bitmap(const struct ctl_table *table, int write, #endif /* CONFIG_PROC_SYSCTL */ #if defined(CONFIG_SYSCTL) -int proc_do_static_key(const struct ctl_table *table, int write, +int proc_do_static_key(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { struct static_key *key = (struct static_key *)table->data; @@ -1438,13 +1105,13 @@ int proc_do_static_key(const struct ctl_table *table, int write, .extra2 = SYSCTL_ONE, }; - if (write && !capable(CAP_SYS_ADMIN)) + if (SYSCTL_USER_TO_KERN(dir) && !capable(CAP_SYS_ADMIN)) return -EPERM; mutex_lock(&static_key_mutex); val = static_key_enabled(key); - ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); - if (write && !ret) { + ret = proc_dointvec_minmax(&tmp, dir, buffer, lenp, ppos); + if (SYSCTL_USER_TO_KERN(dir) && !ret) { if (val) static_key_enable(key); else @@ -1514,12 +1181,8 @@ int __init sysctl_init_bases(void) EXPORT_SYMBOL(proc_dobool); EXPORT_SYMBOL(proc_dointvec); EXPORT_SYMBOL(proc_douintvec); -EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); EXPORT_SYMBOL_GPL(proc_douintvec_minmax); -EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); -EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); EXPORT_SYMBOL(proc_doulongvec_minmax); -EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); EXPORT_SYMBOL(proc_do_large_bitmap); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 34eeacac2253..d31a6d40d38d 100644 --- 
a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -99,3 +99,128 @@ void __init register_refined_jiffies(long cycles_per_second) __clocksource_register(&refined_jiffies); } + +#define SYSCTL_CONV_MULT_HZ(val) ((val) * HZ) +#define SYSCTL_CONV_DIV_HZ(val) ((val) / HZ) + +static SYSCTL_USER_TO_KERN_INT_CONV(_hz, SYSCTL_CONV_MULT_HZ) +static SYSCTL_KERN_TO_USER_INT_CONV(_hz, SYSCTL_CONV_DIV_HZ) +static SYSCTL_USER_TO_KERN_INT_CONV(_userhz, clock_t_to_jiffies) +static SYSCTL_KERN_TO_USER_INT_CONV(_userhz, jiffies_to_clock_t) +static SYSCTL_USER_TO_KERN_INT_CONV(_ms, msecs_to_jiffies) +static SYSCTL_KERN_TO_USER_INT_CONV(_ms, jiffies_to_msecs) + +static SYSCTL_INT_CONV_CUSTOM(_jiffies, sysctl_user_to_kern_int_conv_hz, + sysctl_kern_to_user_int_conv_hz, false) +static SYSCTL_INT_CONV_CUSTOM(_userhz_jiffies, + sysctl_user_to_kern_int_conv_userhz, + sysctl_kern_to_user_int_conv_userhz, false) +static SYSCTL_INT_CONV_CUSTOM(_ms_jiffies, sysctl_user_to_kern_int_conv_ms, + sysctl_kern_to_user_int_conv_ms, false) +static SYSCTL_INT_CONV_CUSTOM(_ms_jiffies_minmax, + sysctl_user_to_kern_int_conv_ms, + sysctl_kern_to_user_int_conv_ms, true) + +/** + * proc_dointvec_jiffies - read a vector of integers as seconds + * @table: the sysctl table + * @dir: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in seconds, and are converted into + * jiffies. + * + * Returns 0 on success. 
+ */ +int proc_dointvec_jiffies(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_dointvec_conv(table, dir, buffer, lenp, ppos, + do_proc_int_conv_jiffies); +} +EXPORT_SYMBOL(proc_dointvec_jiffies); + +/** + * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds + * @table: the sysctl table + * @dir: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: pointer to the file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/USER_HZ seconds, and + * are converted into jiffies. + * + * Returns 0 on success. + */ +int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos) +{ + if (SYSCTL_USER_TO_KERN(dir) && USER_HZ < HZ) + return -EINVAL; + return proc_dointvec_conv(table, dir, buffer, lenp, ppos, + do_proc_int_conv_userhz_jiffies); +} +EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); + +/** + * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds + * @table: the sysctl table + * @dir: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: the current position in the file + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/1000 seconds, and + * are converted into jiffies. + * + * Returns 0 on success. 
+ */ +int proc_dointvec_ms_jiffies(const struct ctl_table *table, int dir, void *buffer, + size_t *lenp, loff_t *ppos) +{ + return proc_dointvec_conv(table, dir, buffer, lenp, ppos, + do_proc_int_conv_ms_jiffies); +} +EXPORT_SYMBOL(proc_dointvec_ms_jiffies); + +int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_dointvec_conv(table, dir, buffer, lenp, ppos, + do_proc_int_conv_ms_jiffies_minmax); +} + +/** + * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values + * @table: the sysctl table + * @dir: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long + * values from/to the user buffer, treated as an ASCII string. The values + * are treated as milliseconds, and converted to jiffies when they are stored. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success. 
+ */ +int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_doulongvec_minmax_conv(table, dir, buffer, lenp, ppos, + HZ, 1000l); +} +EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); + diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index cc1afec306b3..f39111830ca3 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -296,6 +296,11 @@ int sched_clock_suspend(void) return 0; } +static int sched_clock_syscore_suspend(void *data) +{ + return sched_clock_suspend(); +} + void sched_clock_resume(void) { struct clock_read_data *rd = &cd.read_data[0]; @@ -305,14 +310,23 @@ void sched_clock_resume(void) rd->read_sched_clock = cd.actual_read_sched_clock; } -static struct syscore_ops sched_clock_ops = { - .suspend = sched_clock_suspend, - .resume = sched_clock_resume, +static void sched_clock_syscore_resume(void *data) +{ + sched_clock_resume(); +} + +static const struct syscore_ops sched_clock_syscore_ops = { + .suspend = sched_clock_syscore_suspend, + .resume = sched_clock_syscore_resume, +}; + +static struct syscore sched_clock_syscore = { + .ops = &sched_clock_syscore_ops, }; static int __init sched_clock_syscore_init(void) { - register_syscore_ops(&sched_clock_ops); + register_syscore(&sched_clock_syscore); return 0; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4790da895203..3ec3daa4acab 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1994,6 +1994,11 @@ void timekeeping_resume(void) timerfd_resume(); } +static void timekeeping_syscore_resume(void *data) +{ + timekeeping_resume(); +} + int timekeeping_suspend(void) { struct timekeeper *tks = &tk_core.shadow_timekeeper; @@ -2061,15 +2066,24 @@ int timekeeping_suspend(void) return 0; } +static int timekeeping_syscore_suspend(void *data) +{ + return timekeeping_suspend(); +} + /* sysfs resume/suspend bits for timekeeping */ -static struct 
syscore_ops timekeeping_syscore_ops = { - .resume = timekeeping_resume, - .suspend = timekeeping_suspend, +static const struct syscore_ops timekeeping_syscore_ops = { + .resume = timekeeping_syscore_resume, + .suspend = timekeeping_syscore_suspend, +}; + +static struct syscore timekeeping_syscore = { + .ops = &timekeeping_syscore_ops, }; static int __init timekeeping_init_ops(void) { - register_syscore_ops(&timekeeping_syscore_ops); + register_syscore(&timekeeping_syscore); return 0; } device_initcall(timekeeping_init_ops); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d2c79da81e4f..bfa2ec46e075 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -80,6 +80,12 @@ config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE If the architecture generates __patchable_function_entries sections but does not want them included in the ftrace locations. +config HAVE_DYNAMIC_FTRACE_WITH_JMP + bool + help + If the architecture supports to replace the __fentry__ with a + "jmp" instruction. + config HAVE_SYSCALL_TRACEPOINTS bool help @@ -330,6 +336,26 @@ config DYNAMIC_FTRACE_WITH_ARGS depends on DYNAMIC_FTRACE depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS +config DYNAMIC_FTRACE_WITH_JMP + def_bool y + depends on DYNAMIC_FTRACE + depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS + depends on HAVE_DYNAMIC_FTRACE_WITH_JMP + +config FUNCTION_SELF_TRACING + bool "Function trace tracing code" + depends on FUNCTION_TRACER + help + Normally all the tracing code is set to notrace, where the function + tracer will ignore all the tracing functions. Sometimes it is useful + for debugging to trace some of the tracing infratructure itself. + Enable this to allow some of the tracing infrastructure to be traced + by the function tracer. Note, this will likely add noise to function + tracing if events and other tracing features are enabled along with + function tracing. + + If unsure, say N. 
+ config FPROBE bool "Kernel Function Probe (fprobe)" depends on HAVE_FUNCTION_GRAPH_FREGS && HAVE_FTRACE_GRAPH_FUNC @@ -575,6 +601,20 @@ config FTRACE_SYSCALLS help Basic tracer to catch the syscall entry and exit events. +config TRACE_SYSCALL_BUF_SIZE_DEFAULT + int "System call user read max size" + range 0 165 + default 63 + depends on FTRACE_SYSCALLS + help + Some system call trace events will record the data from a user + space address that one of the parameters point to. The amount of + data per event is limited. That limit is set by this config and + this config also affects how much user space data perf can read. + + For a tracing instance, this size may be changed by writing into + its syscall_user_buf_size file. + config TRACER_SNAPSHOT bool "Create a snapshot trace buffer" select TRACER_MAX_TRACE diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index dcb4e02afc5f..fc5dcc888e13 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -16,6 +16,23 @@ obj-y += trace_selftest_dynamic.o endif endif +# Allow some files to be function traced +ifdef CONFIG_FUNCTION_SELF_TRACING +CFLAGS_trace_output.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_seq.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_stat.o = $(CC_FLAGS_FTRACE) +CFLAGS_tracing_map.o = $(CC_FLAGS_FTRACE) +CFLAGS_synth_event_gen_test.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_events.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_syscalls.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_events_filter.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_events_trigger.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_events_synth.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_events_hist.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_events_user.o = $(CC_FLAGS_FTRACE) +CFLAGS_trace_dynevent.o = $(CC_FLAGS_FTRACE) +endif + ifdef CONFIG_FTRACE_STARTUP_TEST CFLAGS_trace_kprobe_selftest.o = $(CC_FLAGS_FTRACE) obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe_selftest.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 6941145b5058..d031c8d80be4 100644 --- 
a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -63,13 +63,116 @@ static int blk_probes_ref; static void blk_register_tracepoints(void); static void blk_unregister_tracepoints(void); +static void record_blktrace_event(struct blk_io_trace *t, pid_t pid, int cpu, + sector_t sector, int bytes, u64 what, + dev_t dev, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, int pdu_len) + +{ + /* + * These two are not needed in ftrace as they are in the + * generic trace_entry, filled by tracing_generic_entry_update, + * but for the trace_event->bin() synthesizer benefit we do it + * here too. + */ + t->cpu = cpu; + t->pid = pid; + + t->sector = sector; + t->bytes = bytes; + t->action = lower_32_bits(what); + t->device = dev; + t->error = error; + t->pdu_len = pdu_len + cgid_len; + + if (cgid_len) + memcpy((void *)t + sizeof(*t), &cgid, cgid_len); + if (pdu_len) + memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); +} + +static void record_blktrace_event2(struct blk_io_trace2 *t2, pid_t pid, int cpu, + sector_t sector, int bytes, u64 what, + dev_t dev, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, + int pdu_len) +{ + t2->pid = pid; + t2->cpu = cpu; + + t2->sector = sector; + t2->bytes = bytes; + t2->action = what; + t2->device = dev; + t2->error = error; + t2->pdu_len = pdu_len + cgid_len; + + if (cgid_len) + memcpy((void *)t2 + sizeof(*t2), &cgid, cgid_len); + if (pdu_len) + memcpy((void *)t2 + sizeof(*t2) + cgid_len, pdu_data, pdu_len); +} + +static void relay_blktrace_event1(struct blk_trace *bt, unsigned long sequence, + pid_t pid, int cpu, sector_t sector, int bytes, + u64 what, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, int pdu_len) +{ + struct blk_io_trace *t; + size_t trace_len = sizeof(*t) + pdu_len + cgid_len; + + t = relay_reserve(bt->rchan, trace_len); + if (!t) + return; + + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; + t->sequence = sequence; + t->time = ktime_to_ns(ktime_get()); + + 
record_blktrace_event(t, pid, cpu, sector, bytes, what, bt->dev, error, + cgid, cgid_len, pdu_data, pdu_len); +} + +static void relay_blktrace_event2(struct blk_trace *bt, unsigned long sequence, + pid_t pid, int cpu, sector_t sector, + int bytes, u64 what, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, int pdu_len) +{ + struct blk_io_trace2 *t; + size_t trace_len = sizeof(struct blk_io_trace2) + pdu_len + cgid_len; + + t = relay_reserve(bt->rchan, trace_len); + if (!t) + return; + + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE2_VERSION; + t->sequence = sequence; + t->time = ktime_to_ns(ktime_get()); + + record_blktrace_event2(t, pid, cpu, sector, bytes, what, bt->dev, error, + cgid, cgid_len, pdu_data, pdu_len); +} + +static void relay_blktrace_event(struct blk_trace *bt, unsigned long sequence, + pid_t pid, int cpu, sector_t sector, int bytes, + u64 what, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, int pdu_len) +{ + if (bt->version == 2) + return relay_blktrace_event2(bt, sequence, pid, cpu, sector, + bytes, what, error, cgid, cgid_len, + pdu_data, pdu_len); + return relay_blktrace_event1(bt, sequence, pid, cpu, sector, bytes, + what, error, cgid, cgid_len, pdu_data, + pdu_len); +} + /* * Send out a notify message. */ -static void trace_note(struct blk_trace *bt, pid_t pid, int action, +static void trace_note(struct blk_trace *bt, pid_t pid, u64 action, const void *data, size_t len, u64 cgid) { - struct blk_io_trace *t; struct ring_buffer_event *event = NULL; struct trace_buffer *buffer = NULL; unsigned int trace_ctx = 0; @@ -77,38 +180,30 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ? sizeof(cgid) : 0; + action = lower_32_bits(action | (cgid ? 
__BLK_TN_CGROUP : 0)); if (blk_tracer) { + struct blk_io_trace2 *t; + size_t trace_len = sizeof(*t) + cgid_len + len; + buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + len + cgid_len, - trace_ctx); + trace_len, trace_ctx); if (!event) return; t = ring_buffer_event_data(event); - goto record_it; + record_blktrace_event2(t, pid, cpu, 0, 0, + action, bt->dev, 0, cgid, cgid_len, + (void *)data, len); + trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); + return; } if (!bt->rchan) return; - t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len); - if (t) { - t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; - t->time = ktime_to_ns(ktime_get()); -record_it: - t->device = bt->dev; - t->action = action | (cgid ? __BLK_TN_CGROUP : 0); - t->pid = pid; - t->cpu = cpu; - t->pdu_len = len + cgid_len; - if (cgid_len) - memcpy((void *)t + sizeof(*t), &cgid, cgid_len); - memcpy((void *) t + sizeof(*t) + cgid_len, data, len); - - if (blk_tracer) - trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); - } + relay_blktrace_event(bt, 0, pid, cpu, 0, 0, action, 0, cgid, + cgid_len, (void *)data, len); } /* @@ -182,7 +277,7 @@ void __blk_trace_note_message(struct blk_trace *bt, } EXPORT_SYMBOL_GPL(__blk_trace_note_message); -static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, +static int act_log_check(struct blk_trace *bt, u64 what, sector_t sector, pid_t pid) { if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) @@ -213,13 +308,12 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), * blk_io_trace structure and places it in a per-cpu subbuffer. 
*/ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, - const blk_opf_t opf, u32 what, int error, + const blk_opf_t opf, u64 what, int error, int pdu_len, void *pdu_data, u64 cgid) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; struct trace_buffer *buffer = NULL; - struct blk_io_trace *t; unsigned long flags = 0; unsigned long *sequence; unsigned int trace_ctx = 0; @@ -228,6 +322,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ? sizeof(cgid) : 0; const enum req_op op = opf & REQ_OP_MASK; + size_t trace_len; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; @@ -238,10 +333,47 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= MASK_TC_BIT(opf, META); what |= MASK_TC_BIT(opf, PREFLUSH); what |= MASK_TC_BIT(opf, FUA); - if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE) + + switch (op) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: what |= BLK_TC_ACT(BLK_TC_DISCARD); - if (op == REQ_OP_FLUSH) + break; + case REQ_OP_FLUSH: what |= BLK_TC_ACT(BLK_TC_FLUSH); + break; + case REQ_OP_ZONE_APPEND: + what |= BLK_TC_ACT(BLK_TC_ZONE_APPEND); + break; + case REQ_OP_ZONE_RESET: + what |= BLK_TC_ACT(BLK_TC_ZONE_RESET); + break; + case REQ_OP_ZONE_RESET_ALL: + what |= BLK_TC_ACT(BLK_TC_ZONE_RESET_ALL); + break; + case REQ_OP_ZONE_FINISH: + what |= BLK_TC_ACT(BLK_TC_ZONE_FINISH); + break; + case REQ_OP_ZONE_OPEN: + what |= BLK_TC_ACT(BLK_TC_ZONE_OPEN); + break; + case REQ_OP_ZONE_CLOSE: + what |= BLK_TC_ACT(BLK_TC_ZONE_CLOSE); + break; + case REQ_OP_WRITE_ZEROES: + what |= BLK_TC_ACT(BLK_TC_WRITE_ZEROES); + break; + default: + break; + } + + /* Drop trace events for zone operations with blktrace v1 */ + if (bt->version == 1 && (what >> BLK_TC_SHIFT) > BLK_TC_END_V1) { + pr_debug_ratelimited("blktrace v1 cannot trace zone operation 0x%llx\n", + (unsigned long 
long)what); + return; + } + if (cgid) what |= __BLK_TA_CGROUP; @@ -255,13 +387,68 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); + switch (bt->version) { + case 1: + trace_len = sizeof(struct blk_io_trace); + break; + case 2: + default: + /* + * ftrace always uses v2 (blk_io_trace2) format. + * + * For sysfs-enabled tracing path (enabled via + * /sys/block/DEV/trace/enable), blk_trace_setup_queue() + * never initializes bt->version, leaving it 0 from + * kzalloc(). We must handle version==0 safely here. + * + * Fall through to default to ensure we never hit the + * old bug where default set trace_len=0, causing + * buffer underflow and memory corruption. + * + * Always use v2 format for ftrace and normalize + * bt->version to 2 when uninitialized. + */ + trace_len = sizeof(struct blk_io_trace2); + if (bt->version == 0) + bt->version = 2; + break; + } + trace_len += pdu_len + cgid_len; event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + pdu_len + cgid_len, - trace_ctx); + trace_len, trace_ctx); if (!event) return; - t = ring_buffer_event_data(event); - goto record_it; + + switch (bt->version) { + case 1: + record_blktrace_event(ring_buffer_event_data(event), + pid, cpu, sector, bytes, + what, bt->dev, error, cgid, cgid_len, + pdu_data, pdu_len); + break; + case 2: + default: + /* + * Use v2 recording function (record_blktrace_event2) + * which writes blk_io_trace2 structure with correct + * field layout: + * - 32-bit pid at offset 28 + * - 64-bit action at offset 32 + * + * Fall through to default handles version==0 case + * (from sysfs path), ensuring we always use correct + * v2 recording function to match the v2 buffer + * allocated above. 
+ */ + record_blktrace_event2(ring_buffer_event_data(event), + pid, cpu, sector, bytes, + what, bt->dev, error, cgid, cgid_len, + pdu_data, pdu_len); + break; + } + + trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); + return; } if (unlikely(tsk->btrace_seq != blktrace_seq)) @@ -273,41 +460,10 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, * from coming in and stepping on our toes. */ local_irq_save(flags); - t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len); - if (t) { - sequence = per_cpu_ptr(bt->sequence, cpu); - - t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; - t->sequence = ++(*sequence); - t->time = ktime_to_ns(ktime_get()); -record_it: - /* - * These two are not needed in ftrace as they are in the - * generic trace_entry, filled by tracing_generic_entry_update, - * but for the trace_event->bin() synthesizer benefit we do it - * here too. - */ - t->cpu = cpu; - t->pid = pid; - - t->sector = sector; - t->bytes = bytes; - t->action = what; - t->device = bt->dev; - t->error = error; - t->pdu_len = pdu_len + cgid_len; - - if (cgid_len) - memcpy((void *)t + sizeof(*t), &cgid, cgid_len); - if (pdu_len) - memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); - - if (blk_tracer) { - trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); - return; - } - } - + sequence = per_cpu_ptr(bt->sequence, cpu); + (*sequence)++; + relay_blktrace_event(bt, *sequence, pid, cpu, sector, bytes, + what, error, cgid, cgid_len, pdu_data, pdu_len); local_irq_restore(flags); } @@ -494,9 +650,10 @@ static void blk_trace_setup_lba(struct blk_trace *bt, /* * Setup everything required to start tracing */ -static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, - struct blk_user_trace_setup *buts) +static struct blk_trace *blk_trace_setup_prepare(struct request_queue *q, + char *name, dev_t dev, + u32 buf_size, u32 buf_nr, + struct block_device *bdev) { struct 
blk_trace *bt = NULL; struct dentry *dir = NULL; @@ -504,31 +661,19 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, lockdep_assert_held(&q->debugfs_mutex); - if (!buts->buf_size || !buts->buf_nr) - return -EINVAL; - - strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE); - - /* - * some device names have larger paths - convert the slashes - * to underscores for this to work as expected - */ - strreplace(buts->name, '/', '_'); - /* * bdev can be NULL, as with scsi-generic, this is a helpful as * we can be. */ if (rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex))) { - pr_warn("Concurrent blktraces are not allowed on %s\n", - buts->name); - return -EBUSY; + pr_warn("Concurrent blktraces are not allowed on %s\n", name); + return ERR_PTR(-EBUSY); } bt = kzalloc(sizeof(*bt), GFP_KERNEL); if (!bt) - return -ENOMEM; + return ERR_PTR(-ENOMEM); ret = -ENOMEM; bt->sequence = alloc_percpu(unsigned long); @@ -548,7 +693,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (bdev && !bdev_is_partition(bdev)) dir = q->debugfs_dir; else - bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); + bt->dir = dir = debugfs_create_dir(name, blk_debugfs_root); /* * As blktrace relies on debugfs for its interface the debugfs directory @@ -556,8 +701,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, * files or directories. 
*/ if (IS_ERR_OR_NULL(dir)) { - pr_warn("debugfs_dir not present for %s so skipping\n", - buts->name); + pr_warn("debugfs_dir not present for %s so skipping\n", name); ret = -ENOENT; goto err; } @@ -569,17 +713,40 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops); debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); - bt->rchan = relay_open("trace", dir, buts->buf_size, - buts->buf_nr, &blk_relay_callbacks, bt); + bt->rchan = relay_open("trace", dir, buf_size, buf_nr, + &blk_relay_callbacks, bt); if (!bt->rchan) goto err; + blk_trace_setup_lba(bt, bdev); + + return bt; + +err: + blk_trace_free(q, bt); + + return ERR_PTR(ret); +} + +static void blk_trace_setup_finalize(struct request_queue *q, + char *name, int version, + struct blk_trace *bt, + struct blk_user_trace_setup2 *buts) + +{ + strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE2); + + /* + * some device names have larger paths - convert the slashes + * to underscores for this to work as expected + */ + strreplace(buts->name, '/', '_'); + + bt->version = version; bt->act_mask = buts->act_mask; if (!bt->act_mask) bt->act_mask = (u16) -1; - blk_trace_setup_lba(bt, bdev); - /* overwrite with user settings */ if (buts->start_lba) bt->start_lba = buts->start_lba; @@ -591,30 +758,43 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, rcu_assign_pointer(q->blk_trace, bt); get_probe_ref(); - - ret = 0; -err: - if (ret) - blk_trace_free(q, bt); - return ret; } int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, char __user *arg) { + struct blk_user_trace_setup2 buts2; struct blk_user_trace_setup buts; + struct blk_trace *bt; int ret; ret = copy_from_user(&buts, arg, sizeof(buts)); if (ret) return -EFAULT; + if (!buts.buf_size || !buts.buf_nr) + return -EINVAL; + + buts2 = (struct blk_user_trace_setup2) { + .act_mask = buts.act_mask, + .buf_size = 
buts.buf_size, + .buf_nr = buts.buf_nr, + .start_lba = buts.start_lba, + .end_lba = buts.end_lba, + .pid = buts.pid, + }; + mutex_lock(&q->debugfs_mutex); - ret = do_blk_trace_setup(q, name, dev, bdev, &buts); + bt = blk_trace_setup_prepare(q, name, dev, buts.buf_size, buts.buf_nr, + bdev); + if (IS_ERR(bt)) { + mutex_unlock(&q->debugfs_mutex); + return PTR_ERR(bt); + } + blk_trace_setup_finalize(q, name, 1, bt, &buts2); + strcpy(buts.name, buts2.name); mutex_unlock(&q->debugfs_mutex); - if (ret) - return ret; if (copy_to_user(arg, &buts, sizeof(buts))) { blk_trace_remove(q); @@ -624,19 +804,54 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, } EXPORT_SYMBOL_GPL(blk_trace_setup); +static int blk_trace_setup2(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, char __user *arg) +{ + struct blk_user_trace_setup2 buts2; + struct blk_trace *bt; + + if (copy_from_user(&buts2, arg, sizeof(buts2))) + return -EFAULT; + + if (!buts2.buf_size || !buts2.buf_nr) + return -EINVAL; + + if (buts2.flags != 0) + return -EINVAL; + + mutex_lock(&q->debugfs_mutex); + bt = blk_trace_setup_prepare(q, name, dev, buts2.buf_size, buts2.buf_nr, + bdev); + if (IS_ERR(bt)) { + mutex_unlock(&q->debugfs_mutex); + return PTR_ERR(bt); + } + blk_trace_setup_finalize(q, name, 2, bt, &buts2); + mutex_unlock(&q->debugfs_mutex); + + if (copy_to_user(arg, &buts2, sizeof(buts2))) { + blk_trace_remove(q); + return -EFAULT; + } + return 0; +} + #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) static int compat_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, char __user *arg) { - struct blk_user_trace_setup buts; + struct blk_user_trace_setup2 buts2; struct compat_blk_user_trace_setup cbuts; - int ret; + struct blk_trace *bt; if (copy_from_user(&cbuts, arg, sizeof(cbuts))) return -EFAULT; - buts = (struct blk_user_trace_setup) { + if (!cbuts.buf_size || !cbuts.buf_nr) + return -EINVAL; + + buts2 = (struct 
blk_user_trace_setup2) { .act_mask = cbuts.act_mask, .buf_size = cbuts.buf_size, .buf_nr = cbuts.buf_nr, @@ -646,12 +861,16 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, }; mutex_lock(&q->debugfs_mutex); - ret = do_blk_trace_setup(q, name, dev, bdev, &buts); + bt = blk_trace_setup_prepare(q, name, dev, buts2.buf_size, buts2.buf_nr, + bdev); + if (IS_ERR(bt)) { + mutex_unlock(&q->debugfs_mutex); + return PTR_ERR(bt); + } + blk_trace_setup_finalize(q, name, 1, bt, &buts2); mutex_unlock(&q->debugfs_mutex); - if (ret) - return ret; - if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { + if (copy_to_user(arg, &buts2.name, ARRAY_SIZE(buts2.name))) { blk_trace_remove(q); return -EFAULT; } @@ -707,6 +926,10 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) char b[BDEVNAME_SIZE]; switch (cmd) { + case BLKTRACESETUP2: + snprintf(b, sizeof(b), "%pg", bdev); + ret = blk_trace_setup2(q, b, bdev->bd_dev, bdev, arg); + break; case BLKTRACESETUP: snprintf(b, sizeof(b), "%pg", bdev); ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); @@ -794,7 +1017,7 @@ blk_trace_request_get_cgid(struct request *rq) * **/ static void blk_add_trace_rq(struct request *rq, blk_status_t error, - unsigned int nr_bytes, u32 what, u64 cgid) + unsigned int nr_bytes, u64 what, u64 cgid) { struct blk_trace *bt; @@ -846,6 +1069,22 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, blk_trace_request_get_cgid(rq)); } +static void blk_add_trace_zone_update_request(void *ignore, struct request *rq) +{ + struct blk_trace *bt; + + rcu_read_lock(); + bt = rcu_dereference(rq->q->blk_trace); + if (likely(!bt) || bt->version < 2) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ZONE_APPEND, + blk_trace_request_get_cgid(rq)); +} + /** * blk_add_trace_bio - Add a trace for a bio oriented action * @q: queue the io is for @@ -858,7 +1097,7 @@ static void 
blk_add_trace_rq_complete(void *ignore, struct request *rq, * **/ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, - u32 what, int error) + u64 what, int error) { struct blk_trace *bt; @@ -924,7 +1163,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(depth); - u32 what; + u64 what; if (explicit) what = BLK_TA_UNPLUG_IO; @@ -936,6 +1175,37 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, rcu_read_unlock(); } +static void blk_add_trace_zone_plug(void *ignore, struct request_queue *q, + unsigned int zno, sector_t sector, + unsigned int sectors) +{ + struct blk_trace *bt; + + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (bt && bt->version >= 2) + __blk_add_trace(bt, sector, sectors << SECTOR_SHIFT, 0, + BLK_TA_ZONE_PLUG, 0, 0, NULL, 0); + rcu_read_unlock(); + + return; +} + +static void blk_add_trace_zone_unplug(void *ignore, struct request_queue *q, + unsigned int zno, sector_t sector, + unsigned int sectors) +{ + struct blk_trace *bt; + + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (bt && bt->version >= 2) + __blk_add_trace(bt, sector, sectors << SECTOR_SHIFT, 0, + BLK_TA_ZONE_UNPLUG, 0, 0, NULL, 0); + rcu_read_unlock(); + return; +} + static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) { struct request_queue *q = bio->bi_bdev->bd_disk->queue; @@ -1076,6 +1346,15 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_getrq(blk_add_trace_getrq, NULL); WARN_ON(ret); + ret = register_trace_blk_zone_append_update_request_bio( + blk_add_trace_zone_update_request, NULL); + WARN_ON(ret); + ret = register_trace_disk_zone_wplug_add_bio(blk_add_trace_zone_plug, + NULL); + WARN_ON(ret); + ret = register_trace_blk_zone_wplug_bio(blk_add_trace_zone_unplug, + NULL); + WARN_ON(ret); ret = register_trace_block_plug(blk_add_trace_plug, NULL); 
WARN_ON(ret); ret = register_trace_block_unplug(blk_add_trace_unplug, NULL); @@ -1095,6 +1374,10 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_split(blk_add_trace_split, NULL); unregister_trace_block_unplug(blk_add_trace_unplug, NULL); unregister_trace_block_plug(blk_add_trace_plug, NULL); + unregister_trace_blk_zone_wplug_bio(blk_add_trace_zone_unplug, NULL); + unregister_trace_disk_zone_wplug_add_bio(blk_add_trace_zone_plug, NULL); + unregister_trace_blk_zone_append_update_request_bio( + blk_add_trace_zone_update_request, NULL); unregister_trace_block_getrq(blk_add_trace_getrq, NULL); unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); @@ -1113,7 +1396,7 @@ static void blk_unregister_tracepoints(void) * struct blk_io_tracer formatting routines */ -static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) +static void fill_rwbs(char *rwbs, const struct blk_io_trace2 *t) { int i = 0; int tc = t->action >> BLK_TC_SHIFT; @@ -1128,7 +1411,10 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) if (tc & BLK_TC_DISCARD) rwbs[i++] = 'D'; - else if (tc & BLK_TC_WRITE) + else if (tc & BLK_TC_WRITE_ZEROES) { + rwbs[i++] = 'W'; + rwbs[i++] = 'Z'; + } else if (tc & BLK_TC_WRITE) rwbs[i++] = 'W'; else if (t->bytes) rwbs[i++] = 'R'; @@ -1148,9 +1434,9 @@ out: } static inline -const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) +const struct blk_io_trace2 *te_blk_io_trace(const struct trace_entry *ent) { - return (const struct blk_io_trace *)ent; + return (const struct blk_io_trace2 *)ent; } static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg) @@ -1209,7 +1495,7 @@ static void blk_log_action_classic(struct trace_iterator *iter, const char *act, unsigned long long ts = iter->ts; unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); unsigned secs = (unsigned long)ts; - const struct blk_io_trace *t = 
te_blk_io_trace(iter->ent); + const struct blk_io_trace2 *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); @@ -1223,7 +1509,7 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, bool has_cg) { char rwbs[RWBS_LEN]; - const struct blk_io_trace *t = te_blk_io_trace(iter->ent); + const struct blk_io_trace2 *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); if (has_cg) { @@ -1444,7 +1730,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, { struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; - const struct blk_io_trace *t; + const struct blk_io_trace2 *t; u16 what; bool long_act; blk_log_action_t *log_action; @@ -1452,7 +1738,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, t = te_blk_io_trace(iter->ent); what = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP; - long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE); + long_act = !!(tr->trace_flags & TRACE_ITER(VERBOSE)); log_action = classic ? 
&blk_log_action_classic : &blk_log_action; has_cg = t->action & __BLK_TA_CGROUP; @@ -1481,8 +1767,8 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, static void blk_trace_synthesize_old_trace(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; - struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; - const int offset = offsetof(struct blk_io_trace, sector); + struct blk_io_trace2 *t = (struct blk_io_trace2 *)iter->ent; + const int offset = offsetof(struct blk_io_trace2, sector); struct blk_io_trace old = { .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, .time = iter->ts, @@ -1517,9 +1803,9 @@ blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) /* don't output context-info for blk_classic output */ if (bit == TRACE_BLK_OPT_CLASSIC) { if (set) - tr->trace_flags &= ~TRACE_ITER_CONTEXT_INFO; + tr->trace_flags &= ~TRACE_ITER(CONTEXT_INFO); else - tr->trace_flags |= TRACE_ITER_CONTEXT_INFO; + tr->trace_flags |= TRACE_ITER(CONTEXT_INFO); } return 0; } @@ -1559,6 +1845,10 @@ static int __init init_blk_tracer(void) return 1; } + BUILD_BUG_ON(__alignof__(struct blk_user_trace_setup2) % + __alignof__(long)); + BUILD_BUG_ON(__alignof__(struct blk_io_trace2) % __alignof__(long)); + return 0; } @@ -1667,6 +1957,7 @@ static const struct { { BLK_TC_DISCARD, "discard" }, { BLK_TC_DRV_DATA, "drv_data" }, { BLK_TC_FUA, "fua" }, + { BLK_TC_WRITE_ZEROES, "write-zeroes" }, }; static int blk_trace_str2mask(const char *str) @@ -1880,6 +2171,10 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf) rwbs[i++] = 'Z'; rwbs[i++] = 'C'; break; + case REQ_OP_WRITE_ZEROES: + rwbs[i++] = 'W'; + rwbs[i++] = 'Z'; + break; default: rwbs[i++] = 'N'; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4f87c16d915a..d57727abaade 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2529,7 +2529,7 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) return run_ctx->entry_ip; } 
-static int +static __always_inline int kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, unsigned long entry_ip, struct ftrace_regs *fregs, bool is_return, void *data) @@ -3372,13 +3372,13 @@ typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struc * direct calls into all the specific callback implementations * (copy_user_data_sleepable, copy_user_data_nofault, and so on) */ -static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 doff, u32 size, +static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u64 doff, u64 size, const void *unsafe_src, copy_fn_t str_copy_fn, struct task_struct *tsk) { struct bpf_dynptr_kern *dst; - u32 chunk_sz, off; + u64 chunk_sz, off; void *dst_slice; int cnt, err; char buf[256]; @@ -3392,7 +3392,7 @@ static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 do return -E2BIG; for (off = 0; off < size; off += chunk_sz - 1) { - chunk_sz = min_t(u32, sizeof(buf), size - off); + chunk_sz = min_t(u64, sizeof(buf), size - off); /* Expect str_copy_fn to return count of copied bytes, including * zero terminator. Next iteration increment off by chunk_sz - 1 to * overwrite NUL. 
@@ -3409,14 +3409,14 @@ static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 do return off; } -static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32 doff, - u32 size, const void *unsafe_src, +static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u64 doff, + u64 size, const void *unsafe_src, copy_fn_t copy_fn, struct task_struct *tsk) { struct bpf_dynptr_kern *dst; void *dst_slice; char buf[256]; - u32 off, chunk_sz; + u64 off, chunk_sz; int err; dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size); @@ -3428,7 +3428,7 @@ static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32 return -E2BIG; for (off = 0; off < size; off += chunk_sz) { - chunk_sz = min_t(u32, sizeof(buf), size - off); + chunk_sz = min_t(u64, sizeof(buf), size - off); err = copy_fn(buf, unsafe_src + off, chunk_sz, tsk); if (err) return err; @@ -3514,58 +3514,58 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid return bpf_send_signal_common(sig, type, task, value); } -__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign) +__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_data_nofault, NULL); } -__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void *unsafe_ptr__ign) +__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign, copy_kernel_data_nofault, NULL); } -__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign) +__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 
size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_str_nofault, NULL); } -__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void *unsafe_ptr__ign) +__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign, copy_kernel_str_nofault, NULL); } -__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign) +__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_data_sleepable, NULL); } -__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign) +__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_str_sleepable, NULL); } -__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign, +__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_data_sleepable, tsk); } -__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign, +__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, 
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 484ad7a18463..cc48d16be43e 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -163,7 +163,7 @@ enum { #define RET_STACK(t, offset) ((struct ftrace_ret_stack *)(&(t)->ret_stack[offset])) /* - * Each fgraph_ops has a reservered unsigned long at the end (top) of the + * Each fgraph_ops has a reserved unsigned long at the end (top) of the * ret_stack to store task specific state. */ #define SHADOW_STACK_TASK_VARS(ret_stack) \ @@ -498,9 +498,6 @@ found: return get_data_type_data(current, offset); } -/* Both enabled by default (can be cleared by function_graph tracer flags */ -bool fgraph_sleep_time = true; - #ifdef CONFIG_DYNAMIC_FTRACE /* * archs can override this function if they must do something @@ -1019,15 +1016,11 @@ void fgraph_init_ops(struct ftrace_ops *dst_ops, mutex_init(&dst_ops->local_hash.regex_lock); INIT_LIST_HEAD(&dst_ops->subop_list); dst_ops->flags |= FTRACE_OPS_FL_INITIALIZED; + dst_ops->private = src_ops->private; } #endif } -void ftrace_graph_sleep_time_control(bool enable) -{ - fgraph_sleep_time = enable; -} - /* * Simply points to ftrace_stub, but with the proper protocol. * Defined by the linker script in linux/vmlinux.lds.h @@ -1098,7 +1091,7 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt, * Does the user want to count the time a function was asleep. * If so, do not update the time stamps. 
*/ - if (fgraph_sleep_time) + if (!fgraph_no_sleep_time) return; timestamp = trace_clock_local(); @@ -1376,6 +1369,13 @@ int register_ftrace_graph(struct fgraph_ops *gops) ftrace_graph_active++; + /* Always save the function, and reset at unregistering */ + gops->saved_func = gops->entryfunc; +#ifdef CONFIG_DYNAMIC_FTRACE + if (ftrace_pids_enabled(&gops->ops)) + gops->entryfunc = fgraph_pid_func; +#endif + if (ftrace_graph_active == 2) ftrace_graph_disable_direct(true); @@ -1395,8 +1395,6 @@ int register_ftrace_graph(struct fgraph_ops *gops) } else { init_task_vars(gops->idx); } - /* Always save the function, and reset at unregistering */ - gops->saved_func = gops->entryfunc; gops->ops.flags |= FTRACE_OPS_FL_GRAPH; diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 5a807d62e76d..1188eefef07c 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -10,6 +10,7 @@ #include <linux/kprobes.h> #include <linux/list.h> #include <linux/mutex.h> +#include <linux/rhashtable.h> #include <linux/slab.h> #include <linux/sort.h> @@ -29,7 +30,7 @@ * fprobe_table: hold 'fprobe_hlist::hlist' for checking the fprobe still * exists. The key is the address of fprobe instance. * fprobe_ip_table: hold 'fprobe_hlist::array[*]' for searching the fprobe - * instance related to the funciton address. The key is the ftrace IP + * instance related to the function address. The key is the ftrace IP * address. * * When unregistering the fprobe, fprobe_hlist::fp and fprobe_hlist::array[*].fp @@ -41,60 +42,68 @@ * - RCU hlist traversal under disabling preempt */ static struct hlist_head fprobe_table[FPROBE_TABLE_SIZE]; -static struct hlist_head fprobe_ip_table[FPROBE_IP_TABLE_SIZE]; +static struct rhltable fprobe_ip_table; static DEFINE_MUTEX(fprobe_mutex); +static struct fgraph_ops fprobe_graph_ops; -/* - * Find first fprobe in the hlist. 
It will be iterated twice in the entry - * probe, once for correcting the total required size, the second time is - * calling back the user handlers. - * Thus the hlist in the fprobe_table must be sorted and new probe needs to - * be added *before* the first fprobe. - */ -static struct fprobe_hlist_node *find_first_fprobe_node(unsigned long ip) +static u32 fprobe_node_hashfn(const void *data, u32 len, u32 seed) { - struct fprobe_hlist_node *node; - struct hlist_head *head; + return hash_ptr(*(unsigned long **)data, 32); +} - head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)]; - hlist_for_each_entry_rcu(node, head, hlist, - lockdep_is_held(&fprobe_mutex)) { - if (node->addr == ip) - return node; - } - return NULL; +static int fprobe_node_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + unsigned long key = *(unsigned long *)arg->key; + const struct fprobe_hlist_node *n = ptr; + + return n->addr != key; } -NOKPROBE_SYMBOL(find_first_fprobe_node); -/* Node insertion and deletion requires the fprobe_mutex */ -static void insert_fprobe_node(struct fprobe_hlist_node *node) +static u32 fprobe_node_obj_hashfn(const void *data, u32 len, u32 seed) { - unsigned long ip = node->addr; - struct fprobe_hlist_node *next; - struct hlist_head *head; + const struct fprobe_hlist_node *n = data; + + return hash_ptr((void *)n->addr, 32); +} + +static const struct rhashtable_params fprobe_rht_params = { + .head_offset = offsetof(struct fprobe_hlist_node, hlist), + .key_offset = offsetof(struct fprobe_hlist_node, addr), + .key_len = sizeof_field(struct fprobe_hlist_node, addr), + .hashfn = fprobe_node_hashfn, + .obj_hashfn = fprobe_node_obj_hashfn, + .obj_cmpfn = fprobe_node_cmp, + .automatic_shrinking = true, +}; +/* Node insertion and deletion requires the fprobe_mutex */ +static int insert_fprobe_node(struct fprobe_hlist_node *node) +{ lockdep_assert_held(&fprobe_mutex); - next = find_first_fprobe_node(ip); - if (next) { - 
hlist_add_before_rcu(&node->hlist, &next->hlist); - return; - } - head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)]; - hlist_add_head_rcu(&node->hlist, head); + return rhltable_insert(&fprobe_ip_table, &node->hlist, fprobe_rht_params); } /* Return true if there are synonims */ static bool delete_fprobe_node(struct fprobe_hlist_node *node) { lockdep_assert_held(&fprobe_mutex); + bool ret; /* Avoid double deleting */ if (READ_ONCE(node->fp) != NULL) { WRITE_ONCE(node->fp, NULL); - hlist_del_rcu(&node->hlist); + rhltable_remove(&fprobe_ip_table, &node->hlist, + fprobe_rht_params); } - return !!find_first_fprobe_node(node->addr); + + rcu_read_lock(); + ret = !!rhltable_lookup(&fprobe_ip_table, &node->addr, + fprobe_rht_params); + rcu_read_unlock(); + + return ret; } /* Check existence of the fprobe */ @@ -246,12 +255,128 @@ static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent return ret; } -static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, - struct ftrace_regs *fregs) +#if defined(CONFIG_DYNAMIC_FTRACE_WITH_ARGS) || defined(CONFIG_DYNAMIC_FTRACE_WITH_REGS) +/* ftrace_ops callback, this processes fprobes which have only entry_handler. */ +static void fprobe_ftrace_entry(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) +{ + struct fprobe_hlist_node *node; + struct rhlist_head *head, *pos; + struct fprobe *fp; + int bit; + + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + + /* + * ftrace_test_recursion_trylock() disables preemption, but + * rhltable_lookup() checks whether rcu_read_lcok is held. + * So we take rcu_read_lock() here. 
+ */ + rcu_read_lock(); + head = rhltable_lookup(&fprobe_ip_table, &ip, fprobe_rht_params); + + rhl_for_each_entry_rcu(node, pos, head, hlist) { + if (node->addr != ip) + break; + fp = READ_ONCE(node->fp); + if (unlikely(!fp || fprobe_disabled(fp) || fp->exit_handler)) + continue; + + if (fprobe_shared_with_kprobes(fp)) + __fprobe_kprobe_handler(ip, parent_ip, fp, fregs, NULL); + else + __fprobe_handler(ip, parent_ip, fp, fregs, NULL); + } + rcu_read_unlock(); + ftrace_test_recursion_unlock(bit); +} +NOKPROBE_SYMBOL(fprobe_ftrace_entry); + +static struct ftrace_ops fprobe_ftrace_ops = { + .func = fprobe_ftrace_entry, + .flags = FTRACE_OPS_FL_SAVE_ARGS, +}; +static int fprobe_ftrace_active; + +static int fprobe_ftrace_add_ips(unsigned long *addrs, int num) +{ + int ret; + + lockdep_assert_held(&fprobe_mutex); + + ret = ftrace_set_filter_ips(&fprobe_ftrace_ops, addrs, num, 0, 0); + if (ret) + return ret; + + if (!fprobe_ftrace_active) { + ret = register_ftrace_function(&fprobe_ftrace_ops); + if (ret) { + ftrace_free_filter(&fprobe_ftrace_ops); + return ret; + } + } + fprobe_ftrace_active++; + return 0; +} + +static void fprobe_ftrace_remove_ips(unsigned long *addrs, int num) +{ + lockdep_assert_held(&fprobe_mutex); + + fprobe_ftrace_active--; + if (!fprobe_ftrace_active) + unregister_ftrace_function(&fprobe_ftrace_ops); + if (num) + ftrace_set_filter_ips(&fprobe_ftrace_ops, addrs, num, 1, 0); +} + +static bool fprobe_is_ftrace(struct fprobe *fp) +{ + return !fp->exit_handler; +} + +#ifdef CONFIG_MODULES +static void fprobe_set_ips(unsigned long *ips, unsigned int cnt, int remove, + int reset) +{ + ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, remove, reset); + ftrace_set_filter_ips(&fprobe_ftrace_ops, ips, cnt, remove, reset); +} +#endif +#else +static int fprobe_ftrace_add_ips(unsigned long *addrs, int num) +{ + return -ENOENT; +} + +static void fprobe_ftrace_remove_ips(unsigned long *addrs, int num) +{ +} + +static bool fprobe_is_ftrace(struct fprobe *fp) 
+{ + return false; +} + +#ifdef CONFIG_MODULES +static void fprobe_set_ips(unsigned long *ips, unsigned int cnt, int remove, + int reset) +{ + ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, remove, reset); +} +#endif +#endif /* !CONFIG_DYNAMIC_FTRACE_WITH_ARGS && !CONFIG_DYNAMIC_FTRACE_WITH_REGS */ + +/* fgraph_ops callback, this processes fprobes which have exit_handler. */ +static int fprobe_fgraph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, + struct ftrace_regs *fregs) { - struct fprobe_hlist_node *node, *first; unsigned long *fgraph_data = NULL; unsigned long func = trace->func; + struct fprobe_hlist_node *node; + struct rhlist_head *head, *pos; unsigned long ret_ip; int reserved_words; struct fprobe *fp; @@ -260,14 +385,12 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, if (WARN_ON_ONCE(!fregs)) return 0; - first = node = find_first_fprobe_node(func); - if (unlikely(!first)) - return 0; - + guard(rcu)(); + head = rhltable_lookup(&fprobe_ip_table, &func, fprobe_rht_params); reserved_words = 0; - hlist_for_each_entry_from_rcu(node, hlist) { + rhl_for_each_entry_rcu(node, pos, head, hlist) { if (node->addr != func) - break; + continue; fp = READ_ONCE(node->fp); if (!fp || !fp->exit_handler) continue; @@ -278,15 +401,14 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, reserved_words += FPROBE_HEADER_SIZE_IN_LONG + SIZE_IN_LONG(fp->entry_data_size); } - node = first; if (reserved_words) { fgraph_data = fgraph_reserve_data(gops->idx, reserved_words * sizeof(long)); if (unlikely(!fgraph_data)) { - hlist_for_each_entry_from_rcu(node, hlist) { + rhl_for_each_entry_rcu(node, pos, head, hlist) { if (node->addr != func) - break; + continue; fp = READ_ONCE(node->fp); - if (fp && !fprobe_disabled(fp)) + if (fp && !fprobe_disabled(fp) && !fprobe_is_ftrace(fp)) fp->nmissed++; } return 0; @@ -299,14 +421,14 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct 
fgraph_ops *gops, */ ret_ip = ftrace_regs_get_return_address(fregs); used = 0; - hlist_for_each_entry_from_rcu(node, hlist) { + rhl_for_each_entry_rcu(node, pos, head, hlist) { int data_size; void *data; if (node->addr != func) - break; + continue; fp = READ_ONCE(node->fp); - if (!fp || fprobe_disabled(fp)) + if (unlikely(!fp || fprobe_disabled(fp) || fprobe_is_ftrace(fp))) continue; data_size = fp->entry_data_size; @@ -334,7 +456,7 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, /* If any exit_handler is set, data must be used. */ return used != 0; } -NOKPROBE_SYMBOL(fprobe_entry); +NOKPROBE_SYMBOL(fprobe_fgraph_entry); static void fprobe_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops, @@ -373,7 +495,7 @@ static void fprobe_return(struct ftrace_graph_ret *trace, NOKPROBE_SYMBOL(fprobe_return); static struct fgraph_ops fprobe_graph_ops = { - .entryfunc = fprobe_entry, + .entryfunc = fprobe_fgraph_entry, .retfunc = fprobe_return, }; static int fprobe_graph_active; @@ -449,25 +571,18 @@ static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long ad return 0; } -static void fprobe_remove_node_in_module(struct module *mod, struct hlist_head *head, - struct fprobe_addr_list *alist) +static void fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node, + struct fprobe_addr_list *alist) { - struct fprobe_hlist_node *node; - int ret = 0; - - hlist_for_each_entry_rcu(node, head, hlist, - lockdep_is_held(&fprobe_mutex)) { - if (!within_module(node->addr, mod)) - continue; - if (delete_fprobe_node(node)) - continue; - /* - * If failed to update alist, just continue to update hlist. - * Therefore, at list user handler will not hit anymore. - */ - if (!ret) - ret = fprobe_addr_list_add(alist, node->addr); - } + if (!within_module(node->addr, mod)) + return; + if (delete_fprobe_node(node)) + return; + /* + * If failed to update alist, just continue to update hlist. 
+ * Therefore, at list user handler will not hit anymore. + */ + fprobe_addr_list_add(alist, node->addr); } /* Handle module unloading to manage fprobe_ip_table. */ @@ -475,8 +590,9 @@ static int fprobe_module_callback(struct notifier_block *nb, unsigned long val, void *data) { struct fprobe_addr_list alist = {.size = FPROBE_IPS_BATCH_INIT}; + struct fprobe_hlist_node *node; + struct rhashtable_iter iter; struct module *mod = data; - int i; if (val != MODULE_STATE_GOING) return NOTIFY_DONE; @@ -487,12 +603,19 @@ static int fprobe_module_callback(struct notifier_block *nb, return NOTIFY_DONE; mutex_lock(&fprobe_mutex); - for (i = 0; i < FPROBE_IP_TABLE_SIZE; i++) - fprobe_remove_node_in_module(mod, &fprobe_ip_table[i], &alist); + rhltable_walk_enter(&fprobe_ip_table, &iter); + do { + rhashtable_walk_start(&iter); + + while ((node = rhashtable_walk_next(&iter)) && !IS_ERR(node)) + fprobe_remove_node_in_module(mod, node, &alist); + + rhashtable_walk_stop(&iter); + } while (node == ERR_PTR(-EAGAIN)); + rhashtable_walk_exit(&iter); if (alist.index > 0) - ftrace_set_filter_ips(&fprobe_graph_ops.ops, - alist.addrs, alist.index, 1, 0); + fprobe_set_ips(alist.addrs, alist.index, 1, 0); mutex_unlock(&fprobe_mutex); kfree(alist.addrs); @@ -725,11 +848,23 @@ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num) mutex_lock(&fprobe_mutex); hlist_array = fp->hlist_array; - ret = fprobe_graph_add_ips(addrs, num); + if (fprobe_is_ftrace(fp)) + ret = fprobe_ftrace_add_ips(addrs, num); + else + ret = fprobe_graph_add_ips(addrs, num); + if (!ret) { add_fprobe_hash(fp); - for (i = 0; i < hlist_array->size; i++) - insert_fprobe_node(&hlist_array->array[i]); + for (i = 0; i < hlist_array->size; i++) { + ret = insert_fprobe_node(&hlist_array->array[i]); + if (ret) + break; + } + /* fallback on insert error */ + if (ret) { + for (i--; i >= 0; i--) + delete_fprobe_node(&hlist_array->array[i]); + } } mutex_unlock(&fprobe_mutex); @@ -813,7 +948,10 @@ int 
unregister_fprobe(struct fprobe *fp) } del_fprobe_hash(fp); - fprobe_graph_remove_ips(addrs, count); + if (fprobe_is_ftrace(fp)) + fprobe_ftrace_remove_ips(addrs, count); + else + fprobe_graph_remove_ips(addrs, count); kfree_rcu(hlist_array, rcu); fp->hlist_array = NULL; @@ -825,3 +963,10 @@ out: return ret; } EXPORT_SYMBOL_GPL(unregister_fprobe); + +static int __init fprobe_initcall(void) +{ + rhltable_init(&fprobe_ip_table, &fprobe_rht_params); + return 0; +} +core_initcall(fprobe_initcall); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 59cfacb8a5bb..3ec2033c0774 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -534,7 +534,9 @@ static int function_stat_headers(struct seq_file *m) static int function_stat_show(struct seq_file *m, void *v) { + struct trace_array *tr = trace_get_global_array(); struct ftrace_profile *rec = v; + const char *refsymbol = NULL; char str[KSYM_SYMBOL_LEN]; #ifdef CONFIG_FUNCTION_GRAPH_TRACER static struct trace_seq s; @@ -554,7 +556,29 @@ static int function_stat_show(struct seq_file *m, void *v) return 0; #endif - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + if (tr->trace_flags & TRACE_ITER(PROF_TEXT_OFFSET)) { + unsigned long offset; + + if (core_kernel_text(rec->ip)) { + refsymbol = "_text"; + offset = rec->ip - (unsigned long)_text; + } else { + struct module *mod; + + guard(rcu)(); + mod = __module_text_address(rec->ip); + if (mod) { + refsymbol = mod->name; + /* Calculate offset from module's text entry address. 
*/ + offset = rec->ip - (unsigned long)mod->mem[MOD_TEXT].base; + } + } + if (refsymbol) + snprintf(str, sizeof(str), " %s+%#lx", refsymbol, offset); + } + if (!refsymbol) + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + seq_printf(m, " %-30.30s %10lu", str, rec->counter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -838,6 +862,8 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace, return 1; } +bool fprofile_no_sleep_time; + static void profile_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops, struct ftrace_regs *fregs) @@ -863,7 +889,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace, calltime = rettime - profile_data->calltime; - if (!fgraph_sleep_time) { + if (fprofile_no_sleep_time) { if (current->ftrace_sleeptime) calltime -= current->ftrace_sleeptime - profile_data->sleeptime; } @@ -5951,7 +5977,8 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long for (i = 0; i < size; i++) { hlist_for_each_entry(entry, &hash->buckets[i], hlist) { del = __ftrace_lookup_ip(direct_functions, entry->ip); - if (del && del->direct == addr) { + if (del && ftrace_jmp_get(del->direct) == + ftrace_jmp_get(addr)) { remove_hash_entry(direct_functions, del); kfree(del); } @@ -6016,8 +6043,15 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) if (ftrace_hash_empty(hash)) return -EINVAL; + /* This is a "raw" address, and this should never happen. */ + if (WARN_ON_ONCE(ftrace_is_jmp(addr))) + return -EINVAL; + mutex_lock(&direct_mutex); + if (ops->flags & FTRACE_OPS_FL_JMP) + addr = ftrace_jmp_set(addr); + /* Make sure requested entries are not already registered.. 
*/ size = 1 << hash->size_bits; for (i = 0; i < size; i++) { @@ -6067,7 +6101,7 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) new_hash = NULL; ops->func = call_direct_funcs; - ops->flags = MULTI_FLAGS; + ops->flags |= MULTI_FLAGS; ops->trampoline = FTRACE_REGS_ADDR; ops->direct_call = addr; @@ -6138,6 +6172,13 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) lockdep_assert_held_once(&direct_mutex); + /* This is a "raw" address, and this should never happen. */ + if (WARN_ON_ONCE(ftrace_is_jmp(addr))) + return -EINVAL; + + if (ops->flags & FTRACE_OPS_FL_JMP) + addr = ftrace_jmp_set(addr); + /* Enable the tmp_ops to have the same functions as the direct ops */ ftrace_ops_init(&tmp_ops); tmp_ops.func_hash = ops->func_hash; diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c index 090bb5ea4a19..dbee72d69d0a 100644 --- a/kernel/trace/pid_list.c +++ b/kernel/trace/pid_list.c @@ -3,6 +3,7 @@ * Copyright (C) 2021 VMware Inc, Steven Rostedt <rostedt@goodmis.org> */ #include <linux/spinlock.h> +#include <linux/seqlock.h> #include <linux/irq_work.h> #include <linux/slab.h> #include "trace.h" @@ -126,7 +127,7 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid) { union upper_chunk *upper_chunk; union lower_chunk *lower_chunk; - unsigned long flags; + unsigned int seq; unsigned int upper1; unsigned int upper2; unsigned int lower; @@ -138,14 +139,16 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid) if (pid_split(pid, &upper1, &upper2, &lower) < 0) return false; - raw_spin_lock_irqsave(&pid_list->lock, flags); - upper_chunk = pid_list->upper[upper1]; - if (upper_chunk) { - lower_chunk = upper_chunk->data[upper2]; - if (lower_chunk) - ret = test_bit(lower, lower_chunk->data); - } - raw_spin_unlock_irqrestore(&pid_list->lock, flags); + do { + seq = read_seqcount_begin(&pid_list->seqcount); + ret = false; + upper_chunk = pid_list->upper[upper1]; + if (upper_chunk) { 
+ lower_chunk = upper_chunk->data[upper2]; + if (lower_chunk) + ret = test_bit(lower, lower_chunk->data); + } + } while (read_seqcount_retry(&pid_list->seqcount, seq)); return ret; } @@ -178,6 +181,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid) return -EINVAL; raw_spin_lock_irqsave(&pid_list->lock, flags); + write_seqcount_begin(&pid_list->seqcount); upper_chunk = pid_list->upper[upper1]; if (!upper_chunk) { upper_chunk = get_upper_chunk(pid_list); @@ -199,6 +203,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid) set_bit(lower, lower_chunk->data); ret = 0; out: + write_seqcount_end(&pid_list->seqcount); raw_spin_unlock_irqrestore(&pid_list->lock, flags); return ret; } @@ -230,6 +235,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid) return -EINVAL; raw_spin_lock_irqsave(&pid_list->lock, flags); + write_seqcount_begin(&pid_list->seqcount); upper_chunk = pid_list->upper[upper1]; if (!upper_chunk) goto out; @@ -250,6 +256,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid) } } out: + write_seqcount_end(&pid_list->seqcount); raw_spin_unlock_irqrestore(&pid_list->lock, flags); return 0; } @@ -340,8 +347,10 @@ static void pid_list_refill_irq(struct irq_work *iwork) again: raw_spin_lock(&pid_list->lock); + write_seqcount_begin(&pid_list->seqcount); upper_count = CHUNK_ALLOC - pid_list->free_upper_chunks; lower_count = CHUNK_ALLOC - pid_list->free_lower_chunks; + write_seqcount_end(&pid_list->seqcount); raw_spin_unlock(&pid_list->lock); if (upper_count <= 0 && lower_count <= 0) @@ -370,6 +379,7 @@ static void pid_list_refill_irq(struct irq_work *iwork) } raw_spin_lock(&pid_list->lock); + write_seqcount_begin(&pid_list->seqcount); if (upper) { *upper_next = pid_list->upper_list; pid_list->upper_list = upper; @@ -380,6 +390,7 @@ static void pid_list_refill_irq(struct irq_work *iwork) pid_list->lower_list = lower; pid_list->free_lower_chunks += lcnt; } + 
write_seqcount_end(&pid_list->seqcount); raw_spin_unlock(&pid_list->lock); /* @@ -419,6 +430,7 @@ struct trace_pid_list *trace_pid_list_alloc(void) init_irq_work(&pid_list->refill_irqwork, pid_list_refill_irq); raw_spin_lock_init(&pid_list->lock); + seqcount_raw_spinlock_init(&pid_list->seqcount, &pid_list->lock); for (i = 0; i < CHUNK_ALLOC; i++) { union upper_chunk *chunk; diff --git a/kernel/trace/pid_list.h b/kernel/trace/pid_list.h index 62e73f1ac85f..0b45fb0eadb9 100644 --- a/kernel/trace/pid_list.h +++ b/kernel/trace/pid_list.h @@ -76,6 +76,7 @@ union upper_chunk { }; struct trace_pid_list { + seqcount_raw_spinlock_t seqcount; raw_spinlock_t lock; struct irq_work refill_irqwork; union upper_chunk *upper[UPPER1_SIZE]; // 1 or 2K in size diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index afcd3747264d..41c9f5d079be 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -402,6 +402,41 @@ static void free_buffer_page(struct buffer_page *bpage) } /* + * For best performance, allocate cpu buffer data cache line sized + * and per CPU. + */ +#define alloc_cpu_buffer(cpu) (struct ring_buffer_per_cpu *) \ + kzalloc_node(ALIGN(sizeof(struct ring_buffer_per_cpu), \ + cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); + +#define alloc_cpu_page(cpu) (struct buffer_page *) \ + kzalloc_node(ALIGN(sizeof(struct buffer_page), \ + cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); + +static struct buffer_data_page *alloc_cpu_data(int cpu, int order) +{ + struct buffer_data_page *dpage; + struct page *page; + gfp_t mflags; + + /* + * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails + * gracefully without invoking oom-killer and the system is not + * destabilized. 
+ */ + mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_COMP | __GFP_ZERO; + + page = alloc_pages_node(cpu_to_node(cpu), mflags, order); + if (!page) + return NULL; + + dpage = page_address(page); + rb_init_page(dpage); + + return dpage; +} + +/* * We need to fit the time_stamp delta into 27 bits. */ static inline bool test_time_stamp(u64 delta) @@ -1735,7 +1770,7 @@ static bool rb_meta_init(struct trace_buffer *buffer, int scratch_size) bmeta->total_size = total_size; bmeta->buffers_offset = (void *)ptr - (void *)bmeta; - /* Zero out the scatch pad */ + /* Zero out the scratch pad */ memset((void *)bmeta + sizeof(*bmeta), 0, bmeta->buffers_offset - sizeof(*bmeta)); return false; @@ -2204,7 +2239,6 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_cpu_meta *meta = NULL; struct buffer_page *bpage, *tmp; bool user_thread = current->mm != NULL; - gfp_t mflags; long i; /* @@ -2219,13 +2253,6 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, return -ENOMEM; /* - * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails - * gracefully without invoking oom-killer and the system is not - * destabilized. - */ - mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL; - - /* * If a user thread allocates too much, and si_mem_available() * reports there's enough memory, even though there is not. * Make sure the OOM killer kills this thread. 
This can happen @@ -2241,10 +2268,8 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu); for (i = 0; i < nr_pages; i++) { - struct page *page; - bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - mflags, cpu_to_node(cpu_buffer->cpu)); + bpage = alloc_cpu_page(cpu_buffer->cpu); if (!bpage) goto free_pages; @@ -2267,13 +2292,10 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, bpage->range = 1; bpage->id = i + 1; } else { - page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), - mflags | __GFP_COMP | __GFP_ZERO, - cpu_buffer->buffer->subbuf_order); - if (!page) + int order = cpu_buffer->buffer->subbuf_order; + bpage->page = alloc_cpu_data(cpu_buffer->cpu, order); + if (!bpage->page) goto free_pages; - bpage->page = page_address(page); - rb_init_page(bpage->page); } bpage->order = cpu_buffer->buffer->subbuf_order; @@ -2324,14 +2346,12 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, static struct ring_buffer_per_cpu * rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) { - struct ring_buffer_per_cpu *cpu_buffer __free(kfree) = NULL; + struct ring_buffer_per_cpu *cpu_buffer __free(kfree) = + alloc_cpu_buffer(cpu); struct ring_buffer_cpu_meta *meta; struct buffer_page *bpage; - struct page *page; int ret; - cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), - GFP_KERNEL, cpu_to_node(cpu)); if (!cpu_buffer) return NULL; @@ -2347,8 +2367,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); mutex_init(&cpu_buffer->mapping_lock); - bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - GFP_KERNEL, cpu_to_node(cpu)); + bpage = alloc_cpu_page(cpu); if (!bpage) return NULL; @@ -2370,13 +2389,10 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) 
rb_meta_buffer_update(cpu_buffer, bpage); bpage->range = 1; } else { - page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_COMP | __GFP_ZERO, - cpu_buffer->buffer->subbuf_order); - if (!page) + int order = cpu_buffer->buffer->subbuf_order; + bpage->page = alloc_cpu_data(cpu, order); + if (!bpage->page) goto fail_free_reader; - bpage->page = page_address(page); - rb_init_page(bpage->page); } INIT_LIST_HEAD(&cpu_buffer->reader_page->list); @@ -6073,7 +6089,7 @@ static void rb_clear_buffer_page(struct buffer_page *page) * id field, and updated via this function. * * But for a fixed memory mapped buffer, the id is already assigned for - * fixed memory ording in the memory layout and can not be used. Instead + * fixed memory ordering in the memory layout and can not be used. Instead * the index of where the page lies in the memory layout is used. * * For the normal pages, set the buffer page id with the passed in @id @@ -6464,7 +6480,6 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) struct ring_buffer_per_cpu *cpu_buffer; struct buffer_data_read_page *bpage = NULL; unsigned long flags; - struct page *page; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return ERR_PTR(-ENODEV); @@ -6486,22 +6501,16 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); - if (bpage->data) - goto out; - - page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_NORETRY | __GFP_COMP | __GFP_ZERO, - cpu_buffer->buffer->subbuf_order); - if (!page) { - kfree(bpage); - return ERR_PTR(-ENOMEM); + if (bpage->data) { + rb_init_page(bpage->data); + } else { + bpage->data = alloc_cpu_data(cpu, cpu_buffer->buffer->subbuf_order); + if (!bpage->data) { + kfree(bpage); + return ERR_PTR(-ENOMEM); + } } - bpage->data = page_address(page); - - out: - rb_init_page(bpage->data); - return bpage; } EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); @@ -7660,7 +7669,7 @@ static __init int 
test_ringbuffer(void) /* * Show buffer is enabled before setting rb_test_started. * Yes there's a small race window where events could be - * dropped and the thread wont catch it. But when a ring + * dropped and the thread won't catch it. But when a ring * buffer gets enabled, there will always be some kind of * delay before other CPUs see it. Thus, we don't care about * those dropped events. We care about events dropped after diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index cdc3aea12c93..593e3b59e42e 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -433,7 +433,7 @@ static int __init ring_buffer_benchmark_init(void) { int ret; - /* make a one meg buffer in overwite mode */ + /* make a one meg buffer in overwrite mode */ buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE); if (!buffer) return -ENOMEM; diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c index 74c6bcc2c749..76537b8a4343 100644 --- a/kernel/trace/rv/reactor_panic.c +++ b/kernel/trace/rv/reactor_panic.c @@ -13,13 +13,9 @@ #include <linux/init.h> #include <linux/rv.h> -__printf(1, 2) static void rv_panic_reaction(const char *msg, ...) +__printf(1, 0) static void rv_panic_reaction(const char *msg, va_list args) { - va_list args; - - va_start(args, msg); vpanic(msg, args); - va_end(args); } static struct rv_reactor rv_panic = { diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c index 2dae2916c05f..48c934e315b3 100644 --- a/kernel/trace/rv/reactor_printk.c +++ b/kernel/trace/rv/reactor_printk.c @@ -12,13 +12,9 @@ #include <linux/init.h> #include <linux/rv.h> -__printf(1, 2) static void rv_printk_reaction(const char *msg, ...) 
+__printf(1, 0) static void rv_printk_reaction(const char *msg, va_list args) { - va_list args; - - va_start(args, msg); vprintk_deferred(msg, args); - va_end(args); } static struct rv_reactor rv_printk = { diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 43e9ea473cda..ee4e68102f17 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -375,15 +375,13 @@ static ssize_t monitor_enable_write_data(struct file *filp, const char __user *u if (retval) return retval; - mutex_lock(&rv_interface_lock); + guard(mutex)(&rv_interface_lock); if (val) retval = rv_enable_monitor(mon); else retval = rv_disable_monitor(mon); - mutex_unlock(&rv_interface_lock); - return retval ? : count; } @@ -422,35 +420,27 @@ static const struct file_operations interface_desc_fops = { static int create_monitor_dir(struct rv_monitor *mon, struct rv_monitor *parent) { struct dentry *root = parent ? parent->root_d : get_monitors_root(); - const char *name = mon->name; + struct dentry *dir __free(rv_remove) = rv_create_dir(mon->name, root); struct dentry *tmp; int retval; - mon->root_d = rv_create_dir(name, root); - if (!mon->root_d) + if (!dir) return -ENOMEM; - tmp = rv_create_file("enable", RV_MODE_WRITE, mon->root_d, mon, &interface_enable_fops); - if (!tmp) { - retval = -ENOMEM; - goto out_remove_root; - } + tmp = rv_create_file("enable", RV_MODE_WRITE, dir, mon, &interface_enable_fops); + if (!tmp) + return -ENOMEM; - tmp = rv_create_file("desc", RV_MODE_READ, mon->root_d, mon, &interface_desc_fops); - if (!tmp) { - retval = -ENOMEM; - goto out_remove_root; - } + tmp = rv_create_file("desc", RV_MODE_READ, dir, mon, &interface_desc_fops); + if (!tmp) + return -ENOMEM; - retval = reactor_populate_monitor(mon); + retval = reactor_populate_monitor(mon, dir); if (retval) - goto out_remove_root; + return retval; + mon->root_d = no_free_ptr(dir); return 0; - -out_remove_root: - rv_remove(mon->root_d); - return retval; } /* @@ -568,7 +558,7 @@ static void 
disable_all_monitors(void) struct rv_monitor *mon; int enabled = 0; - mutex_lock(&rv_interface_lock); + guard(mutex)(&rv_interface_lock); list_for_each_entry(mon, &rv_monitors_list, list) enabled += __rv_disable_monitor(mon, false); @@ -581,8 +571,6 @@ static void disable_all_monitors(void) */ tracepoint_synchronize_unregister(); } - - mutex_unlock(&rv_interface_lock); } static int enabled_monitors_open(struct inode *inode, struct file *file) @@ -623,7 +611,7 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user if (!len) return count; - mutex_lock(&rv_interface_lock); + guard(mutex)(&rv_interface_lock); retval = -EINVAL; @@ -644,13 +632,11 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user else retval = rv_disable_monitor(mon); - if (!retval) - retval = count; - - break; + if (retval) + return retval; + return count; } - mutex_unlock(&rv_interface_lock); return retval; } @@ -737,7 +723,7 @@ static ssize_t monitoring_on_write_data(struct file *filp, const char __user *us if (retval) return retval; - mutex_lock(&rv_interface_lock); + guard(mutex)(&rv_interface_lock); if (val) turn_monitoring_on_with_reset(); @@ -750,8 +736,6 @@ static ssize_t monitoring_on_write_data(struct file *filp, const char __user *us */ tracepoint_synchronize_unregister(); - mutex_unlock(&rv_interface_lock); - return count; } @@ -784,28 +768,26 @@ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent) return -EINVAL; } - mutex_lock(&rv_interface_lock); + guard(mutex)(&rv_interface_lock); list_for_each_entry(r, &rv_monitors_list, list) { if (strcmp(monitor->name, r->name) == 0) { pr_info("Monitor %s is already registered\n", monitor->name); - retval = -EEXIST; - goto out_unlock; + return -EEXIST; } } if (parent && rv_is_nested_monitor(parent)) { pr_info("Parent monitor %s is already nested, cannot nest further\n", parent->name); - retval = -EINVAL; - goto out_unlock; + return -EINVAL; } monitor->parent = 
parent; retval = create_monitor_dir(monitor, parent); if (retval) - goto out_unlock; + return retval; /* keep children close to the parent for easier visualisation */ if (parent) @@ -813,9 +795,7 @@ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent) else list_add_tail(&monitor->list, &rv_monitors_list); -out_unlock: - mutex_unlock(&rv_interface_lock); - return retval; + return 0; } /** @@ -826,13 +806,12 @@ out_unlock: */ int rv_unregister_monitor(struct rv_monitor *monitor) { - mutex_lock(&rv_interface_lock); + guard(mutex)(&rv_interface_lock); rv_disable_monitor(monitor); list_del(&monitor->list); destroy_monitor_dir(monitor); - mutex_unlock(&rv_interface_lock); return 0; } @@ -840,39 +819,36 @@ int __init rv_init_interface(void) { struct dentry *tmp; int retval; + struct dentry *root_dir __free(rv_remove) = rv_create_dir("rv", NULL); - rv_root.root_dir = rv_create_dir("rv", NULL); - if (!rv_root.root_dir) - goto out_err; + if (!root_dir) + return 1; - rv_root.monitors_dir = rv_create_dir("monitors", rv_root.root_dir); + rv_root.monitors_dir = rv_create_dir("monitors", root_dir); if (!rv_root.monitors_dir) - goto out_err; + return 1; - tmp = rv_create_file("available_monitors", RV_MODE_READ, rv_root.root_dir, NULL, + tmp = rv_create_file("available_monitors", RV_MODE_READ, root_dir, NULL, &available_monitors_ops); if (!tmp) - goto out_err; + return 1; - tmp = rv_create_file("enabled_monitors", RV_MODE_WRITE, rv_root.root_dir, NULL, + tmp = rv_create_file("enabled_monitors", RV_MODE_WRITE, root_dir, NULL, &enabled_monitors_ops); if (!tmp) - goto out_err; + return 1; - tmp = rv_create_file("monitoring_on", RV_MODE_WRITE, rv_root.root_dir, NULL, + tmp = rv_create_file("monitoring_on", RV_MODE_WRITE, root_dir, NULL, &monitoring_on_fops); if (!tmp) - goto out_err; - retval = init_rv_reactors(rv_root.root_dir); + return 1; + retval = init_rv_reactors(root_dir); if (retval) - goto out_err; + return 1; turn_monitoring_on(); - return 0; + 
rv_root.root_dir = no_free_ptr(root_dir); -out_err: - rv_remove(rv_root.root_dir); - printk(KERN_ERR "RV: Error while creating the RV interface\n"); - return 1; + return 0; } diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h index 1485a70c1bf4..2c0f51ff9d5c 100644 --- a/kernel/trace/rv/rv.h +++ b/kernel/trace/rv/rv.h @@ -17,6 +17,8 @@ struct rv_interface { #define rv_create_file tracefs_create_file #define rv_remove tracefs_remove +DEFINE_FREE(rv_remove, struct dentry *, if (_T) rv_remove(_T)); + #define MAX_RV_MONITOR_NAME_SIZE 32 #define MAX_RV_REACTOR_NAME_SIZE 32 @@ -30,10 +32,10 @@ bool rv_is_container_monitor(struct rv_monitor *mon); bool rv_is_nested_monitor(struct rv_monitor *mon); #ifdef CONFIG_RV_REACTORS -int reactor_populate_monitor(struct rv_monitor *mon); +int reactor_populate_monitor(struct rv_monitor *mon, struct dentry *root); int init_rv_reactors(struct dentry *root_dir); #else -static inline int reactor_populate_monitor(struct rv_monitor *mon) +static inline int reactor_populate_monitor(struct rv_monitor *mon, struct dentry *root) { return 0; } diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c index d32859fec238..460af07f7aba 100644 --- a/kernel/trace/rv/rv_reactors.c +++ b/kernel/trace/rv/rv_reactors.c @@ -61,6 +61,7 @@ * printk */ +#include <linux/lockdep.h> #include <linux/slab.h> #include "rv.h" @@ -232,9 +233,7 @@ monitor_reactors_write(struct file *file, const char __user *user_buf, seq_f = file->private_data; mon = seq_f->private; - mutex_lock(&rv_interface_lock); - - retval = -EINVAL; + guard(mutex)(&rv_interface_lock); list_for_each_entry(reactor, &rv_reactors_list, list) { if (strcmp(ptr, reactor->name) != 0) @@ -242,13 +241,10 @@ monitor_reactors_write(struct file *file, const char __user *user_buf, monitor_swap_reactors(mon, reactor); - retval = count; - break; + return count; } - mutex_unlock(&rv_interface_lock); - - return retval; + return -EINVAL; } /* @@ -309,18 +305,14 @@ static int 
__rv_register_reactor(struct rv_reactor *reactor) */ int rv_register_reactor(struct rv_reactor *reactor) { - int retval = 0; - if (strlen(reactor->name) >= MAX_RV_REACTOR_NAME_SIZE) { pr_info("Reactor %s has a name longer than %d\n", reactor->name, MAX_RV_MONITOR_NAME_SIZE); return -EINVAL; } - mutex_lock(&rv_interface_lock); - retval = __rv_register_reactor(reactor); - mutex_unlock(&rv_interface_lock); - return retval; + guard(mutex)(&rv_interface_lock); + return __rv_register_reactor(reactor); } /** @@ -331,9 +323,8 @@ int rv_register_reactor(struct rv_reactor *reactor) */ int rv_unregister_reactor(struct rv_reactor *reactor) { - mutex_lock(&rv_interface_lock); + guard(mutex)(&rv_interface_lock); list_del(&reactor->list); - mutex_unlock(&rv_interface_lock); return 0; } @@ -347,7 +338,7 @@ static bool __read_mostly reacting_on; * * Returns 1 if on, 0 otherwise. */ -bool rv_reacting_on(void) +static bool rv_reacting_on(void) { /* Ensures that concurrent monitors read consistent reacting_on */ smp_rmb(); @@ -389,7 +380,7 @@ static ssize_t reacting_on_write_data(struct file *filp, const char __user *user if (retval) return retval; - mutex_lock(&rv_interface_lock); + guard(mutex)(&rv_interface_lock); if (val) turn_reacting_on(); @@ -402,8 +393,6 @@ static ssize_t reacting_on_write_data(struct file *filp, const char __user *user */ tracepoint_synchronize_unregister(); - mutex_unlock(&rv_interface_lock); - return count; } @@ -416,14 +405,15 @@ static const struct file_operations reacting_on_fops = { /** * reactor_populate_monitor - creates per monitor reactors file * @mon: The monitor. + * @root: The directory of the monitor. * * Returns 0 if successful, error otherwise. 
*/ -int reactor_populate_monitor(struct rv_monitor *mon) +int reactor_populate_monitor(struct rv_monitor *mon, struct dentry *root) { struct dentry *tmp; - tmp = rv_create_file("reactors", RV_MODE_WRITE, mon->root_d, mon, &monitor_reactors_ops); + tmp = rv_create_file("reactors", RV_MODE_WRITE, root, mon, &monitor_reactors_ops); if (!tmp) return -ENOMEM; @@ -438,7 +428,7 @@ int reactor_populate_monitor(struct rv_monitor *mon) /* * Nop reactor register */ -__printf(1, 2) static void rv_nop_reaction(const char *msg, ...) +__printf(1, 0) static void rv_nop_reaction(const char *msg, va_list args) { } @@ -450,30 +440,42 @@ static struct rv_reactor rv_nop = { int init_rv_reactors(struct dentry *root_dir) { - struct dentry *available, *reacting; int retval; - available = rv_create_file("available_reactors", RV_MODE_READ, root_dir, NULL, - &available_reactors_ops); - if (!available) - goto out_err; + struct dentry *available __free(rv_remove) = + rv_create_file("available_reactors", RV_MODE_READ, root_dir, + NULL, &available_reactors_ops); + + struct dentry *reacting __free(rv_remove) = + rv_create_file("reacting_on", RV_MODE_WRITE, root_dir, NULL, &reacting_on_fops); - reacting = rv_create_file("reacting_on", RV_MODE_WRITE, root_dir, NULL, &reacting_on_fops); - if (!reacting) - goto rm_available; + if (!reacting || !available) + return -ENOMEM; retval = __rv_register_reactor(&rv_nop); if (retval) - goto rm_reacting; + return retval; turn_reacting_on(); + retain_and_null_ptr(available); + retain_and_null_ptr(reacting); return 0; +} + +void rv_react(struct rv_monitor *monitor, const char *msg, ...) 
+{ + static DEFINE_WAIT_OVERRIDE_MAP(rv_react_map, LD_WAIT_FREE); + va_list args; + + if (!rv_reacting_on() || !monitor->react) + return; + + va_start(args, msg); + + lock_map_acquire_try(&rv_react_map); + monitor->react(msg, args); + lock_map_release(&rv_react_map); -rm_reacting: - rv_remove(reacting); -rm_available: - rv_remove(available); -out_err: - return -ENOMEM; + va_end(args); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 304e93597126..e575956ef9b5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -20,6 +20,7 @@ #include <linux/security.h> #include <linux/seq_file.h> #include <linux/irqflags.h> +#include <linux/syscalls.h> #include <linux/debugfs.h> #include <linux/tracefs.h> #include <linux/pagemap.h> @@ -93,17 +94,13 @@ static bool tracepoint_printk_stop_on_boot __initdata; static bool traceoff_after_boot __initdata; static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key); -/* For tracers that don't implement custom flags */ -static struct tracer_opt dummy_tracer_opt[] = { - { } +/* Store tracers and their flags per instance */ +struct tracers { + struct list_head list; + struct tracer *tracer; + struct tracer_flags *flags; }; -static int -dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) -{ - return 0; -} - /* * To prevent the comm cache from being overwritten when no * tracing is active, only save the comm when a trace event @@ -128,7 +125,7 @@ cpumask_var_t __read_mostly tracing_buffer_mask; * If there is an oops (or kernel panic) and the ftrace_dump_on_oops * is set, then ftrace_dump is called. This will output the contents * of the ftrace buffers to the console. This is very useful for - * capturing traces that lead to crashes and outputing it to a + * capturing traces that lead to crashes and outputting it to a * serial console. 
* * It is default off, but you can enable it with either specifying @@ -137,7 +134,7 @@ cpumask_var_t __read_mostly tracing_buffer_mask; * Set 1 if you want to dump buffers of all CPUs * Set 2 if you want to dump the buffer of the CPU that triggered oops * Set instance name if you want to dump the specific trace instance - * Multiple instance dump is also supported, and instances are seperated + * Multiple instance dump is also supported, and instances are separated * by commas. */ /* Set to string format zero to disable by default */ @@ -512,22 +509,23 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export); /* trace_flags holds trace_options default values */ #define TRACE_DEFAULT_FLAGS \ - (FUNCTION_DEFAULT_FLAGS | \ - TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | \ - TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \ - TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \ - TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | \ - TRACE_ITER_HASH_PTR | TRACE_ITER_TRACE_PRINTK | \ - TRACE_ITER_COPY_MARKER) + (FUNCTION_DEFAULT_FLAGS | FPROFILE_DEFAULT_FLAGS | \ + TRACE_ITER(PRINT_PARENT) | TRACE_ITER(PRINTK) | \ + TRACE_ITER(ANNOTATE) | TRACE_ITER(CONTEXT_INFO) | \ + TRACE_ITER(RECORD_CMD) | TRACE_ITER(OVERWRITE) | \ + TRACE_ITER(IRQ_INFO) | TRACE_ITER(MARKERS) | \ + TRACE_ITER(HASH_PTR) | TRACE_ITER(TRACE_PRINTK) | \ + TRACE_ITER(COPY_MARKER)) /* trace_options that are only supported by global_trace */ -#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \ - TRACE_ITER_PRINTK_MSGONLY | TRACE_ITER_RECORD_CMD) +#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER(PRINTK) | \ + TRACE_ITER(PRINTK_MSGONLY) | TRACE_ITER(RECORD_CMD) | \ + TRACE_ITER(PROF_TEXT_OFFSET) | FPROFILE_DEFAULT_FLAGS) /* trace_flags that are default zero for instances */ #define ZEROED_TRACE_FLAGS \ - (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK | TRACE_ITER_TRACE_PRINTK | \ - TRACE_ITER_COPY_MARKER) + (TRACE_ITER(EVENT_FORK) | TRACE_ITER(FUNC_FORK) | TRACE_ITER(TRACE_PRINTK) | \ + TRACE_ITER(COPY_MARKER)) /* * The global_trace is 
the descriptor that holds the top-level tracing @@ -558,9 +556,9 @@ static void update_printk_trace(struct trace_array *tr) if (printk_trace == tr) return; - printk_trace->trace_flags &= ~TRACE_ITER_TRACE_PRINTK; + printk_trace->trace_flags &= ~TRACE_ITER(TRACE_PRINTK); printk_trace = tr; - tr->trace_flags |= TRACE_ITER_TRACE_PRINTK; + tr->trace_flags |= TRACE_ITER(TRACE_PRINTK); } /* Returns true if the status of tr changed */ @@ -573,7 +571,7 @@ static bool update_marker_trace(struct trace_array *tr, int enabled) return false; list_add_rcu(&tr->marker_list, &marker_copies); - tr->trace_flags |= TRACE_ITER_COPY_MARKER; + tr->trace_flags |= TRACE_ITER(COPY_MARKER); return true; } @@ -581,7 +579,7 @@ static bool update_marker_trace(struct trace_array *tr, int enabled) return false; list_del_init(&tr->marker_list); - tr->trace_flags &= ~TRACE_ITER_COPY_MARKER; + tr->trace_flags &= ~TRACE_ITER(COPY_MARKER); return true; } @@ -1139,7 +1137,7 @@ int __trace_array_puts(struct trace_array *tr, unsigned long ip, unsigned int trace_ctx; int alloc; - if (!(tr->trace_flags & TRACE_ITER_PRINTK)) + if (!(tr->trace_flags & TRACE_ITER(PRINTK))) return 0; if (unlikely(tracing_selftest_running && tr == &global_trace)) @@ -1205,7 +1203,7 @@ int __trace_bputs(unsigned long ip, const char *str) if (!printk_binsafe(tr)) return __trace_puts(ip, str, strlen(str)); - if (!(tr->trace_flags & TRACE_ITER_PRINTK)) + if (!(tr->trace_flags & TRACE_ITER(PRINTK))) return 0; if (unlikely(tracing_selftest_running || tracing_disabled)) @@ -2173,6 +2171,7 @@ static int save_selftest(struct tracer *type) static int run_tracer_selftest(struct tracer *type) { struct trace_array *tr = &global_trace; + struct tracer_flags *saved_flags = tr->current_trace_flags; struct tracer *saved_tracer = tr->current_trace; int ret; @@ -2203,6 +2202,7 @@ static int run_tracer_selftest(struct tracer *type) tracing_reset_online_cpus(&tr->array_buffer); tr->current_trace = type; + tr->current_trace_flags = type->flags ? 
: type->default_flags; #ifdef CONFIG_TRACER_MAX_TRACE if (type->use_max_tr) { @@ -2219,6 +2219,7 @@ static int run_tracer_selftest(struct tracer *type) ret = type->selftest(type, tr); /* the test is responsible for resetting too */ tr->current_trace = saved_tracer; + tr->current_trace_flags = saved_flags; if (ret) { printk(KERN_CONT "FAILED!\n"); /* Add the warning after printing 'FAILED' */ @@ -2311,10 +2312,23 @@ static inline int do_run_tracer_selftest(struct tracer *type) } #endif /* CONFIG_FTRACE_STARTUP_TEST */ -static void add_tracer_options(struct trace_array *tr, struct tracer *t); +static int add_tracer(struct trace_array *tr, struct tracer *t); static void __init apply_trace_boot_options(void); +static void free_tracers(struct trace_array *tr) +{ + struct tracers *t, *n; + + lockdep_assert_held(&trace_types_lock); + + list_for_each_entry_safe(t, n, &tr->tracers, list) { + list_del(&t->list); + kfree(t->flags); + kfree(t); + } +} + /** * register_tracer - register a tracer with the ftrace system. 
* @type: the plugin for the tracer @@ -2323,6 +2337,7 @@ static void __init apply_trace_boot_options(void); */ int __init register_tracer(struct tracer *type) { + struct trace_array *tr; struct tracer *t; int ret = 0; @@ -2354,31 +2369,25 @@ int __init register_tracer(struct tracer *type) } } - if (!type->set_flag) - type->set_flag = &dummy_set_flag; - if (!type->flags) { - /*allocate a dummy tracer_flags*/ - type->flags = kmalloc(sizeof(*type->flags), GFP_KERNEL); - if (!type->flags) { - ret = -ENOMEM; - goto out; - } - type->flags->val = 0; - type->flags->opts = dummy_tracer_opt; - } else - if (!type->flags->opts) - type->flags->opts = dummy_tracer_opt; - /* store the tracer for __set_tracer_option */ - type->flags->trace = type; + if (type->flags) + type->flags->trace = type; ret = do_run_tracer_selftest(type); if (ret < 0) goto out; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + ret = add_tracer(tr, type); + if (ret < 0) { + /* The tracer will still exist but without options */ + pr_warn("Failed to create tracer options for %s\n", type->name); + break; + } + } + type->next = trace_types; trace_types = type; - add_tracer_options(&global_trace, type); out: mutex_unlock(&trace_types_lock); @@ -2391,7 +2400,7 @@ int __init register_tracer(struct tracer *type) printk(KERN_INFO "Starting tracer '%s'\n", type->name); /* Do we want this tracer to start on bootup? 
*/ - tracing_set_tracer(&global_trace, type->name); + WARN_ON(tracing_set_tracer(&global_trace, type->name) < 0); default_bootup_tracer = NULL; apply_trace_boot_options(); @@ -3078,7 +3087,7 @@ static inline void ftrace_trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip, struct pt_regs *regs) { - if (!(tr->trace_flags & TRACE_ITER_STACKTRACE)) + if (!(tr->trace_flags & TRACE_ITER(STACKTRACE))) return; __ftrace_trace_stack(tr, buffer, trace_ctx, skip, regs); @@ -3139,7 +3148,7 @@ ftrace_trace_userstack(struct trace_array *tr, struct ring_buffer_event *event; struct userstack_entry *entry; - if (!(tr->trace_flags & TRACE_ITER_USERSTACKTRACE)) + if (!(tr->trace_flags & TRACE_ITER(USERSTACKTRACE))) return; /* @@ -3484,7 +3493,7 @@ int trace_array_printk(struct trace_array *tr, if (tr == &global_trace) return 0; - if (!(tr->trace_flags & TRACE_ITER_PRINTK)) + if (!(tr->trace_flags & TRACE_ITER(PRINTK))) return 0; va_start(ap, fmt); @@ -3521,7 +3530,7 @@ int trace_array_printk_buf(struct trace_buffer *buffer, int ret; va_list ap; - if (!(printk_trace->trace_flags & TRACE_ITER_PRINTK)) + if (!(printk_trace->trace_flags & TRACE_ITER(PRINTK))) return 0; va_start(ap, fmt); @@ -3791,7 +3800,7 @@ const char *trace_event_format(struct trace_iterator *iter, const char *fmt) if (WARN_ON_ONCE(!fmt)) return fmt; - if (!iter->tr || iter->tr->trace_flags & TRACE_ITER_HASH_PTR) + if (!iter->tr || iter->tr->trace_flags & TRACE_ITER(HASH_PTR)) return fmt; p = fmt; @@ -4113,7 +4122,7 @@ static void print_event_info(struct array_buffer *buf, struct seq_file *m) static void print_func_help_header(struct array_buffer *buf, struct seq_file *m, unsigned int flags) { - bool tgid = flags & TRACE_ITER_RECORD_TGID; + bool tgid = flags & TRACE_ITER(RECORD_TGID); print_event_info(buf, m); @@ -4124,7 +4133,7 @@ static void print_func_help_header(struct array_buffer *buf, struct seq_file *m, static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file *m, 
unsigned int flags) { - bool tgid = flags & TRACE_ITER_RECORD_TGID; + bool tgid = flags & TRACE_ITER(RECORD_TGID); static const char space[] = " "; int prec = tgid ? 12 : 2; @@ -4197,7 +4206,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter) struct trace_seq *s = &iter->seq; struct trace_array *tr = iter->tr; - if (!(tr->trace_flags & TRACE_ITER_ANNOTATE)) + if (!(tr->trace_flags & TRACE_ITER(ANNOTATE))) return; if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) @@ -4219,6 +4228,22 @@ static void test_cpu_buff_start(struct trace_iterator *iter) iter->cpu); } +#ifdef CONFIG_FTRACE_SYSCALLS +static bool is_syscall_event(struct trace_event *event) +{ + return (event->funcs == &enter_syscall_print_funcs) || + (event->funcs == &exit_syscall_print_funcs); + +} +#define syscall_buf_size CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT +#else +static inline bool is_syscall_event(struct trace_event *event) +{ + return false; +} +#define syscall_buf_size 0 +#endif /* CONFIG_FTRACE_SYSCALLS */ + static enum print_line_t print_trace_fmt(struct trace_iterator *iter) { struct trace_array *tr = iter->tr; @@ -4233,7 +4258,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); - if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) { if (iter->iter_flags & TRACE_FILE_LAT_FMT) trace_print_lat_context(iter); else @@ -4244,17 +4269,19 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) return TRACE_TYPE_PARTIAL_LINE; if (event) { - if (tr->trace_flags & TRACE_ITER_FIELDS) + if (tr->trace_flags & TRACE_ITER(FIELDS)) return print_event_fields(iter, event); /* * For TRACE_EVENT() events, the print_fmt is not * safe to use if the array has delta offsets * Force printing via the fields. 
*/ - if ((tr->text_delta) && - event->type > __TRACE_LAST_TYPE) + if ((tr->text_delta)) { + /* ftrace and system call events are still OK */ + if ((event->type > __TRACE_LAST_TYPE) && + !is_syscall_event(event)) return print_event_fields(iter, event); - + } return event->funcs->trace(iter, sym_flags, event); } @@ -4272,7 +4299,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) entry = iter->ent; - if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) + if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) trace_seq_printf(s, "%d %d %llu ", entry->pid, iter->cpu, iter->ts); @@ -4298,7 +4325,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) entry = iter->ent; - if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) { SEQ_PUT_HEX_FIELD(s, entry->pid); SEQ_PUT_HEX_FIELD(s, iter->cpu); SEQ_PUT_HEX_FIELD(s, iter->ts); @@ -4327,7 +4354,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) entry = iter->ent; - if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) { SEQ_PUT_FIELD(s, entry->pid); SEQ_PUT_FIELD(s, iter->cpu); SEQ_PUT_FIELD(s, iter->ts); @@ -4398,27 +4425,27 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) } if (iter->ent->type == TRACE_BPUTS && - trace_flags & TRACE_ITER_PRINTK && - trace_flags & TRACE_ITER_PRINTK_MSGONLY) + trace_flags & TRACE_ITER(PRINTK) && + trace_flags & TRACE_ITER(PRINTK_MSGONLY)) return trace_print_bputs_msg_only(iter); if (iter->ent->type == TRACE_BPRINT && - trace_flags & TRACE_ITER_PRINTK && - trace_flags & TRACE_ITER_PRINTK_MSGONLY) + trace_flags & TRACE_ITER(PRINTK) && + trace_flags & TRACE_ITER(PRINTK_MSGONLY)) return trace_print_bprintk_msg_only(iter); if (iter->ent->type == TRACE_PRINT && - trace_flags & TRACE_ITER_PRINTK && - trace_flags & TRACE_ITER_PRINTK_MSGONLY) + trace_flags & TRACE_ITER(PRINTK) && + trace_flags & TRACE_ITER(PRINTK_MSGONLY)) return 
trace_print_printk_msg_only(iter); - if (trace_flags & TRACE_ITER_BIN) + if (trace_flags & TRACE_ITER(BIN)) return print_bin_fmt(iter); - if (trace_flags & TRACE_ITER_HEX) + if (trace_flags & TRACE_ITER(HEX)) return print_hex_fmt(iter); - if (trace_flags & TRACE_ITER_RAW) + if (trace_flags & TRACE_ITER(RAW)) return print_raw_fmt(iter); return print_trace_fmt(iter); @@ -4436,7 +4463,7 @@ void trace_latency_header(struct seq_file *m) if (iter->iter_flags & TRACE_FILE_LAT_FMT) print_trace_header(m, iter); - if (!(tr->trace_flags & TRACE_ITER_VERBOSE)) + if (!(tr->trace_flags & TRACE_ITER(VERBOSE))) print_lat_help_header(m); } @@ -4446,7 +4473,7 @@ void trace_default_header(struct seq_file *m) struct trace_array *tr = iter->tr; unsigned long trace_flags = tr->trace_flags; - if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + if (!(trace_flags & TRACE_ITER(CONTEXT_INFO))) return; if (iter->iter_flags & TRACE_FILE_LAT_FMT) { @@ -4454,11 +4481,11 @@ void trace_default_header(struct seq_file *m) if (trace_empty(iter)) return; print_trace_header(m, iter); - if (!(trace_flags & TRACE_ITER_VERBOSE)) + if (!(trace_flags & TRACE_ITER(VERBOSE))) print_lat_help_header(m); } else { - if (!(trace_flags & TRACE_ITER_VERBOSE)) { - if (trace_flags & TRACE_ITER_IRQ_INFO) + if (!(trace_flags & TRACE_ITER(VERBOSE))) { + if (trace_flags & TRACE_ITER(IRQ_INFO)) print_func_help_header_irq(iter->array_buffer, m, trace_flags); else @@ -4682,8 +4709,10 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) * If pause-on-trace is enabled, then stop the trace while * dumping, unless this is the "snapshot" file */ - if (!iter->snapshot && (tr->trace_flags & TRACE_ITER_PAUSE_ON_TRACE)) + if (!iter->snapshot && (tr->trace_flags & TRACE_ITER(PAUSE_ON_TRACE))) { + iter->iter_flags |= TRACE_FILE_PAUSE; tracing_stop_tr(tr); + } if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { @@ -4815,7 +4844,7 @@ static int tracing_release(struct inode *inode, struct file *file) 
if (iter->trace && iter->trace->close) iter->trace->close(iter); - if (!iter->snapshot && tr->stop_count) + if (iter->iter_flags & TRACE_FILE_PAUSE) /* reenable tracing if it was previously enabled */ tracing_start_tr(tr); @@ -4876,7 +4905,7 @@ static int tracing_open(struct inode *inode, struct file *file) iter = __tracing_open(inode, file, false); if (IS_ERR(iter)) ret = PTR_ERR(iter); - else if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) + else if (tr->trace_flags & TRACE_ITER(LATENCY_FMT)) iter->iter_flags |= TRACE_FILE_LAT_FMT; } @@ -5139,21 +5168,26 @@ static int tracing_trace_options_show(struct seq_file *m, void *v) { struct tracer_opt *trace_opts; struct trace_array *tr = m->private; + struct tracer_flags *flags; u32 tracer_flags; int i; guard(mutex)(&trace_types_lock); - tracer_flags = tr->current_trace->flags->val; - trace_opts = tr->current_trace->flags->opts; - for (i = 0; trace_options[i]; i++) { - if (tr->trace_flags & (1 << i)) + if (tr->trace_flags & (1ULL << i)) seq_printf(m, "%s\n", trace_options[i]); else seq_printf(m, "no%s\n", trace_options[i]); } + flags = tr->current_trace_flags; + if (!flags || !flags->opts) + return 0; + + tracer_flags = flags->val; + trace_opts = flags->opts; + for (i = 0; trace_opts[i].name; i++) { if (tracer_flags & trace_opts[i].bit) seq_printf(m, "%s\n", trace_opts[i].name); @@ -5169,9 +5203,10 @@ static int __set_tracer_option(struct trace_array *tr, struct tracer_opt *opts, int neg) { struct tracer *trace = tracer_flags->trace; - int ret; + int ret = 0; - ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg); + if (trace->set_flag) + ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg); if (ret) return ret; @@ -5185,37 +5220,41 @@ static int __set_tracer_option(struct trace_array *tr, /* Try to assign a tracer specific option */ static int set_tracer_option(struct trace_array *tr, char *cmp, int neg) { - struct tracer *trace = tr->current_trace; - struct tracer_flags *tracer_flags = trace->flags; + 
struct tracer_flags *tracer_flags = tr->current_trace_flags; struct tracer_opt *opts = NULL; int i; + if (!tracer_flags || !tracer_flags->opts) + return 0; + for (i = 0; tracer_flags->opts[i].name; i++) { opts = &tracer_flags->opts[i]; if (strcmp(cmp, opts->name) == 0) - return __set_tracer_option(tr, trace->flags, opts, neg); + return __set_tracer_option(tr, tracer_flags, opts, neg); } return -EINVAL; } /* Some tracers require overwrite to stay enabled */ -int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) +int trace_keep_overwrite(struct tracer *tracer, u64 mask, int set) { - if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set) + if (tracer->enabled && (mask & TRACE_ITER(OVERWRITE)) && !set) return -1; return 0; } -int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) +int set_tracer_flag(struct trace_array *tr, u64 mask, int enabled) { - if ((mask == TRACE_ITER_RECORD_TGID) || - (mask == TRACE_ITER_RECORD_CMD) || - (mask == TRACE_ITER_TRACE_PRINTK) || - (mask == TRACE_ITER_COPY_MARKER)) + switch (mask) { + case TRACE_ITER(RECORD_TGID): + case TRACE_ITER(RECORD_CMD): + case TRACE_ITER(TRACE_PRINTK): + case TRACE_ITER(COPY_MARKER): lockdep_assert_held(&event_mutex); + } /* do nothing if flag is already set */ if (!!(tr->trace_flags & mask) == !!enabled) @@ -5226,7 +5265,8 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) if (tr->current_trace->flag_changed(tr, mask, !!enabled)) return -EINVAL; - if (mask == TRACE_ITER_TRACE_PRINTK) { + switch (mask) { + case TRACE_ITER(TRACE_PRINTK): if (enabled) { update_printk_trace(tr); } else { @@ -5238,50 +5278,64 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) return -EINVAL; /* * An instance must always have it set. - * by default, that's the global_trace instane. + * by default, that's the global_trace instance. 
*/ if (printk_trace == tr) update_printk_trace(&global_trace); } - } + break; - if (mask == TRACE_ITER_COPY_MARKER) + case TRACE_ITER(COPY_MARKER): update_marker_trace(tr, enabled); + /* update_marker_trace updates the tr->trace_flags */ + return 0; + } if (enabled) tr->trace_flags |= mask; else tr->trace_flags &= ~mask; - if (mask == TRACE_ITER_RECORD_CMD) + switch (mask) { + case TRACE_ITER(RECORD_CMD): trace_event_enable_cmd_record(enabled); + break; - if (mask == TRACE_ITER_RECORD_TGID) { + case TRACE_ITER(RECORD_TGID): if (trace_alloc_tgid_map() < 0) { - tr->trace_flags &= ~TRACE_ITER_RECORD_TGID; + tr->trace_flags &= ~TRACE_ITER(RECORD_TGID); return -ENOMEM; } trace_event_enable_tgid_record(enabled); - } + break; - if (mask == TRACE_ITER_EVENT_FORK) + case TRACE_ITER(EVENT_FORK): trace_event_follow_fork(tr, enabled); + break; - if (mask == TRACE_ITER_FUNC_FORK) + case TRACE_ITER(FUNC_FORK): ftrace_pid_follow_fork(tr, enabled); + break; - if (mask == TRACE_ITER_OVERWRITE) { + case TRACE_ITER(OVERWRITE): ring_buffer_change_overwrite(tr->array_buffer.buffer, enabled); #ifdef CONFIG_TRACER_MAX_TRACE ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled); #endif - } + break; - if (mask == TRACE_ITER_PRINTK) { + case TRACE_ITER(PRINTK): trace_printk_start_stop_comm(enabled); trace_printk_control(enabled); + break; + +#if defined(CONFIG_FUNCTION_PROFILER) && defined(CONFIG_FUNCTION_GRAPH_TRACER) + case TRACE_GRAPH_GRAPH_TIME: + ftrace_graph_graph_time_control(enabled); + break; +#endif } return 0; @@ -5311,7 +5365,7 @@ int trace_set_options(struct trace_array *tr, char *option) if (ret < 0) ret = set_tracer_option(tr, cmp, neg); else - ret = set_tracer_flag(tr, 1 << ret, !neg); + ret = set_tracer_flag(tr, 1ULL << ret, !neg); mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); @@ -6215,11 +6269,6 @@ int tracing_update_buffers(struct trace_array *tr) return ret; } -struct trace_option_dentry; - -static void -create_trace_option_files(struct trace_array 
*tr, struct tracer *tracer); - /* * Used to clear out the tracer before deletion of an instance. * Must have trace_types_lock held. @@ -6235,26 +6284,15 @@ static void tracing_set_nop(struct trace_array *tr) tr->current_trace->reset(tr); tr->current_trace = &nop_trace; + tr->current_trace_flags = nop_trace.flags; } static bool tracer_options_updated; -static void add_tracer_options(struct trace_array *tr, struct tracer *t) -{ - /* Only enable if the directory has been created already. */ - if (!tr->dir && !(tr->flags & TRACE_ARRAY_FL_GLOBAL)) - return; - - /* Only create trace option files after update_tracer_options finish */ - if (!tracer_options_updated) - return; - - create_trace_option_files(tr, t); -} - int tracing_set_tracer(struct trace_array *tr, const char *buf) { - struct tracer *t; + struct tracer *trace = NULL; + struct tracers *t; #ifdef CONFIG_TRACER_MAX_TRACE bool had_max_tr; #endif @@ -6272,18 +6310,20 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) ret = 0; } - for (t = trace_types; t; t = t->next) { - if (strcmp(t->name, buf) == 0) + list_for_each_entry(t, &tr->tracers, list) { + if (strcmp(t->tracer->name, buf) == 0) { + trace = t->tracer; break; + } } - if (!t) + if (!trace) return -EINVAL; - if (t == tr->current_trace) + if (trace == tr->current_trace) return 0; #ifdef CONFIG_TRACER_SNAPSHOT - if (t->use_max_tr) { + if (trace->use_max_tr) { local_irq_disable(); arch_spin_lock(&tr->max_lock); ret = tr->cond_snapshot ? 
-EBUSY : 0; @@ -6294,14 +6334,14 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) } #endif /* Some tracers won't work on kernel command line */ - if (system_state < SYSTEM_RUNNING && t->noboot) { + if (system_state < SYSTEM_RUNNING && trace->noboot) { pr_warn("Tracer '%s' is not allowed on command line, ignored\n", - t->name); + trace->name); return -EINVAL; } /* Some tracers are only allowed for the top level buffer */ - if (!trace_ok_for_array(t, tr)) + if (!trace_ok_for_array(trace, tr)) return -EINVAL; /* If trace pipe files are being read, we can't change the tracer */ @@ -6320,8 +6360,9 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) /* Current trace needs to be nop_trace before synchronize_rcu */ tr->current_trace = &nop_trace; + tr->current_trace_flags = nop_trace.flags; - if (had_max_tr && !t->use_max_tr) { + if (had_max_tr && !trace->use_max_tr) { /* * We need to make sure that the update_max_tr sees that * current_trace changed to nop_trace to keep it from @@ -6334,7 +6375,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) tracing_disarm_snapshot(tr); } - if (!had_max_tr && t->use_max_tr) { + if (!had_max_tr && trace->use_max_tr) { ret = tracing_arm_snapshot_locked(tr); if (ret) return ret; @@ -6343,18 +6384,21 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) tr->current_trace = &nop_trace; #endif - if (t->init) { - ret = tracer_init(t, tr); + tr->current_trace_flags = t->flags ? 
: t->tracer->flags; + + if (trace->init) { + ret = tracer_init(trace, tr); if (ret) { #ifdef CONFIG_TRACER_MAX_TRACE - if (t->use_max_tr) + if (trace->use_max_tr) tracing_disarm_snapshot(tr); #endif + tr->current_trace_flags = nop_trace.flags; return ret; } } - tr->current_trace = t; + tr->current_trace = trace; tr->current_trace->enabled++; trace_branch_enable(tr); @@ -6532,7 +6576,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) /* trace pipe does not show start of buffer */ cpumask_setall(iter->started); - if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) + if (tr->trace_flags & TRACE_ITER(LATENCY_FMT)) iter->iter_flags |= TRACE_FILE_LAT_FMT; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ @@ -6593,7 +6637,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl if (trace_buffer_iter(iter, iter->cpu_file)) return EPOLLIN | EPOLLRDNORM; - if (tr->trace_flags & TRACE_ITER_BLOCK) + if (tr->trace_flags & TRACE_ITER(BLOCK)) /* * Always select as readable when in blocking mode */ @@ -6912,6 +6956,43 @@ out_err: } static ssize_t +tracing_syscall_buf_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; + char buf[64]; + int r; + + r = snprintf(buf, 64, "%d\n", tr->syscall_buf_sz); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +tracing_syscall_buf_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + if (val > SYSCALL_FAULT_USER_MAX) + val = SYSCALL_FAULT_USER_MAX; + + tr->syscall_buf_sz = val; + + *ppos += cnt; + + return cnt; +} + +static ssize_t tracing_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { 
@@ -7145,7 +7226,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) struct trace_array *tr = inode->i_private; /* disable tracing ? */ - if (tr->trace_flags & TRACE_ITER_STOP_ON_FREE) + if (tr->trace_flags & TRACE_ITER(STOP_ON_FREE)) tracer_tracing_off(tr); /* resize the ring buffer to 0 */ tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); @@ -7223,52 +7304,43 @@ struct trace_user_buf { char *buf; }; -struct trace_user_buf_info { - struct trace_user_buf __percpu *tbuf; - int ref; -}; - - static DEFINE_MUTEX(trace_user_buffer_mutex); static struct trace_user_buf_info *trace_user_buffer; -static void trace_user_fault_buffer_free(struct trace_user_buf_info *tinfo) +/** + * trace_user_fault_destroy - free up allocated memory of a trace user buffer + * @tinfo: The descriptor to free up + * + * Frees any data allocated in the trace info descriptor. + */ +void trace_user_fault_destroy(struct trace_user_buf_info *tinfo) { char *buf; int cpu; + if (!tinfo || !tinfo->tbuf) + return; + for_each_possible_cpu(cpu) { buf = per_cpu_ptr(tinfo->tbuf, cpu)->buf; kfree(buf); } free_percpu(tinfo->tbuf); - kfree(tinfo); } -static int trace_user_fault_buffer_enable(void) +static int user_fault_buffer_enable(struct trace_user_buf_info *tinfo, size_t size) { - struct trace_user_buf_info *tinfo; char *buf; int cpu; - guard(mutex)(&trace_user_buffer_mutex); - - if (trace_user_buffer) { - trace_user_buffer->ref++; - return 0; - } - - tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); - if (!tinfo) - return -ENOMEM; + lockdep_assert_held(&trace_user_buffer_mutex); tinfo->tbuf = alloc_percpu(struct trace_user_buf); - if (!tinfo->tbuf) { - kfree(tinfo); + if (!tinfo->tbuf) return -ENOMEM; - } tinfo->ref = 1; + tinfo->size = size; /* Clear each buffer in case of error */ for_each_possible_cpu(cpu) { @@ -7276,42 +7348,165 @@ static int trace_user_fault_buffer_enable(void) } for_each_possible_cpu(cpu) { - buf = kmalloc_node(TRACE_MARKER_MAX_SIZE, GFP_KERNEL, + buf = 
kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); - if (!buf) { - trace_user_fault_buffer_free(tinfo); + if (!buf) return -ENOMEM; - } per_cpu_ptr(tinfo->tbuf, cpu)->buf = buf; } - trace_user_buffer = tinfo; - return 0; } -static void trace_user_fault_buffer_disable(void) +/* For internal use. Free and reinitialize */ +static void user_buffer_free(struct trace_user_buf_info **tinfo) { - struct trace_user_buf_info *tinfo; + lockdep_assert_held(&trace_user_buffer_mutex); - guard(mutex)(&trace_user_buffer_mutex); + trace_user_fault_destroy(*tinfo); + kfree(*tinfo); + *tinfo = NULL; +} + +/* For internal use. Initialize and allocate */ +static int user_buffer_init(struct trace_user_buf_info **tinfo, size_t size) +{ + bool alloc = false; + int ret; + + lockdep_assert_held(&trace_user_buffer_mutex); - tinfo = trace_user_buffer; + if (!*tinfo) { + alloc = true; + *tinfo = kzalloc(sizeof(**tinfo), GFP_KERNEL); + if (!*tinfo) + return -ENOMEM; + } + + ret = user_fault_buffer_enable(*tinfo, size); + if (ret < 0 && alloc) + user_buffer_free(tinfo); + + return ret; +} - if (WARN_ON_ONCE(!tinfo)) +/* For internal use, dereference and free if necessary */ +static void user_buffer_put(struct trace_user_buf_info **tinfo) +{ + guard(mutex)(&trace_user_buffer_mutex); + + if (WARN_ON_ONCE(!*tinfo || !(*tinfo)->ref)) return; - if (--tinfo->ref) + if (--(*tinfo)->ref) return; - trace_user_fault_buffer_free(tinfo); - trace_user_buffer = NULL; + user_buffer_free(tinfo); } -/* Must be called with preemption disabled */ -static char *trace_user_fault_read(struct trace_user_buf_info *tinfo, - const char __user *ptr, size_t size, - size_t *read_size) +/** + * trace_user_fault_init - Allocate or reference a per CPU buffer + * @tinfo: A pointer to the trace buffer descriptor + * @size: The size to allocate each per CPU buffer + * + * Create a per CPU buffer that can be used to copy from user space + * in a task context.
When calling trace_user_fault_read(), preemption + * must be disabled, and it will enable preemption and copy user + * space data to the buffer. If any schedule switches occur, it will + * retry until it succeeds without a schedule switch knowing the buffer + * is still valid. + * + * Returns 0 on success, negative on failure. + */ +int trace_user_fault_init(struct trace_user_buf_info *tinfo, size_t size) +{ + int ret; + + if (!tinfo) + return -EINVAL; + + guard(mutex)(&trace_user_buffer_mutex); + + ret = user_buffer_init(&tinfo, size); + if (ret < 0) + trace_user_fault_destroy(tinfo); + + return ret; +} + +/** + * trace_user_fault_get - up the ref count for the user buffer + * @tinfo: A pointer to the trace buffer descriptor + * + * Ups the ref count of the trace buffer. + * + * Returns the new ref count. + */ +int trace_user_fault_get(struct trace_user_buf_info *tinfo) +{ + if (!tinfo) + return -1; + + guard(mutex)(&trace_user_buffer_mutex); + + tinfo->ref++; + return tinfo->ref; +} + +/** + * trace_user_fault_put - dereference a per cpu trace buffer + * @tinfo: The @tinfo that was passed to trace_user_fault_get() + * + * Decrement the ref count of @tinfo. + * + * Returns the new refcount (negative on error). + */ +int trace_user_fault_put(struct trace_user_buf_info *tinfo) +{ + guard(mutex)(&trace_user_buffer_mutex); + + if (WARN_ON_ONCE(!tinfo || !tinfo->ref)) + return -1; + + --tinfo->ref; + return tinfo->ref; +} + +/** + * trace_user_fault_read - Read user space into a per CPU buffer + * @tinfo: The @tinfo initialized by trace_user_fault_init() + * @ptr: The user space pointer to read + * @size: The size of user space to read. + * @copy_func: Optional function to use to copy from user space + * @data: Data to pass to copy_func if it was supplied + * + * Preemption must be disabled when this is called, and must not + * be enabled while using the returned buffer. + * This does the copying from user space into a per CPU buffer.
+ * + * The @size must not be greater than the size passed in to + * trace_user_fault_init(). + * + * If @copy_func is NULL, trace_user_fault_read() will use copy_from_user(), + * otherwise it will call @copy_func. It will call @copy_func with: + * + * buffer: the per CPU buffer of the @tinfo. + * ptr: The pointer @ptr to user space to read + * size: The @size of the ptr to read + * data: The @data parameter + * + * It is expected that @copy_func will return 0 on success and non zero + * if there was a fault. + * + * Returns a pointer to the buffer with the content read from @ptr. + * Preemption must remain disabled while the caller accesses the + * buffer returned by this function. + * Returns NULL if there was a fault, or the size passed in is + * greater than the size passed to trace_user_fault_init(). + */ +char *trace_user_fault_read(struct trace_user_buf_info *tinfo, + const char __user *ptr, size_t size, + trace_user_buf_copy copy_func, void *data) { int cpu = smp_processor_id(); char *buffer = per_cpu_ptr(tinfo->tbuf, cpu)->buf; @@ -7319,9 +7514,14 @@ static char *trace_user_fault_read(struct trace_user_buf_info *tinfo, int trys = 0; int ret; - if (size > TRACE_MARKER_MAX_SIZE) - size = TRACE_MARKER_MAX_SIZE; - *read_size = 0; + lockdep_assert_preemption_disabled(); + + /* + * It's up to the caller to not try to copy more than it said + * it would. + */ + if (size > tinfo->size) + return NULL; /* * This acts similar to a seqcount. The per CPU context switches are @@ -7356,12 +7556,19 @@ static char *trace_user_fault_read(struct trace_user_buf_info *tinfo, migrate_disable(); /* - * Now preemption is being enabed and another task can come in + * Now preemption is being enabled and another task can come in * and use the same buffer and corrupt our data. 
*/ preempt_enable_notrace(); - ret = __copy_from_user(buffer, ptr, size); + /* Make sure preemption is enabled here */ + lockdep_assert_preemption_enabled(); + + if (copy_func) { + ret = copy_func(buffer, ptr, size, data); + } else { + ret = __copy_from_user(buffer, ptr, size); + } preempt_disable_notrace(); migrate_enable(); @@ -7378,7 +7585,6 @@ static char *trace_user_fault_read(struct trace_user_buf_info *tinfo, */ } while (nr_context_switches_cpu(cpu) != cnt); - *read_size = size; return buffer; } @@ -7389,13 +7595,12 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, struct trace_array *tr = filp->private_data; ssize_t written = -ENODEV; unsigned long ip; - size_t size; char *buf; if (tracing_disabled) return -EINVAL; - if (!(tr->trace_flags & TRACE_ITER_MARKERS)) + if (!(tr->trace_flags & TRACE_ITER(MARKERS))) return -EINVAL; if ((ssize_t)cnt < 0) @@ -7407,13 +7612,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, /* Must have preemption disabled while having access to the buffer */ guard(preempt_notrace)(); - buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, &size); + buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, NULL, NULL); if (!buf) return -EFAULT; - if (cnt > size) - cnt = size; - /* The selftests expect this function to be the IP address */ ip = _THIS_IP_; @@ -7442,7 +7644,7 @@ static ssize_t write_raw_marker_to_buffer(struct trace_array *tr, size_t size; /* cnt includes both the entry->id and the data behind it. 
*/ - size = struct_size(entry, buf, cnt - sizeof(entry->id)); + size = struct_offset(entry, id) + cnt; buffer = tr->array_buffer.buffer; @@ -7473,30 +7675,29 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, { struct trace_array *tr = filp->private_data; ssize_t written = -ENODEV; - size_t size; char *buf; if (tracing_disabled) return -EINVAL; - if (!(tr->trace_flags & TRACE_ITER_MARKERS)) + if (!(tr->trace_flags & TRACE_ITER(MARKERS))) return -EINVAL; /* The marker must at least have a tag id */ if (cnt < sizeof(unsigned int)) return -EINVAL; + /* raw write is all or nothing */ + if (cnt > TRACE_MARKER_MAX_SIZE) + return -EINVAL; + /* Must have preemption disabled while having access to the buffer */ guard(preempt_notrace)(); - buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, &size); + buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, NULL, NULL); if (!buf) return -EFAULT; - /* raw write is all or nothing */ - if (cnt > size) - return -EINVAL; - /* The global trace_marker_raw can go to multiple instances */ if (tr == &global_trace) { guard(rcu)(); @@ -7516,20 +7717,26 @@ static int tracing_mark_open(struct inode *inode, struct file *filp) { int ret; - ret = trace_user_fault_buffer_enable(); - if (ret < 0) - return ret; + scoped_guard(mutex, &trace_user_buffer_mutex) { + if (!trace_user_buffer) { + ret = user_buffer_init(&trace_user_buffer, TRACE_MARKER_MAX_SIZE); + if (ret < 0) + return ret; + } else { + trace_user_buffer->ref++; + } + } stream_open(inode, filp); ret = tracing_open_generic_tr(inode, filp); if (ret < 0) - trace_user_fault_buffer_disable(); + user_buffer_put(&trace_user_buffer); return ret; } static int tracing_mark_release(struct inode *inode, struct file *file) { - trace_user_fault_buffer_disable(); + user_buffer_put(&trace_user_buffer); return tracing_release_generic_tr(inode, file); } @@ -7917,6 +8124,14 @@ static const struct file_operations tracing_entries_fops = { .release = tracing_release_generic_tr, }; 
+static const struct file_operations tracing_syscall_buf_fops = { + .open = tracing_open_generic_tr, + .read = tracing_syscall_buf_read, + .write = tracing_syscall_buf_write, + .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, +}; + static const struct file_operations tracing_buffer_meta_fops = { .open = tracing_buffer_meta_open, .read = seq_read, @@ -8801,8 +9016,8 @@ static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) struct trace_iterator *iter = &info->iter; int ret = 0; - /* A memmap'ed buffer is not supported for user space mmap */ - if (iter->tr->flags & TRACE_ARRAY_FL_MEMMAP) + /* A memmap'ed and backup buffers are not supported for user space mmap */ + if (iter->tr->flags & (TRACE_ARRAY_FL_MEMMAP | TRACE_ARRAY_FL_VMALLOC)) return -ENODEV; ret = get_snapshot_map(iter->tr); @@ -9315,7 +9530,7 @@ trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt, get_tr_index(tr_index, &tr, &index); - if (tr->trace_flags & (1 << index)) + if (tr->trace_flags & (1ULL << index)) buf = "1\n"; else buf = "0\n"; @@ -9344,7 +9559,7 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); - ret = set_tracer_flag(tr, 1 << index, val); + ret = set_tracer_flag(tr, 1ULL << index, val); mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); @@ -9417,39 +9632,19 @@ create_trace_option_file(struct trace_array *tr, topt->entry = trace_create_file(opt->name, TRACE_MODE_WRITE, t_options, topt, &trace_options_fops); - } -static void -create_trace_option_files(struct trace_array *tr, struct tracer *tracer) +static int +create_trace_option_files(struct trace_array *tr, struct tracer *tracer, + struct tracer_flags *flags) { struct trace_option_dentry *topts; struct trace_options *tr_topts; - struct tracer_flags *flags; struct tracer_opt *opts; int cnt; - int i; - - if (!tracer) - return; - - flags = tracer->flags; if (!flags || 
!flags->opts) - return; - - /* - * If this is an instance, only create flags for tracers - * the instance may have. - */ - if (!trace_ok_for_array(tracer, tr)) - return; - - for (i = 0; i < tr->nr_topts; i++) { - /* Make sure there's no duplicate flags. */ - if (WARN_ON_ONCE(tr->topts[i].tracer->flags == tracer->flags)) - return; - } + return 0; opts = flags->opts; @@ -9458,13 +9653,13 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer) topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL); if (!topts) - return; + return 0; tr_topts = krealloc(tr->topts, sizeof(*tr->topts) * (tr->nr_topts + 1), GFP_KERNEL); if (!tr_topts) { kfree(topts); - return; + return -ENOMEM; } tr->topts = tr_topts; @@ -9479,6 +9674,97 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer) "Failed to create trace option: %s", opts[cnt].name); } + return 0; +} + +static int get_global_flags_val(struct tracer *tracer) +{ + struct tracers *t; + + list_for_each_entry(t, &global_trace.tracers, list) { + if (t->tracer != tracer) + continue; + if (!t->flags) + return -1; + return t->flags->val; + } + return -1; +} + +static int add_tracer_options(struct trace_array *tr, struct tracers *t) +{ + struct tracer *tracer = t->tracer; + struct tracer_flags *flags = t->flags ?: tracer->flags; + + if (!flags) + return 0; + + /* Only add tracer options after update_tracer_options finish */ + if (!tracer_options_updated) + return 0; + + return create_trace_option_files(tr, tracer, flags); +} + +static int add_tracer(struct trace_array *tr, struct tracer *tracer) +{ + struct tracer_flags *flags; + struct tracers *t; + int ret; + + /* Only enable if the directory has been created already. */ + if (!tr->dir && !(tr->flags & TRACE_ARRAY_FL_GLOBAL)) + return 0; + + /* + * If this is an instance, only create flags for tracers + * the instance may have. 
+ */ + if (!trace_ok_for_array(tracer, tr)) + return 0; + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return -ENOMEM; + + t->tracer = tracer; + t->flags = NULL; + list_add(&t->list, &tr->tracers); + + flags = tracer->flags; + if (!flags) { + if (!tracer->default_flags) + return 0; + + /* + * If the tracer defines default flags, it means the flags are + * per trace instance. + */ + flags = kmalloc(sizeof(*flags), GFP_KERNEL); + if (!flags) + return -ENOMEM; + + *flags = *tracer->default_flags; + flags->trace = tracer; + + t->flags = flags; + + /* If this is an instance, inherit the global_trace flags */ + if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) { + int val = get_global_flags_val(tracer); + if (!WARN_ON_ONCE(val < 0)) + flags->val = val; + } + } + + ret = add_tracer_options(tr, t); + if (ret < 0) { + list_del(&t->list); + kfree(t->flags); + kfree(t); + } + + return ret; } static struct dentry * @@ -9508,8 +9794,9 @@ static void create_trace_options_dir(struct trace_array *tr) for (i = 0; trace_options[i]; i++) { if (top_level || - !((1 << i) & TOP_LEVEL_TRACE_FLAGS)) + !((1ULL << i) & TOP_LEVEL_TRACE_FLAGS)) { create_trace_option_core_file(tr, trace_options[i], i); + } } } @@ -9830,7 +10117,7 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size struct trace_scratch *tscratch; unsigned int scratch_size = 0; - rb_flags = tr->trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + rb_flags = tr->trace_flags & TRACE_ITER(OVERWRITE) ? 
RB_FL_OVERWRITE : 0; buf->tr = tr; @@ -9928,19 +10215,39 @@ static void init_trace_flags_index(struct trace_array *tr) tr->trace_flags_index[i] = i; } -static void __update_tracer_options(struct trace_array *tr) +static int __update_tracer(struct trace_array *tr) { struct tracer *t; + int ret = 0; + + for (t = trace_types; t && !ret; t = t->next) + ret = add_tracer(tr, t); - for (t = trace_types; t; t = t->next) - add_tracer_options(tr, t); + return ret; +} + +static __init int __update_tracer_options(struct trace_array *tr) +{ + struct tracers *t; + int ret = 0; + + list_for_each_entry(t, &tr->tracers, list) { + ret = add_tracer_options(tr, t); + if (ret < 0) + break; + } + + return ret; } -static void update_tracer_options(struct trace_array *tr) +static __init void update_tracer_options(void) { + struct trace_array *tr; + guard(mutex)(&trace_types_lock); tracer_options_updated = true; - __update_tracer_options(tr); + list_for_each_entry(tr, &ftrace_trace_arrays, list) + __update_tracer_options(tr); } /* Must have trace_types_lock held */ @@ -9985,9 +10292,13 @@ static int trace_array_create_dir(struct trace_array *tr) } init_tracer_tracefs(tr, tr->dir); - __update_tracer_options(tr); - - return ret; + ret = __update_tracer(tr); + if (ret) { + event_trace_del_tracer(tr); + tracefs_remove(tr->dir); + return ret; + } + return 0; } static struct trace_array * @@ -10029,16 +10340,20 @@ trace_array_create_systems(const char *name, const char *systems, raw_spin_lock_init(&tr->start_lock); + tr->syscall_buf_sz = global_trace.syscall_buf_sz; + tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; #ifdef CONFIG_TRACER_MAX_TRACE spin_lock_init(&tr->snapshot_trigger_lock); #endif tr->current_trace = &nop_trace; + tr->current_trace_flags = nop_trace.flags; INIT_LIST_HEAD(&tr->systems); INIT_LIST_HEAD(&tr->events); INIT_LIST_HEAD(&tr->hist_vars); INIT_LIST_HEAD(&tr->err_log); + INIT_LIST_HEAD(&tr->tracers); INIT_LIST_HEAD(&tr->marker_list); #ifdef CONFIG_MODULES @@ 
-10193,7 +10508,7 @@ static int __remove_instance(struct trace_array *tr) /* Disable all the flags that were enabled coming in */ for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) { if ((1 << i) & ZEROED_TRACE_FLAGS) - set_tracer_flag(tr, 1 << i, 0); + set_tracer_flag(tr, 1ULL << i, 0); } if (printk_trace == tr) @@ -10211,11 +10526,14 @@ static int __remove_instance(struct trace_array *tr) free_percpu(tr->last_func_repeats); free_trace_buffers(tr); clear_tracing_err_log(tr); + free_tracers(tr); if (tr->range_name) { reserve_mem_release_by_name(tr->range_name); kfree(tr->range_name); } + if (tr->flags & TRACE_ARRAY_FL_VMALLOC) + vfree((void *)tr->range_addr_start); for (i = 0; i < tr->nr_topts; i++) { kfree(tr->topts[i].topts); @@ -10345,6 +10663,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer, tr, &buffer_subbuf_size_fops); + trace_create_file("syscall_user_buf_size", TRACE_MODE_WRITE, d_tracer, + tr, &tracing_syscall_buf_fops); + create_trace_options_dir(tr); #ifdef CONFIG_TRACER_MAX_TRACE @@ -10630,7 +10951,7 @@ static __init void tracer_init_tracefs_work_func(struct work_struct *work) create_trace_instances(NULL); - update_tracer_options(&global_trace); + update_tracer_options(); } static __init int tracer_init_tracefs(void) @@ -10650,7 +10971,8 @@ static __init int tracer_init_tracefs(void) tracer_init_tracefs_work_func(NULL); } - rv_init_interface(); + if (rv_init_interface()) + pr_err("RV: Error while creating the RV interface\n"); return 0; } @@ -10783,10 +11105,10 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m /* While dumping, do not allow the buffer to be enable */ tracer_tracing_disable(tr); - old_userobj = tr->trace_flags & TRACE_ITER_SYM_USEROBJ; + old_userobj = tr->trace_flags & TRACE_ITER(SYM_USEROBJ); /* don't look at user memory in panic mode */ - tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + tr->trace_flags &= 
~TRACE_ITER(SYM_USEROBJ); if (dump_mode == DUMP_ORIG) iter.cpu_file = raw_smp_processor_id(); @@ -11009,7 +11331,7 @@ __init static void do_allocate_snapshot(const char *name) /* * When allocate_snapshot is set, the next call to * allocate_trace_buffers() (called by trace_array_get_by_name()) - * will allocate the snapshot buffer. That will alse clear + * will allocate the snapshot buffer. That will also clear * this flag. */ allocate_snapshot = true; @@ -11018,6 +11340,42 @@ __init static void do_allocate_snapshot(const char *name) static inline void do_allocate_snapshot(const char *name) { } #endif +__init static int backup_instance_area(const char *backup, + unsigned long *addr, phys_addr_t *size) +{ + struct trace_array *backup_tr; + void *allocated_vaddr = NULL; + + backup_tr = trace_array_get_by_name(backup, NULL); + if (!backup_tr) { + pr_warn("Tracing: Instance %s is not found.\n", backup); + return -ENOENT; + } + + if (!(backup_tr->flags & TRACE_ARRAY_FL_BOOT)) { + pr_warn("Tracing: Instance %s is not boot mapped.\n", backup); + trace_array_put(backup_tr); + return -EINVAL; + } + + *size = backup_tr->range_addr_size; + + allocated_vaddr = vzalloc(*size); + if (!allocated_vaddr) { + pr_warn("Tracing: Failed to allocate memory for copying instance %s (size 0x%lx)\n", + backup, (unsigned long)*size); + trace_array_put(backup_tr); + return -ENOMEM; + } + + memcpy(allocated_vaddr, + (void *)backup_tr->range_addr_start, (size_t)*size); + *addr = (unsigned long)allocated_vaddr; + + trace_array_put(backup_tr); + return 0; +} + __init static void enable_instances(void) { struct trace_array *tr; @@ -11040,11 +11398,15 @@ __init static void enable_instances(void) char *flag_delim; char *addr_delim; char *rname __free(kfree) = NULL; + char *backup; tok = strsep(&curr_str, ","); - flag_delim = strchr(tok, '^'); - addr_delim = strchr(tok, '@'); + name = strsep(&tok, "="); + backup = tok; + + flag_delim = strchr(name, '^'); + addr_delim = strchr(name, '@'); if 
(addr_delim) *addr_delim++ = '\0'; @@ -11052,7 +11414,10 @@ __init static void enable_instances(void) if (flag_delim) *flag_delim++ = '\0'; - name = tok; + if (backup) { + if (backup_instance_area(backup, &addr, &size) < 0) + continue; + } if (flag_delim) { char *flag; @@ -11148,7 +11513,13 @@ __init static void enable_instances(void) tr->ref++; } - if (start) { + /* + * Backup buffers can be freed but need vfree(). + */ + if (backup) + tr->flags |= TRACE_ARRAY_FL_VMALLOC; + + if (start || backup) { tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT; tr->range_name = no_free_ptr(rname); } @@ -11242,6 +11613,7 @@ __init static int tracer_alloc_buffers(void) * just a bootstrap of current_trace anyway. */ global_trace.current_trace = &nop_trace; + global_trace.current_trace_flags = nop_trace.flags; global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; #ifdef CONFIG_TRACER_MAX_TRACE @@ -11255,10 +11627,7 @@ __init static int tracer_alloc_buffers(void) init_trace_flags_index(&global_trace); - register_tracer(&nop_trace); - - /* Function tracing may start here (via kernel command line) */ - init_function_trace(); + INIT_LIST_HEAD(&global_trace.tracers); /* All seems OK, enable tracing */ tracing_disabled = 0; @@ -11270,6 +11639,8 @@ __init static int tracer_alloc_buffers(void) global_trace.flags = TRACE_ARRAY_FL_GLOBAL; + global_trace.syscall_buf_sz = syscall_buf_size; + INIT_LIST_HEAD(&global_trace.systems); INIT_LIST_HEAD(&global_trace.events); INIT_LIST_HEAD(&global_trace.hist_vars); @@ -11277,6 +11648,11 @@ __init static int tracer_alloc_buffers(void) list_add(&global_trace.marker_list, &marker_copies); list_add(&global_trace.list, &ftrace_trace_arrays); + register_tracer(&nop_trace); + + /* Function tracing may start here (via kernel command line) */ + init_function_trace(); + apply_trace_boot_options(); register_snapshot_cmd(); @@ -11300,7 +11676,7 @@ out_free_buffer_mask: #ifdef CONFIG_FUNCTION_TRACER /* Used to set module cached ftrace 
filtering at boot up */ -__init struct trace_array *trace_get_global_array(void) +struct trace_array *trace_get_global_array(void) { return &global_trace; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 85eabb454bee..b6d42fe06115 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -22,6 +22,7 @@ #include <linux/ctype.h> #include <linux/once_lite.h> #include <linux/ftrace_regs.h> +#include <linux/llist.h> #include "pid_list.h" @@ -131,6 +132,8 @@ enum trace_type { #define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long)) #define HIST_STACKTRACE_SKIP 5 +#define SYSCALL_FAULT_USER_MAX 165 + /* * syscalls are special, and need special handling, this is why * they are not included in trace_entries.h @@ -216,7 +219,7 @@ struct array_buffer { int cpu; }; -#define TRACE_FLAGS_MAX_SIZE 32 +#define TRACE_FLAGS_MAX_SIZE 64 struct trace_options { struct tracer *tracer; @@ -390,7 +393,8 @@ struct trace_array { int buffer_percent; unsigned int n_err_log_entries; struct tracer *current_trace; - unsigned int trace_flags; + struct tracer_flags *current_trace_flags; + u64 trace_flags; unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; unsigned int flags; raw_spinlock_t start_lock; @@ -404,6 +408,7 @@ struct trace_array { struct list_head systems; struct list_head events; struct list_head marker_list; + struct list_head tracers; struct trace_event_file *trace_marker_file; cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ /* one per_cpu trace_pipe can be opened by only one user */ @@ -430,6 +435,7 @@ struct trace_array { int function_enabled; #endif int no_filter_buffering_ref; + unsigned int syscall_buf_sz; struct list_head hist_vars; #ifdef CONFIG_TRACER_SNAPSHOT struct cond_snapshot *cond_snapshot; @@ -448,6 +454,7 @@ enum { TRACE_ARRAY_FL_LAST_BOOT = BIT(2), TRACE_ARRAY_FL_MOD_INIT = BIT(3), TRACE_ARRAY_FL_MEMMAP = BIT(4), + TRACE_ARRAY_FL_VMALLOC = BIT(5), }; #ifdef CONFIG_MODULES @@ -631,9 +638,10 @@ struct tracer { 
u32 old_flags, u32 bit, int set); /* Return 0 if OK with change, else return non-zero */ int (*flag_changed)(struct trace_array *tr, - u32 mask, int set); + u64 mask, int set); struct tracer *next; struct tracer_flags *flags; + struct tracer_flags *default_flags; int enabled; bool print_max; bool allow_instances; @@ -937,8 +945,6 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) -extern void ftrace_graph_sleep_time_control(bool enable); - #ifdef CONFIG_FUNCTION_PROFILER extern void ftrace_graph_graph_time_control(bool enable); #else @@ -958,7 +964,8 @@ extern int __trace_graph_entry(struct trace_array *tr, extern int __trace_graph_retaddr_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned int trace_ctx, - unsigned long retaddr); + unsigned long retaddr, + struct ftrace_regs *fregs); extern void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned int trace_ctx, @@ -1109,7 +1116,8 @@ static inline void ftrace_graph_addr_finish(struct fgraph_ops *gops, struct ftra #endif /* CONFIG_DYNAMIC_FTRACE */ extern unsigned int fgraph_max_depth; -extern bool fgraph_sleep_time; +extern int fgraph_no_sleep_time; +extern bool fprofile_no_sleep_time; static inline bool ftrace_graph_ignore_func(struct fgraph_ops *gops, struct ftrace_graph_ent *trace) @@ -1154,11 +1162,6 @@ struct ftrace_func_command { char *params, int enable); }; extern bool ftrace_filter_param __initdata; -static inline int ftrace_trace_task(struct trace_array *tr) -{ - return this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid) != - FTRACE_PID_IGNORE; -} extern int ftrace_is_dead(void); int ftrace_create_function_files(struct trace_array *tr, struct dentry *parent); @@ -1176,10 +1179,6 @@ void ftrace_clear_pids(struct trace_array *tr); int init_function_trace(void); void ftrace_pid_follow_fork(struct trace_array *tr, 
bool enable); #else -static inline int ftrace_trace_task(struct trace_array *tr) -{ - return 1; -} static inline int ftrace_is_dead(void) { return 0; } static inline int ftrace_create_function_files(struct trace_array *tr, @@ -1345,11 +1344,11 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, # define FUNCTION_FLAGS \ C(FUNCTION, "function-trace"), \ C(FUNC_FORK, "function-fork"), -# define FUNCTION_DEFAULT_FLAGS TRACE_ITER_FUNCTION +# define FUNCTION_DEFAULT_FLAGS TRACE_ITER(FUNCTION) #else # define FUNCTION_FLAGS # define FUNCTION_DEFAULT_FLAGS 0UL -# define TRACE_ITER_FUNC_FORK 0UL +# define TRACE_ITER_FUNC_FORK_BIT -1 #endif #ifdef CONFIG_STACKTRACE @@ -1359,6 +1358,24 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, # define STACK_FLAGS #endif +#ifdef CONFIG_FUNCTION_PROFILER +# define PROFILER_FLAGS \ + C(PROF_TEXT_OFFSET, "prof-text-offset"), +# ifdef CONFIG_FUNCTION_GRAPH_TRACER +# define FPROFILE_FLAGS \ + C(GRAPH_TIME, "graph-time"), +# define FPROFILE_DEFAULT_FLAGS TRACE_ITER(GRAPH_TIME) +# else +# define FPROFILE_FLAGS +# define FPROFILE_DEFAULT_FLAGS 0UL +# endif +#else +# define PROFILER_FLAGS +# define FPROFILE_FLAGS +# define FPROFILE_DEFAULT_FLAGS 0UL +# define TRACE_ITER_PROF_TEXT_OFFSET_BIT -1 +#endif + /* * trace_iterator_flags is an enumeration that defines bit * positions into trace_flags that controls the output. 
@@ -1391,13 +1408,15 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(MARKERS, "markers"), \ C(EVENT_FORK, "event-fork"), \ C(TRACE_PRINTK, "trace_printk_dest"), \ - C(COPY_MARKER, "copy_trace_marker"),\ + C(COPY_MARKER, "copy_trace_marker"), \ C(PAUSE_ON_TRACE, "pause-on-trace"), \ C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \ FUNCTION_FLAGS \ FGRAPH_FLAGS \ STACK_FLAGS \ - BRANCH_FLAGS + BRANCH_FLAGS \ + PROFILER_FLAGS \ + FPROFILE_FLAGS /* * By defining C, we can make TRACE_FLAGS a list of bit names @@ -1413,20 +1432,17 @@ enum trace_iterator_bits { }; /* - * By redefining C, we can make TRACE_FLAGS a list of masks that - * use the bits as defined above. + * And use TRACE_ITER(flag) to define the bit masks. */ -#undef C -#define C(a, b) TRACE_ITER_##a = (1 << TRACE_ITER_##a##_BIT) - -enum trace_iterator_flags { TRACE_FLAGS }; +#define TRACE_ITER(flag) \ + (TRACE_ITER_##flag##_BIT < 0 ? 0 : 1ULL << (TRACE_ITER_##flag##_BIT)) /* * TRACE_ITER_SYM_MASK masks the options in trace_flags that * control the output of kernel symbols. 
*/ #define TRACE_ITER_SYM_MASK \ - (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) + (TRACE_ITER(PRINT_PARENT)|TRACE_ITER(SYM_OFFSET)|TRACE_ITER(SYM_ADDR)) extern struct tracer nop_trace; @@ -1435,7 +1451,7 @@ extern int enable_branch_tracing(struct trace_array *tr); extern void disable_branch_tracing(void); static inline int trace_branch_enable(struct trace_array *tr) { - if (tr->trace_flags & TRACE_ITER_BRANCH) + if (tr->trace_flags & TRACE_ITER(BRANCH)) return enable_branch_tracing(tr); return 0; } @@ -1531,6 +1547,23 @@ void trace_buffered_event_enable(void); void early_enable_events(struct trace_array *tr, char *buf, bool disable_first); +struct trace_user_buf; +struct trace_user_buf_info { + struct trace_user_buf __percpu *tbuf; + size_t size; + int ref; +}; + +typedef int (*trace_user_buf_copy)(char *dst, const char __user *src, + size_t size, void *data); +int trace_user_fault_init(struct trace_user_buf_info *tinfo, size_t size); +int trace_user_fault_get(struct trace_user_buf_info *tinfo); +int trace_user_fault_put(struct trace_user_buf_info *tinfo); +void trace_user_fault_destroy(struct trace_user_buf_info *tinfo); +char *trace_user_fault_read(struct trace_user_buf_info *tinfo, + const char __user *ptr, size_t size, + trace_user_buf_copy copy_func, void *data); + static inline void __trace_event_discard_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) @@ -1752,13 +1785,13 @@ extern void clear_event_triggers(struct trace_array *tr); enum { EVENT_TRIGGER_FL_PROBE = BIT(0), + EVENT_TRIGGER_FL_COUNT = BIT(1), }; struct event_trigger_data { unsigned long count; int ref; int flags; - const struct event_trigger_ops *ops; struct event_command *cmd_ops; struct event_filter __rcu *filter; char *filter_str; @@ -1769,6 +1802,7 @@ struct event_trigger_data { char *name; struct list_head named_list; struct event_trigger_data *named_data; + struct llist_node llist; }; /* Avoid typos */ @@ -1783,6 +1817,10 @@ struct 
enable_trigger_data { bool hist; }; +bool event_trigger_count(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event); + extern int event_enable_trigger_print(struct seq_file *m, struct event_trigger_data *data); extern void event_enable_trigger_free(struct event_trigger_data *data); @@ -1846,64 +1884,6 @@ extern void event_file_get(struct trace_event_file *file); extern void event_file_put(struct trace_event_file *file); /** - * struct event_trigger_ops - callbacks for trace event triggers - * - * The methods in this structure provide per-event trigger hooks for - * various trigger operations. - * - * The @init and @free methods are used during trigger setup and - * teardown, typically called from an event_command's @parse() - * function implementation. - * - * The @print method is used to print the trigger spec. - * - * The @trigger method is the function that actually implements the - * trigger and is called in the context of the triggering event - * whenever that event occurs. - * - * All the methods below, except for @init() and @free(), must be - * implemented. - * - * @trigger: The trigger 'probe' function called when the triggering - * event occurs. The data passed into this callback is the data - * that was supplied to the event_command @reg() function that - * registered the trigger (see struct event_command) along with - * the trace record, rec. - * - * @init: An optional initialization function called for the trigger - * when the trigger is registered (via the event_command reg() - * function). This can be used to perform per-trigger - * initialization such as incrementing a per-trigger reference - * count, for instance. This is usually implemented by the - * generic utility function @event_trigger_init() (see - * trace_event_triggers.c). - * - * @free: An optional de-initialization function called for the - * trigger when the trigger is unregistered (via the - * event_command @reg() function). 
This can be used to perform - * per-trigger de-initialization such as decrementing a - * per-trigger reference count and freeing corresponding trigger - * data, for instance. This is usually implemented by the - * generic utility function @event_trigger_free() (see - * trace_event_triggers.c). - * - * @print: The callback function invoked to have the trigger print - * itself. This is usually implemented by a wrapper function - * that calls the generic utility function @event_trigger_print() - * (see trace_event_triggers.c). - */ -struct event_trigger_ops { - void (*trigger)(struct event_trigger_data *data, - struct trace_buffer *buffer, - void *rec, - struct ring_buffer_event *rbe); - int (*init)(struct event_trigger_data *data); - void (*free)(struct event_trigger_data *data); - int (*print)(struct seq_file *m, - struct event_trigger_data *data); -}; - -/** * struct event_command - callbacks and data members for event commands * * Event commands are invoked by users by writing the command name @@ -1952,7 +1932,7 @@ struct event_trigger_ops { * * @reg: Adds the trigger to the list of triggers associated with the * event, and enables the event trigger itself, after - * initializing it (via the event_trigger_ops @init() function). + * initializing it (via the event_command @init() function). * This is also where commands can use the @trigger_type value to * make the decision as to whether or not multiple instances of * the trigger should be allowed. This is usually implemented by @@ -1961,7 +1941,7 @@ struct event_trigger_ops { * * @unreg: Removes the trigger from the list of triggers associated * with the event, and disables the event trigger itself, after - * initializing it (via the event_trigger_ops @free() function). + * initializing it (via the event_command @free() function). * This is usually implemented by the generic utility function * @unregister_trigger() (see trace_event_triggers.c). * @@ -1975,12 +1955,41 @@ struct event_trigger_ops { * ignored. 
This is usually implemented by the generic utility * function @set_trigger_filter() (see trace_event_triggers.c). * - * @get_trigger_ops: The callback function invoked to retrieve the - * event_trigger_ops implementation associated with the command. - * This callback function allows a single event_command to - * support multiple trigger implementations via different sets of - * event_trigger_ops, depending on the value of the @param - * string. + * All the methods below, except for @init() and @free(), must be + * implemented. + * + * @trigger: The trigger 'probe' function called when the triggering + * event occurs. The data passed into this callback is the data + * that was supplied to the event_command @reg() function that + * registered the trigger (see struct event_command) along with + * the trace record, rec. + * + * @count_func: If defined and a numeric parameter is passed to the + * trigger, then this function will be called before @trigger + * is called. If this function returns false, then @trigger is not + * executed. + * + * @init: An optional initialization function called for the trigger + * when the trigger is registered (via the event_command reg() + * function). This can be used to perform per-trigger + * initialization such as incrementing a per-trigger reference + * count, for instance. This is usually implemented by the + * generic utility function @event_trigger_init() (see + * trace_event_triggers.c). + * + * @free: An optional de-initialization function called for the + * trigger when the trigger is unregistered (via the + * event_command @reg() function). This can be used to perform + * per-trigger de-initialization such as decrementing a + * per-trigger reference count and freeing corresponding trigger + * data, for instance. This is usually implemented by the + * generic utility function @event_trigger_free() (see + * trace_event_triggers.c). + * + * @print: The callback function invoked to have the trigger print + * itself. 
This is usually implemented by a wrapper function + * that calls the generic utility function @event_trigger_print() + * (see trace_event_triggers.c). */ struct event_command { struct list_head list; @@ -2001,7 +2010,18 @@ struct event_command { int (*set_filter)(char *filter_str, struct event_trigger_data *data, struct trace_event_file *file); - const struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); + void (*trigger)(struct event_trigger_data *data, + struct trace_buffer *buffer, + void *rec, + struct ring_buffer_event *rbe); + bool (*count_func)(struct event_trigger_data *data, + struct trace_buffer *buffer, + void *rec, + struct ring_buffer_event *rbe); + int (*init)(struct event_trigger_data *data); + void (*free)(struct event_trigger_data *data); + int (*print)(struct seq_file *m, + struct event_trigger_data *data); }; /** @@ -2022,7 +2042,7 @@ struct event_command { * either committed or discarded. At that point, if any commands * have deferred their triggers, those commands are finally * invoked following the close of the current event. In other - * words, if the event_trigger_ops @func() probe implementation + * words, if the event_command @func() probe implementation * itself logs to the trace buffer, this flag should be set, * otherwise it can be left unspecified. 
* @@ -2064,8 +2084,8 @@ extern const char *__stop___tracepoint_str[]; void trace_printk_control(bool enabled); void trace_printk_start_comm(void); -int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); -int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); +int trace_keep_overwrite(struct tracer *tracer, u64 mask, int set); +int set_tracer_flag(struct trace_array *tr, u64 mask, int enabled); /* Used from boot time tracer */ extern int trace_set_options(struct trace_array *tr, char *option); @@ -2248,4 +2268,25 @@ static inline int rv_init_interface(void) */ #define FTRACE_TRAMPOLINE_MARKER ((unsigned long) INT_MAX) +/* + * This is used to get the address of the args array based on + * the type of the entry. + */ +#define FGRAPH_ENTRY_ARGS(e) \ + ({ \ + unsigned long *_args; \ + struct ftrace_graph_ent_entry *_e = e; \ + \ + if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && \ + e->ent.type == TRACE_GRAPH_RETADDR_ENT) { \ + struct fgraph_retaddr_ent_entry *_re; \ + \ + _re = (typeof(_re))_e; \ + _args = _re->args; \ + } else { \ + _args = _e->args; \ + } \ + _args; \ + }) + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index d06854bd32b3..c4dfbc293bae 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -144,9 +144,16 @@ static int create_dyn_event(const char *raw_command) if (!ret || ret != -ECANCELED) break; } - mutex_unlock(&dyn_event_ops_mutex); - if (ret == -ECANCELED) + if (ret == -ECANCELED) { + static const char *err_msg[] = {"No matching dynamic event type"}; + + /* Wrong dynamic event. Leave an error message. 
*/ + tracing_log_err(NULL, "dynevent", raw_command, err_msg, + 0, 0); ret = -EINVAL; + } + + mutex_unlock(&dyn_event_ops_mutex); return ret; } diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index de294ae2c5c5..f6a8d29c0d76 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -80,11 +80,11 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, F_STRUCT( __field_struct( struct ftrace_graph_ent, graph_ent ) __field_packed( unsigned long, graph_ent, func ) - __field_packed( unsigned int, graph_ent, depth ) + __field_packed( unsigned long, graph_ent, depth ) __dynamic_array(unsigned long, args ) ), - F_printk("--> %ps (%u)", (void *)__entry->func, __entry->depth) + F_printk("--> %ps (%lu)", (void *)__entry->func, __entry->depth) ); #ifdef CONFIG_FUNCTION_GRAPH_RETADDR @@ -95,13 +95,14 @@ FTRACE_ENTRY_PACKED(fgraph_retaddr_entry, fgraph_retaddr_ent_entry, TRACE_GRAPH_RETADDR_ENT, F_STRUCT( - __field_struct( struct fgraph_retaddr_ent, graph_ent ) - __field_packed( unsigned long, graph_ent, func ) - __field_packed( unsigned int, graph_ent, depth ) - __field_packed( unsigned long, graph_ent, retaddr ) + __field_struct( struct fgraph_retaddr_ent, graph_rent ) + __field_packed( unsigned long, graph_rent.ent, func ) + __field_packed( unsigned long, graph_rent.ent, depth ) + __field_packed( unsigned long, graph_rent, retaddr ) + __dynamic_array(unsigned long, args ) ), - F_printk("--> %ps (%u) <- %ps", (void *)__entry->func, __entry->depth, + F_printk("--> %ps (%lu) <- %ps", (void *)__entry->func, __entry->depth, (void *)__entry->retaddr) ); diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index a1d402124836..3ee39715d5e4 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -61,6 +61,9 @@ static void trace_event_probe_cleanup(struct trace_eprobe *ep) kfree(ep); } +DEFINE_FREE(trace_event_probe_cleanup, struct trace_eprobe *, + if (!IS_ERR_OR_NULL(_T)) 
trace_event_probe_cleanup(_T)) + static struct trace_eprobe *to_trace_eprobe(struct dyn_event *ev) { return container_of(ev, struct trace_eprobe, devent); @@ -197,10 +200,10 @@ static struct trace_eprobe *alloc_event_probe(const char *group, struct trace_event_call *event, int nargs) { - struct trace_eprobe *ep; + struct trace_eprobe *ep __free(trace_event_probe_cleanup) = NULL; const char *event_name; const char *sys_name; - int ret = -ENOMEM; + int ret; if (!event) return ERR_PTR(-ENODEV); @@ -211,25 +214,22 @@ static struct trace_eprobe *alloc_event_probe(const char *group, ep = kzalloc(struct_size(ep, tp.args, nargs), GFP_KERNEL); if (!ep) { trace_event_put_ref(event); - goto error; + return ERR_PTR(-ENOMEM); } ep->event = event; ep->event_name = kstrdup(event_name, GFP_KERNEL); if (!ep->event_name) - goto error; + return ERR_PTR(-ENOMEM); ep->event_system = kstrdup(sys_name, GFP_KERNEL); if (!ep->event_system) - goto error; + return ERR_PTR(-ENOMEM); ret = trace_probe_init(&ep->tp, this_event, group, false, nargs); if (ret < 0) - goto error; + return ERR_PTR(ret); dyn_event_init(&ep->devent, &eprobe_dyn_event_ops); - return ep; -error: - trace_event_probe_cleanup(ep); - return ERR_PTR(ret); + return_ptr(ep); } static int eprobe_event_define_fields(struct trace_event_call *event_call) @@ -484,13 +484,6 @@ static void eprobe_trigger_func(struct event_trigger_data *data, __eprobe_trace_func(edata, rec); } -static const struct event_trigger_ops eprobe_trigger_ops = { - .trigger = eprobe_trigger_func, - .print = eprobe_trigger_print, - .init = eprobe_trigger_init, - .free = eprobe_trigger_free, -}; - static int eprobe_trigger_cmd_parse(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, char *cmd, @@ -513,12 +506,6 @@ static void eprobe_trigger_unreg_func(char *glob, } -static const struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd, - char *param) -{ - return &eprobe_trigger_ops; -} - static struct event_command event_trigger_cmd = { 
.name = "eprobe", .trigger_type = ETT_EVENT_EPROBE, @@ -527,8 +514,11 @@ static struct event_command event_trigger_cmd = { .reg = eprobe_trigger_reg_func, .unreg = eprobe_trigger_unreg_func, .unreg_all = NULL, - .get_trigger_ops = eprobe_trigger_get_ops, .set_filter = NULL, + .trigger = eprobe_trigger_func, + .print = eprobe_trigger_print, + .init = eprobe_trigger_init, + .free = eprobe_trigger_free, }; static struct event_trigger_data * @@ -548,7 +538,6 @@ new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file) trigger->flags = EVENT_TRIGGER_FL_PROBE; trigger->count = -1; - trigger->ops = &eprobe_trigger_ops; /* * EVENT PROBE triggers are not registered as commands with @@ -801,25 +790,6 @@ find_and_get_event(const char *system, const char *event_name) return NULL; } -static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i) -{ - struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL; - int ret; - - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) - return -ENOMEM; - ctx->event = ep->event; - ctx->flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT; - - ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], ctx); - /* Handle symbols "@" */ - if (!ret) - ret = traceprobe_update_arg(&ep->tp.args[i]); - - return ret; -} - static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const char *argv[]) { struct event_filter *dummy = NULL; @@ -856,13 +826,10 @@ static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const ch ret = create_event_filter(top_trace_array(), ep->event, ep->filter_str, true, &dummy); free_event_filter(dummy); - if (ret) - goto error; - - return 0; -error: - kfree(ep->filter_str); - ep->filter_str = NULL; + if (ret) { + kfree(ep->filter_str); + ep->filter_str = NULL; + } return ret; } @@ -874,31 +841,33 @@ static int __trace_eprobe_create(int argc, const char *argv[]) * Fetch args (no space): * <name>=$<field>[:TYPE] */ + struct 
traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL; + struct trace_eprobe *ep __free(trace_event_probe_cleanup) = NULL; + const char *trlog __free(trace_probe_log_clear) = NULL; const char *event = NULL, *group = EPROBE_EVENT_SYSTEM; const char *sys_event = NULL, *sys_name = NULL; struct trace_event_call *event_call; char *buf1 __free(kfree) = NULL; char *buf2 __free(kfree) = NULL; char *gbuf __free(kfree) = NULL; - struct trace_eprobe *ep = NULL; int ret = 0, filter_idx = 0; int i, filter_cnt; if (argc < 2 || argv[0][0] != 'e') return -ECANCELED; - trace_probe_log_init("event_probe", argc, argv); + trlog = trace_probe_log_init("event_probe", argc, argv); event = strchr(&argv[0][1], ':'); if (event) { gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); if (!gbuf) - goto mem_error; + return -ENOMEM; event++; ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) - goto parse_error; + return -EINVAL; } trace_probe_log_set_index(1); @@ -906,18 +875,18 @@ static int __trace_eprobe_create(int argc, const char *argv[]) buf2 = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); if (!buf2) - goto mem_error; + return -ENOMEM; ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, 0); if (ret || !sys_event || !sys_name) { trace_probe_log_err(0, NO_EVENT_INFO); - goto parse_error; + return -EINVAL; } if (!event) { buf1 = kstrdup(sys_event, GFP_KERNEL); if (!buf1) - goto mem_error; + return -ENOMEM; event = buf1; } @@ -933,8 +902,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) if (argc - 2 > MAX_TRACE_ARGS) { trace_probe_log_set_index(2); trace_probe_log_err(0, TOO_MANY_ARGS); - ret = -E2BIG; - goto error; + return -E2BIG; } scoped_guard(mutex, &event_mutex) { @@ -948,29 +916,39 @@ static int __trace_eprobe_create(int argc, const char *argv[]) trace_probe_log_err(0, BAD_ATTACH_EVENT); /* This must return -ENOMEM or missing event, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM && ret != -ENODEV); - ep = NULL; - 
goto error; + return ret; } if (filter_idx) { trace_probe_log_set_index(filter_idx); ret = trace_eprobe_parse_filter(ep, filter_cnt, argv + filter_idx); if (ret) - goto parse_error; + return -EINVAL; } else ep->filter_str = NULL; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + ctx->event = ep->event; + ctx->flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT; + argc -= 2; argv += 2; /* parse arguments */ for (i = 0; i < argc; i++) { trace_probe_log_set_index(i + 2); - ret = trace_eprobe_tp_update_arg(ep, argv, i); + + ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], ctx); + /* Handle symbols "@" */ + if (!ret) + ret = traceprobe_update_arg(&ep->tp.args[i]); if (ret) - goto error; + return ret; } ret = traceprobe_set_print_fmt(&ep->tp, PROBE_PRINT_EVENT); if (ret < 0) - goto error; + return ret; + init_trace_eprobe_call(ep); scoped_guard(mutex, &event_mutex) { ret = trace_probe_register_event_call(&ep->tp); @@ -979,25 +957,16 @@ static int __trace_eprobe_create(int argc, const char *argv[]) trace_probe_log_set_index(0); trace_probe_log_err(0, EVENT_EXIST); } - goto error; + return ret; } ret = dyn_event_add(&ep->devent, &ep->tp.event->call); if (ret < 0) { trace_probe_unregister_event_call(&ep->tp); - goto error; + return ret; } + /* To avoid freeing registered eprobe event, clear ep. 
*/ + ep = NULL; } - trace_probe_log_clear(); - return ret; - -mem_error: - ret = -ENOMEM; - goto error; -parse_error: - ret = -EINVAL; -error: - trace_probe_log_clear(); - trace_event_probe_cleanup(ep); return ret; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e00da4182deb..b16a5a158040 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -360,7 +360,7 @@ static bool process_string(const char *fmt, int len, struct trace_event_call *ca /* Anything else, this isn't a function */ break; } - /* A function could be wrapped in parethesis, try the next one */ + /* A function could be wrapped in parenthesis, try the next one */ s = r + 1; } while (s < e); @@ -567,7 +567,7 @@ static void test_event_printk(struct trace_event_call *call) * If start_arg is zero, then this is the start of the * first argument. The processing of the argument happens * when the end of the argument is found, as it needs to - * handle paranthesis and such. + * handle parenthesis and such. */ if (!start_arg) { start_arg = i; @@ -785,7 +785,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, * * When soft_disable is not set but the soft_mode is, * we do nothing. Do not disable the tracepoint, otherwise - * "soft enable"s (clearing the SOFT_DISABLED bit) wont work. + * "soft enable"s (clearing the SOFT_DISABLED bit) won't work. 
*/ if (soft_disable) { if (atomic_dec_return(&file->sm_ref) > 0) @@ -845,13 +845,13 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, if (soft_disable) set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); - if (tr->trace_flags & TRACE_ITER_RECORD_CMD) { + if (tr->trace_flags & TRACE_ITER(RECORD_CMD)) { cmd = true; tracing_start_cmdline_record(); set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } - if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { + if (tr->trace_flags & TRACE_ITER(RECORD_TGID)) { tgid = true; tracing_start_tgid_record(); set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); @@ -1394,7 +1394,7 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) if (!tr) return -ENOENT; - /* Modules events can be appened with :mod:<module> */ + /* Modules events can be appended with :mod:<module> */ mod = strstr(buf, ":mod:"); if (mod) { *mod = '\0'; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 54226b48b2d1..385af8405392 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -142,7 +142,7 @@ static bool is_not(const char *str) } /** - * struct prog_entry - a singe entry in the filter program + * struct prog_entry - a single entry in the filter program * @target: Index to jump to on a branch (actually one minus the index) * @when_to_branch: The value of the result of the predicate to do a branch * @pred: The predicate to execute. diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 6bfaf1210dd2..5e6e70540eef 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5283,7 +5283,7 @@ hist_trigger_actions(struct hist_trigger_data *hist_data, * on the stack, so when the histogram trigger is initialized * a percpu array of 4 hist_pad structures is allocated. 
* This will cover every context from normal, softirq, irq and NMI - * in the very unlikely event that a tigger happens at each of + * in the very unlikely event that a trigger happens at each of * these contexts and interrupts a currently active trigger. */ struct hist_pad { @@ -5696,7 +5696,7 @@ static void hist_trigger_show(struct seq_file *m, seq_puts(m, "\n\n"); seq_puts(m, "# event histogram\n#\n# trigger info: "); - data->ops->print(m, data); + data->cmd_ops->print(m, data); seq_puts(m, "#\n\n"); hist_data = data->private_data; @@ -6018,7 +6018,7 @@ static void hist_trigger_debug_show(struct seq_file *m, seq_puts(m, "\n\n"); seq_puts(m, "# event histogram\n#\n# trigger info: "); - data->ops->print(m, data); + data->cmd_ops->print(m, data); seq_puts(m, "#\n\n"); hist_data = data->private_data; @@ -6328,20 +6328,21 @@ static void event_hist_trigger_free(struct event_trigger_data *data) free_hist_pad(); } -static const struct event_trigger_ops event_hist_trigger_ops = { - .trigger = event_hist_trigger, - .print = event_hist_trigger_print, - .init = event_hist_trigger_init, - .free = event_hist_trigger_free, -}; - static int event_hist_trigger_named_init(struct event_trigger_data *data) { + int ret; + data->ref++; save_named_trigger(data->named_data->name, data); - return event_hist_trigger_init(data->named_data); + ret = event_hist_trigger_init(data->named_data); + if (ret < 0) { + kfree(data->cmd_ops); + data->cmd_ops = &trigger_hist_cmd; + } + + return ret; } static void event_hist_trigger_named_free(struct event_trigger_data *data) @@ -6353,24 +6354,14 @@ static void event_hist_trigger_named_free(struct event_trigger_data *data) data->ref--; if (!data->ref) { + struct event_command *cmd_ops = data->cmd_ops; + del_named_trigger(data); trigger_data_free(data); + kfree(cmd_ops); } } -static const struct event_trigger_ops event_hist_trigger_named_ops = { - .trigger = event_hist_trigger, - .print = event_hist_trigger_print, - .init = event_hist_trigger_named_init, 
- .free = event_hist_trigger_named_free, -}; - -static const struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd, - char *param) -{ - return &event_hist_trigger_ops; -} - static void hist_clear(struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; @@ -6564,13 +6555,24 @@ static int hist_register_trigger(char *glob, data->paused = true; if (named_data) { + struct event_command *cmd_ops; + data->private_data = named_data->private_data; set_named_trigger_data(data, named_data); - data->ops = &event_hist_trigger_named_ops; + /* Copy the command ops and update some of the functions */ + cmd_ops = kmalloc(sizeof(*cmd_ops), GFP_KERNEL); + if (!cmd_ops) { + ret = -ENOMEM; + goto out; + } + *cmd_ops = *data->cmd_ops; + cmd_ops->init = event_hist_trigger_named_init; + cmd_ops->free = event_hist_trigger_named_free; + data->cmd_ops = cmd_ops; } - if (data->ops->init) { - ret = data->ops->init(data); + if (data->cmd_ops->init) { + ret = data->cmd_ops->init(data); if (ret < 0) goto out; } @@ -6684,8 +6686,8 @@ static void hist_unregister_trigger(char *glob, } } - if (test && test->ops->free) - test->ops->free(test); + if (test && test->cmd_ops->free) + test->cmd_ops->free(test); if (hist_data->enable_timestamps) { if (!hist_data->remove || test) @@ -6737,8 +6739,8 @@ static void hist_unreg_all(struct trace_event_file *file) update_cond_flag(file); if (hist_data->enable_timestamps) tracing_set_filter_buffering(file->tr, false); - if (test->ops->free) - test->ops->free(test); + if (test->cmd_ops->free) + test->cmd_ops->free(test); } } } @@ -6914,8 +6916,11 @@ static struct event_command trigger_hist_cmd = { .reg = hist_register_trigger, .unreg = hist_unregister_trigger, .unreg_all = hist_unreg_all, - .get_trigger_ops = event_hist_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = event_hist_trigger, + .print = event_hist_trigger_print, + .init = event_hist_trigger_init, + .free = event_hist_trigger_free, }; __init int 
register_trigger_hist_cmd(void) @@ -6947,66 +6952,6 @@ hist_enable_trigger(struct event_trigger_data *data, } } -static void -hist_enable_count_trigger(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *event) -{ - if (!data->count) - return; - - if (data->count != -1) - (data->count)--; - - hist_enable_trigger(data, buffer, rec, event); -} - -static const struct event_trigger_ops hist_enable_trigger_ops = { - .trigger = hist_enable_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - -static const struct event_trigger_ops hist_enable_count_trigger_ops = { - .trigger = hist_enable_count_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - -static const struct event_trigger_ops hist_disable_trigger_ops = { - .trigger = hist_enable_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - -static const struct event_trigger_ops hist_disable_count_trigger_ops = { - .trigger = hist_enable_count_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - -static const struct event_trigger_ops * -hist_enable_get_trigger_ops(char *cmd, char *param) -{ - const struct event_trigger_ops *ops; - bool enable; - - enable = (strcmp(cmd, ENABLE_HIST_STR) == 0); - - if (enable) - ops = param ? &hist_enable_count_trigger_ops : - &hist_enable_trigger_ops; - else - ops = param ? 
&hist_disable_count_trigger_ops : - &hist_disable_trigger_ops; - - return ops; -} - static void hist_enable_unreg_all(struct trace_event_file *file) { struct event_trigger_data *test, *n; @@ -7016,8 +6961,8 @@ static void hist_enable_unreg_all(struct trace_event_file *file) list_del_rcu(&test->list); update_cond_flag(file); trace_event_trigger_enable_disable(file, 0); - if (test->ops->free) - test->ops->free(test); + if (test->cmd_ops->free) + test->cmd_ops->free(test); } } } @@ -7029,8 +6974,12 @@ static struct event_command trigger_hist_enable_cmd = { .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .unreg_all = hist_enable_unreg_all, - .get_trigger_ops = hist_enable_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = hist_enable_trigger, + .count_func = event_trigger_count, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, }; static struct event_command trigger_hist_disable_cmd = { @@ -7040,8 +6989,12 @@ static struct event_command trigger_hist_disable_cmd = { .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .unreg_all = hist_enable_unreg_all, - .get_trigger_ops = hist_enable_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = hist_enable_trigger, + .count_func = event_trigger_count, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, }; static __init void unregister_trigger_hist_enable_disable_cmds(void) diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index f24ee61f8884..4554c458b78c 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -359,7 +359,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, fmt = synth_field_fmt(se->fields[i]->type); /* parameter types */ - if (tr && tr->trace_flags & TRACE_ITER_VERBOSE) + if (tr && tr->trace_flags & TRACE_ITER(VERBOSE)) 
trace_seq_printf(s, "%s ", fmt); snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt); @@ -375,7 +375,6 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, n_u64++; } else { trace_seq_printf(s, print_fmt, se->fields[i]->name, - STR_VAR_LEN_MAX, (char *)&entry->fields[n_u64].as_u64, i == se->n_fields - 1 ? "" : " "); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index cbfc306c0159..06b75bcfc7b8 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -6,6 +6,7 @@ */ #include <linux/security.h> +#include <linux/kthread.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/mutex.h> @@ -17,15 +18,77 @@ static LIST_HEAD(trigger_commands); static DEFINE_MUTEX(trigger_cmd_mutex); +static struct task_struct *trigger_kthread; +static struct llist_head trigger_data_free_list; +static DEFINE_MUTEX(trigger_data_kthread_mutex); + +/* Bulk garbage collection of event_trigger_data elements */ +static int trigger_kthread_fn(void *ignore) +{ + struct event_trigger_data *data, *tmp; + struct llist_node *llnodes; + + /* Once this task starts, it lives forever */ + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (llist_empty(&trigger_data_free_list)) + schedule(); + + __set_current_state(TASK_RUNNING); + + llnodes = llist_del_all(&trigger_data_free_list); + + /* make sure current triggers exit before free */ + tracepoint_synchronize_unregister(); + + llist_for_each_entry_safe(data, tmp, llnodes, llist) + kfree(data); + } + + return 0; +} + void trigger_data_free(struct event_trigger_data *data) { if (data->cmd_ops->set_filter) data->cmd_ops->set_filter(NULL, data, NULL); - /* make sure current triggers exit before free */ - tracepoint_synchronize_unregister(); + if (unlikely(!trigger_kthread)) { + guard(mutex)(&trigger_data_kthread_mutex); + /* Check again after taking mutex */ + if (!trigger_kthread) { + struct 
task_struct *kthread; + + kthread = kthread_create(trigger_kthread_fn, NULL, + "trigger_data_free"); + if (!IS_ERR(kthread)) + WRITE_ONCE(trigger_kthread, kthread); + } + } - kfree(data); + if (!trigger_kthread) { + /* Do it the slow way */ + tracepoint_synchronize_unregister(); + kfree(data); + return; + } + + llist_add(&data->llist, &trigger_data_free_list); + wake_up_process(trigger_kthread); +} + +static inline void data_ops_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) +{ + const struct event_command *cmd_ops = data->cmd_ops; + + if (data->flags & EVENT_TRIGGER_FL_COUNT) { + if (!cmd_ops->count_func(data, buffer, rec, event)) + return; + } + + cmd_ops->trigger(data, buffer, rec, event); } /** @@ -70,7 +133,7 @@ event_triggers_call(struct trace_event_file *file, if (data->paused) continue; if (!rec) { - data->ops->trigger(data, buffer, rec, event); + data_ops_trigger(data, buffer, rec, event); continue; } filter = rcu_dereference_sched(data->filter); @@ -80,7 +143,7 @@ event_triggers_call(struct trace_event_file *file, tt |= data->cmd_ops->trigger_type; continue; } - data->ops->trigger(data, buffer, rec, event); + data_ops_trigger(data, buffer, rec, event); } return tt; } @@ -122,7 +185,7 @@ event_triggers_post_call(struct trace_event_file *file, if (data->paused) continue; if (data->cmd_ops->trigger_type & tt) - data->ops->trigger(data, NULL, NULL, NULL); + data_ops_trigger(data, NULL, NULL, NULL); } } EXPORT_SYMBOL_GPL(event_triggers_post_call); @@ -191,7 +254,7 @@ static int trigger_show(struct seq_file *m, void *v) } data = list_entry(v, struct event_trigger_data, list); - data->ops->print(m, data); + data->cmd_ops->print(m, data); return 0; } @@ -245,7 +308,8 @@ int trigger_process_regex(struct trace_event_file *file, char *buff) char *command, *next; struct event_command *p; - next = buff = skip_spaces(buff); + next = buff = strim(buff); + command = strsep(&next, ": \t"); if (next) { 
next = skip_spaces(next); @@ -282,8 +346,6 @@ static ssize_t event_trigger_regex_write(struct file *file, if (IS_ERR(buf)) return PTR_ERR(buf); - strim(buf); - guard(mutex)(&event_mutex); event_file = event_file_file(file); @@ -300,13 +362,9 @@ static ssize_t event_trigger_regex_write(struct file *file, static int event_trigger_regex_release(struct inode *inode, struct file *file) { - mutex_lock(&event_mutex); - if (file->f_mode & FMODE_READ) seq_release(inode, file); - mutex_unlock(&event_mutex); - return 0; } @@ -378,7 +436,37 @@ __init int unregister_event_command(struct event_command *cmd) } /** - * event_trigger_print - Generic event_trigger_ops @print implementation + * event_trigger_count - Optional count function for event triggers + * @data: Trigger-specific data + * @buffer: The ring buffer that the event is being written to + * @rec: The trace entry for the event, NULL for unconditional invocation + * @event: The event meta data in the ring buffer + * + * For triggers that can take a count parameter that doesn't do anything + * special, they can use this function to assign to their .count_func + * field. + * + * This simply does a count down of the @data->count field. + * + * If the @data->count is greater than zero, it will decrement it. + * + * Returns false if @data->count is zero, otherwise true. 
+ */ +bool event_trigger_count(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) +{ + if (!data->count) + return false; + + if (data->count != -1) + (data->count)--; + + return true; +} + +/** + * event_trigger_print - Generic event_command @print implementation * @name: The name of the event trigger * @m: The seq_file being printed to * @data: Trigger-specific data @@ -413,7 +501,7 @@ event_trigger_print(const char *name, struct seq_file *m, } /** - * event_trigger_init - Generic event_trigger_ops @init implementation + * event_trigger_init - Generic event_command @init implementation * @data: Trigger-specific data * * Common implementation of event trigger initialization. @@ -430,7 +518,7 @@ int event_trigger_init(struct event_trigger_data *data) } /** - * event_trigger_free - Generic event_trigger_ops @free implementation + * event_trigger_free - Generic event_command @free implementation * @data: Trigger-specific data * * Common implementation of event trigger de-initialization. 
@@ -492,8 +580,8 @@ clear_event_triggers(struct trace_array *tr) list_for_each_entry_safe(data, n, &file->triggers, list) { trace_event_trigger_enable_disable(file, 0); list_del_rcu(&data->list); - if (data->ops->free) - data->ops->free(data); + if (data->cmd_ops->free) + data->cmd_ops->free(data); } } } @@ -556,8 +644,8 @@ static int register_trigger(char *glob, return -EEXIST; } - if (data->ops->init) { - ret = data->ops->init(data); + if (data->cmd_ops->init) { + ret = data->cmd_ops->init(data); if (ret < 0) return ret; } @@ -595,8 +683,8 @@ static bool try_unregister_trigger(char *glob, } if (data) { - if (data->ops->free) - data->ops->free(data); + if (data->cmd_ops->free) + data->cmd_ops->free(data); return true; } @@ -644,7 +732,7 @@ static void unregister_trigger(char *glob, * param - text following cmd and ':' and stripped of filter * filter - the optional filter text following (and including) 'if' * - * To illustrate the use of these componenents, here are some concrete + * To illustrate the use of these components, here are some concrete * examples. For the following triggers: * * echo 'traceon:5 if pid == 0' > trigger @@ -807,9 +895,13 @@ int event_trigger_separate_filter(char *param_and_filter, char **param, * @private_data: User data to associate with the event trigger * * Allocate an event_trigger_data instance and initialize it. The - * @cmd_ops are used along with the @cmd and @param to get the - * trigger_ops to assign to the event_trigger_data. @private_data can - * also be passed in and associated with the event_trigger_data. + * @cmd_ops defines how the trigger will operate. If @param is set, + * and @cmd_ops->count_func is non NULL, then the + * data->count is set to @param and before the trigger is executed, the + * @cmd_ops->count_func() is called. If that function returns + * false, the @cmd_ops->trigger() function will not be called. 
+ * @private_data can also be passed in and associated with the + * event_trigger_data. * * Use trigger_data_free() to free an event_trigger_data object. * @@ -821,18 +913,16 @@ struct event_trigger_data *trigger_data_alloc(struct event_command *cmd_ops, void *private_data) { struct event_trigger_data *trigger_data; - const struct event_trigger_ops *trigger_ops; - - trigger_ops = cmd_ops->get_trigger_ops(cmd, param); trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); if (!trigger_data) return NULL; trigger_data->count = -1; - trigger_data->ops = trigger_ops; trigger_data->cmd_ops = cmd_ops; trigger_data->private_data = private_data; + if (param && cmd_ops->count_func) + trigger_data->flags |= EVENT_TRIGGER_FL_COUNT; INIT_LIST_HEAD(&trigger_data->list); INIT_LIST_HEAD(&trigger_data->named_list); @@ -1271,31 +1361,28 @@ traceon_trigger(struct event_trigger_data *data, tracing_on(); } -static void -traceon_count_trigger(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *event) +static bool +traceon_count_func(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) { struct trace_event_file *file = data->private_data; if (file) { if (tracer_tracing_is_on(file->tr)) - return; + return false; } else { if (tracing_is_on()) - return; + return false; } if (!data->count) - return; + return false; if (data->count != -1) (data->count)--; - if (file) - tracer_tracing_on(file->tr); - else - tracing_on(); + return true; } static void @@ -1319,31 +1406,28 @@ traceoff_trigger(struct event_trigger_data *data, tracing_off(); } -static void -traceoff_count_trigger(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *event) +static bool +traceoff_count_func(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) { struct trace_event_file *file = data->private_data; if 
(file) { if (!tracer_tracing_is_on(file->tr)) - return; + return false; } else { if (!tracing_is_on()) - return; + return false; } if (!data->count) - return; + return false; if (data->count != -1) (data->count)--; - if (file) - tracer_tracing_off(file->tr); - else - tracing_off(); + return true; } static int @@ -1360,58 +1444,18 @@ traceoff_trigger_print(struct seq_file *m, struct event_trigger_data *data) data->filter_str); } -static const struct event_trigger_ops traceon_trigger_ops = { - .trigger = traceon_trigger, - .print = traceon_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static const struct event_trigger_ops traceon_count_trigger_ops = { - .trigger = traceon_count_trigger, - .print = traceon_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static const struct event_trigger_ops traceoff_trigger_ops = { - .trigger = traceoff_trigger, - .print = traceoff_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static const struct event_trigger_ops traceoff_count_trigger_ops = { - .trigger = traceoff_count_trigger, - .print = traceoff_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static const struct event_trigger_ops * -onoff_get_trigger_ops(char *cmd, char *param) -{ - const struct event_trigger_ops *ops; - - /* we register both traceon and traceoff to this callback */ - if (strcmp(cmd, "traceon") == 0) - ops = param ? &traceon_count_trigger_ops : - &traceon_trigger_ops; - else - ops = param ? 
&traceoff_count_trigger_ops : - &traceoff_trigger_ops; - - return ops; -} - static struct event_command trigger_traceon_cmd = { .name = "traceon", .trigger_type = ETT_TRACE_ONOFF, .parse = event_trigger_parse, .reg = register_trigger, .unreg = unregister_trigger, - .get_trigger_ops = onoff_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = traceon_trigger, + .count_func = traceon_count_func, + .print = traceon_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, }; static struct event_command trigger_traceoff_cmd = { @@ -1421,8 +1465,12 @@ static struct event_command trigger_traceoff_cmd = { .parse = event_trigger_parse, .reg = register_trigger, .unreg = unregister_trigger, - .get_trigger_ops = onoff_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = traceoff_trigger, + .count_func = traceoff_count_func, + .print = traceoff_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, }; #ifdef CONFIG_TRACER_SNAPSHOT @@ -1439,20 +1487,6 @@ snapshot_trigger(struct event_trigger_data *data, tracing_snapshot(); } -static void -snapshot_count_trigger(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *event) -{ - if (!data->count) - return; - - if (data->count != -1) - (data->count)--; - - snapshot_trigger(data, buffer, rec, event); -} - static int register_snapshot_trigger(char *glob, struct event_trigger_data *data, @@ -1484,34 +1518,18 @@ snapshot_trigger_print(struct seq_file *m, struct event_trigger_data *data) data->filter_str); } -static const struct event_trigger_ops snapshot_trigger_ops = { - .trigger = snapshot_trigger, - .print = snapshot_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static const struct event_trigger_ops snapshot_count_trigger_ops = { - .trigger = snapshot_count_trigger, - .print = snapshot_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static const struct 
event_trigger_ops * -snapshot_get_trigger_ops(char *cmd, char *param) -{ - return param ? &snapshot_count_trigger_ops : &snapshot_trigger_ops; -} - static struct event_command trigger_snapshot_cmd = { .name = "snapshot", .trigger_type = ETT_SNAPSHOT, .parse = event_trigger_parse, .reg = register_snapshot_trigger, .unreg = unregister_snapshot_trigger, - .get_trigger_ops = snapshot_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = snapshot_trigger, + .count_func = event_trigger_count, + .print = snapshot_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, }; static __init int register_trigger_snapshot_cmd(void) @@ -1558,20 +1576,6 @@ stacktrace_trigger(struct event_trigger_data *data, trace_dump_stack(STACK_SKIP); } -static void -stacktrace_count_trigger(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *event) -{ - if (!data->count) - return; - - if (data->count != -1) - (data->count)--; - - stacktrace_trigger(data, buffer, rec, event); -} - static int stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data) { @@ -1579,26 +1583,6 @@ stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data) data->filter_str); } -static const struct event_trigger_ops stacktrace_trigger_ops = { - .trigger = stacktrace_trigger, - .print = stacktrace_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static const struct event_trigger_ops stacktrace_count_trigger_ops = { - .trigger = stacktrace_count_trigger, - .print = stacktrace_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static const struct event_trigger_ops * -stacktrace_get_trigger_ops(char *cmd, char *param) -{ - return param ? 
&stacktrace_count_trigger_ops : &stacktrace_trigger_ops; -} - static struct event_command trigger_stacktrace_cmd = { .name = "stacktrace", .trigger_type = ETT_STACKTRACE, @@ -1606,8 +1590,12 @@ static struct event_command trigger_stacktrace_cmd = { .parse = event_trigger_parse, .reg = register_trigger, .unreg = unregister_trigger, - .get_trigger_ops = stacktrace_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = stacktrace_trigger, + .count_func = event_trigger_count, + .print = stacktrace_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, }; static __init int register_trigger_stacktrace_cmd(void) @@ -1642,24 +1630,24 @@ event_enable_trigger(struct event_trigger_data *data, set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); } -static void -event_enable_count_trigger(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *event) +static bool +event_enable_count_func(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) { struct enable_trigger_data *enable_data = data->private_data; if (!data->count) - return; + return false; /* Skip if the event is in a state we want to switch to */ if (enable_data->enable == !(enable_data->file->flags & EVENT_FILE_FL_SOFT_DISABLED)) - return; + return false; if (data->count != -1) (data->count)--; - event_enable_trigger(data, buffer, rec, event); + return true; } int event_enable_trigger_print(struct seq_file *m, @@ -1704,34 +1692,6 @@ void event_enable_trigger_free(struct event_trigger_data *data) } } -static const struct event_trigger_ops event_enable_trigger_ops = { - .trigger = event_enable_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - -static const struct event_trigger_ops event_enable_count_trigger_ops = { - .trigger = event_enable_count_trigger, - .print = event_enable_trigger_print, - .init = 
event_trigger_init, - .free = event_enable_trigger_free, -}; - -static const struct event_trigger_ops event_disable_trigger_ops = { - .trigger = event_enable_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - -static const struct event_trigger_ops event_disable_count_trigger_ops = { - .trigger = event_enable_count_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - int event_enable_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, char *cmd, char *param_and_filter) @@ -1861,8 +1821,8 @@ int event_enable_register_trigger(char *glob, } } - if (data->ops->init) { - ret = data->ops->init(data); + if (data->cmd_ops->init) { + ret = data->cmd_ops->init(data); if (ret < 0) return ret; } @@ -1902,30 +1862,8 @@ void event_enable_unregister_trigger(char *glob, } } - if (data && data->ops->free) - data->ops->free(data); -} - -static const struct event_trigger_ops * -event_enable_get_trigger_ops(char *cmd, char *param) -{ - const struct event_trigger_ops *ops; - bool enable; - -#ifdef CONFIG_HIST_TRIGGERS - enable = ((strcmp(cmd, ENABLE_EVENT_STR) == 0) || - (strcmp(cmd, ENABLE_HIST_STR) == 0)); -#else - enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; -#endif - if (enable) - ops = param ? &event_enable_count_trigger_ops : - &event_enable_trigger_ops; - else - ops = param ? 
&event_disable_count_trigger_ops : - &event_disable_trigger_ops; - - return ops; + if (data && data->cmd_ops->free) + data->cmd_ops->free(data); } static struct event_command trigger_enable_cmd = { @@ -1934,8 +1872,12 @@ static struct event_command trigger_enable_cmd = { .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, - .get_trigger_ops = event_enable_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = event_enable_trigger, + .count_func = event_enable_count_func, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, }; static struct event_command trigger_disable_cmd = { @@ -1944,8 +1886,12 @@ static struct event_command trigger_disable_cmd = { .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, - .get_trigger_ops = event_enable_get_trigger_ops, .set_filter = set_trigger_filter, + .trigger = event_enable_trigger, + .count_func = event_enable_count_func, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, }; static __init void unregister_trigger_enable_disable_cmds(void) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index b15854c75d4f..dca6e50b3b21 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1041,7 +1041,7 @@ static int user_field_array_size(const char *type) static int user_field_size(const char *type) { - /* long is not allowed from a user, since it's ambigious in size */ + /* long is not allowed from a user, since it's ambiguous in size */ if (strcmp(type, "s64") == 0) return sizeof(s64); if (strcmp(type, "u64") == 0) @@ -1079,7 +1079,7 @@ static int user_field_size(const char *type) if (str_has_prefix(type, "__rel_loc ")) return sizeof(u32); - /* Uknown basic type, error */ + /* Unknown basic type, error */ return -EINVAL; } @@ -2465,7 
+2465,7 @@ static long user_events_ioctl_reg(struct user_event_file_info *info, /* * Prevent users from using the same address and bit multiple times * within the same mm address space. This can cause unexpected behavior - * for user processes that is far easier to debug if this is explictly + * for user processes that is far easier to debug if this is explicitly * an error upon registering. */ if (current_user_event_enabler_exists((unsigned long)reg.enable_addr, diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index 8001dbf16891..262c0556e4af 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -632,7 +632,7 @@ print_fentry_event(struct trace_iterator *iter, int flags, trace_seq_printf(s, "%s: (", trace_probe_name(tp)); - if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) + if (!seq_print_ip_sym_offset(s, field->ip, flags)) goto out; trace_seq_putc(s, ')'); @@ -662,12 +662,12 @@ print_fexit_event(struct trace_iterator *iter, int flags, trace_seq_printf(s, "%s: (", trace_probe_name(tp)); - if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) + if (!seq_print_ip_sym_offset(s, field->ret_ip, flags)) goto out; trace_seq_puts(s, " <- "); - if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) + if (!seq_print_ip_sym_no_offset(s, field->func, flags)) goto out; trace_seq_putc(s, ')'); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index d17c18934445..c12795c2fb39 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -154,11 +154,11 @@ static int function_trace_init(struct trace_array *tr) if (!tr->ops) return -ENOMEM; - func = select_trace_function(func_flags.val); + func = select_trace_function(tr->current_trace_flags->val); if (!func) return -EINVAL; - if (!handle_func_repeats(tr, func_flags.val)) + if (!handle_func_repeats(tr, tr->current_trace_flags->val)) return -ENOMEM; ftrace_init_array_ops(tr, func); @@ 
-459,14 +459,14 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) u32 new_flags; /* Do nothing if already set. */ - if (!!set == !!(func_flags.val & bit)) + if (!!set == !!(tr->current_trace_flags->val & bit)) return 0; /* We can change this flag only when not running. */ if (tr->current_trace != &function_trace) return 0; - new_flags = (func_flags.val & ~bit) | (set ? bit : 0); + new_flags = (tr->current_trace_flags->val & ~bit) | (set ? bit : 0); func = select_trace_function(new_flags); if (!func) return -EINVAL; @@ -491,7 +491,7 @@ static struct tracer function_trace __tracer_data = .init = function_trace_init, .reset = function_trace_reset, .start = function_trace_start, - .flags = &func_flags, + .default_flags = &func_flags, .set_flag = func_set_flag, .allow_instances = true, #ifdef CONFIG_FTRACE_SELFTEST diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index a7f4b9a47a71..b1e9c9913309 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -16,9 +16,12 @@ #include "trace.h" #include "trace_output.h" -/* When set, irq functions will be ignored */ +/* When set, irq functions might be ignored */ static int ftrace_graph_skip_irqs; +/* Do not record function time when task is sleeping */ +int fgraph_no_sleep_time; + struct fgraph_cpu_data { pid_t last_pid; int depth; @@ -33,14 +36,19 @@ struct fgraph_ent_args { unsigned long args[FTRACE_REGS_MAX_ARGS]; }; +struct fgraph_retaddr_ent_args { + struct fgraph_retaddr_ent_entry ent; + /* Force the sizeof of args[] to have FTRACE_REGS_MAX_ARGS entries */ + unsigned long args[FTRACE_REGS_MAX_ARGS]; +}; + struct fgraph_data { struct fgraph_cpu_data __percpu *cpu_data; /* Place to preserve last processed entry. 
*/ union { struct fgraph_ent_args ent; - /* TODO allow retaddr to have args */ - struct fgraph_retaddr_ent_entry rent; + struct fgraph_retaddr_ent_args rent; }; struct ftrace_graph_ret_entry ret; int failed; @@ -85,11 +93,6 @@ static struct tracer_opt trace_opts[] = { /* Include sleep time (scheduled out) between entry and return */ { TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) }, -#ifdef CONFIG_FUNCTION_PROFILER - /* Include time within nested functions */ - { TRACER_OPT(graph-time, TRACE_GRAPH_GRAPH_TIME) }, -#endif - { } /* Empty entry */ }; @@ -97,13 +100,13 @@ static struct tracer_flags tracer_flags = { /* Don't display overruns, proc, or tail by default */ .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS | - TRACE_GRAPH_SLEEP_TIME | TRACE_GRAPH_GRAPH_TIME, + TRACE_GRAPH_SLEEP_TIME, .opts = trace_opts }; -static bool tracer_flags_is_set(u32 flags) +static bool tracer_flags_is_set(struct trace_array *tr, u32 flags) { - return (tracer_flags.val & flags) == flags; + return (tr->current_trace_flags->val & flags) == flags; } /* @@ -162,20 +165,32 @@ int __trace_graph_entry(struct trace_array *tr, int __trace_graph_retaddr_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned int trace_ctx, - unsigned long retaddr) + unsigned long retaddr, + struct ftrace_regs *fregs) { struct ring_buffer_event *event; struct trace_buffer *buffer = tr->array_buffer.buffer; struct fgraph_retaddr_ent_entry *entry; + int size; + + /* If fregs is defined, add FTRACE_REGS_MAX_ARGS long size words */ + size = sizeof(*entry) + (FTRACE_REGS_MAX_ARGS * !!fregs * sizeof(long)); event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RETADDR_ENT, - sizeof(*entry), trace_ctx); + size, trace_ctx); if (!event) return 0; entry = ring_buffer_event_data(event); - entry->graph_ent.func = trace->func; - entry->graph_ent.depth = trace->depth; - entry->graph_ent.retaddr = retaddr; + entry->graph_rent.ent = *trace; + 
entry->graph_rent.retaddr = retaddr; + +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + if (fregs) { + for (int i = 0; i < FTRACE_REGS_MAX_ARGS; i++) + entry->args[i] = ftrace_regs_get_argument(fregs, i); + } +#endif + trace_buffer_unlock_commit_nostack(buffer, event); return 1; @@ -184,17 +199,21 @@ int __trace_graph_retaddr_entry(struct trace_array *tr, int __trace_graph_retaddr_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned int trace_ctx, - unsigned long retaddr) + unsigned long retaddr, + struct ftrace_regs *fregs) { return 1; } #endif -static inline int ftrace_graph_ignore_irqs(void) +static inline int ftrace_graph_ignore_irqs(struct trace_array *tr) { if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT)) return 0; + if (tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_IRQS)) + return 0; + return in_hardirq(); } @@ -232,22 +251,20 @@ static int graph_entry(struct ftrace_graph_ent *trace, return 1; } - if (!ftrace_trace_task(tr)) - return 0; - if (ftrace_graph_ignore_func(gops, trace)) return 0; - if (ftrace_graph_ignore_irqs()) + if (ftrace_graph_ignore_irqs(tr)) return 0; - if (fgraph_sleep_time) { - /* Only need to record the calltime */ - ftimes = fgraph_reserve_data(gops->idx, sizeof(ftimes->calltime)); - } else { + if (fgraph_no_sleep_time && + !tracer_flags_is_set(tr, TRACE_GRAPH_SLEEP_TIME)) { ftimes = fgraph_reserve_data(gops->idx, sizeof(*ftimes)); if (ftimes) ftimes->sleeptime = current->ftrace_sleeptime; + } else { + /* Only need to record the calltime */ + ftimes = fgraph_reserve_data(gops->idx, sizeof(ftimes->calltime)); } if (!ftimes) return 0; @@ -263,9 +280,10 @@ static int graph_entry(struct ftrace_graph_ent *trace, trace_ctx = tracing_gen_ctx(); if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && - tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR)) { + tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_RETADDR)) { unsigned long retaddr = ftrace_graph_top_ret_addr(current); - ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, 
retaddr); + ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, + retaddr, fregs); } else { ret = __graph_entry(tr, trace, trace_ctx, fregs); } @@ -333,11 +351,15 @@ void __trace_graph_return(struct trace_array *tr, trace_buffer_unlock_commit_nostack(buffer, event); } -static void handle_nosleeptime(struct ftrace_graph_ret *trace, +static void handle_nosleeptime(struct trace_array *tr, + struct ftrace_graph_ret *trace, struct fgraph_times *ftimes, int size) { - if (fgraph_sleep_time || size < sizeof(*ftimes)) + if (size < sizeof(*ftimes)) + return; + + if (!fgraph_no_sleep_time || tracer_flags_is_set(tr, TRACE_GRAPH_SLEEP_TIME)) return; ftimes->calltime += current->ftrace_sleeptime - ftimes->sleeptime; @@ -366,7 +388,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace, if (!ftimes) return; - handle_nosleeptime(trace, ftimes, size); + handle_nosleeptime(tr, trace, ftimes, size); calltime = ftimes->calltime; @@ -379,6 +401,7 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace, struct ftrace_regs *fregs) { struct fgraph_times *ftimes; + struct trace_array *tr; int size; ftrace_graph_addr_finish(gops, trace); @@ -392,7 +415,8 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace, if (!ftimes) return; - handle_nosleeptime(trace, ftimes, size); + tr = gops->private; + handle_nosleeptime(tr, trace, ftimes, size); if (tracing_thresh && (trace_clock_local() - ftimes->calltime < tracing_thresh)) @@ -441,7 +465,7 @@ static int graph_trace_init(struct trace_array *tr) { int ret; - if (tracer_flags_is_set(TRACE_GRAPH_ARGS)) + if (tracer_flags_is_set(tr, TRACE_GRAPH_ARGS)) tr->gops->entryfunc = trace_graph_entry_args; else tr->gops->entryfunc = trace_graph_entry; @@ -451,6 +475,12 @@ static int graph_trace_init(struct trace_array *tr) else tr->gops->retfunc = trace_graph_return; + if (!tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_IRQS)) + ftrace_graph_skip_irqs++; + + if (!tracer_flags_is_set(tr, TRACE_GRAPH_SLEEP_TIME)) + 
fgraph_no_sleep_time++; + /* Make gops functions visible before we start tracing */ smp_mb(); @@ -468,10 +498,6 @@ static int ftrace_graph_trace_args(struct trace_array *tr, int set) { trace_func_graph_ent_t entry; - /* Do nothing if the current tracer is not this tracer */ - if (tr->current_trace != &graph_trace) - return 0; - if (set) entry = trace_graph_entry_args; else @@ -492,6 +518,16 @@ static int ftrace_graph_trace_args(struct trace_array *tr, int set) static void graph_trace_reset(struct trace_array *tr) { + if (!tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_IRQS)) + ftrace_graph_skip_irqs--; + if (WARN_ON_ONCE(ftrace_graph_skip_irqs < 0)) + ftrace_graph_skip_irqs = 0; + + if (!tracer_flags_is_set(tr, TRACE_GRAPH_SLEEP_TIME)) + fgraph_no_sleep_time--; + if (WARN_ON_ONCE(fgraph_no_sleep_time < 0)) + fgraph_no_sleep_time = 0; + tracing_stop_cmdline_record(); unregister_ftrace_graph(tr->gops); } @@ -634,13 +670,9 @@ get_return_for_leaf(struct trace_iterator *iter, * Save current and next entries for later reference * if the output fails. */ - if (unlikely(curr->ent.type == TRACE_GRAPH_RETADDR_ENT)) { - data->rent = *(struct fgraph_retaddr_ent_entry *)curr; - } else { - int size = min((int)sizeof(data->ent), (int)iter->ent_size); + int size = min_t(int, sizeof(data->rent), iter->ent_size); - memcpy(&data->ent, curr, size); - } + memcpy(&data->rent, curr, size); /* * If the next event is not a return type, then * we only care about what type it is. 
Otherwise we can @@ -703,7 +735,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, addr >= (unsigned long)__irqentry_text_end) return; - if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) { /* Absolute time */ if (flags & TRACE_GRAPH_PRINT_ABS_TIME) print_graph_abs_time(iter->ts, s); @@ -723,7 +755,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, } /* Latency format */ - if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) + if (tr->trace_flags & TRACE_ITER(LATENCY_FMT)) print_graph_lat_fmt(s, ent); } @@ -777,7 +809,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, struct trace_seq *s, u32 flags) { if (!(flags & TRACE_GRAPH_PRINT_DURATION) || - !(tr->trace_flags & TRACE_ITER_CONTEXT_INFO)) + !(tr->trace_flags & TRACE_ITER(CONTEXT_INFO))) return; /* No real adata, just filling the column with spaces */ @@ -818,7 +850,7 @@ static void print_graph_retaddr(struct trace_seq *s, struct fgraph_retaddr_ent_e trace_seq_puts(s, " /*"); trace_seq_puts(s, " <-"); - seq_print_ip_sym(s, entry->graph_ent.retaddr, trace_flags | TRACE_ITER_SYM_OFFSET); + seq_print_ip_sym_offset(s, entry->graph_rent.retaddr, trace_flags); if (comment) trace_seq_puts(s, " */"); @@ -964,7 +996,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, trace_seq_printf(s, "%ps", (void *)ret_func); if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) { - print_function_args(s, entry->args, ret_func); + print_function_args(s, FGRAPH_ENTRY_ARGS(entry), ret_func); trace_seq_putc(s, ';'); } else trace_seq_puts(s, "();"); @@ -1016,7 +1048,7 @@ print_graph_entry_nested(struct trace_iterator *iter, args_size = iter->ent_size - offsetof(struct ftrace_graph_ent_entry, args); if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) - print_function_args(s, entry->args, func); + print_function_args(s, FGRAPH_ENTRY_ARGS(entry), func); else trace_seq_puts(s, "()"); @@ -1054,7 +1086,7 @@ 
print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, /* Interrupt */ print_graph_irq(iter, addr, type, cpu, ent->pid, flags); - if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO)) + if (!(tr->trace_flags & TRACE_ITER(CONTEXT_INFO))) return; /* Absolute time */ @@ -1076,7 +1108,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, } /* Latency format */ - if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) + if (tr->trace_flags & TRACE_ITER(LATENCY_FMT)) print_graph_lat_fmt(s, ent); return; @@ -1198,11 +1230,14 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, /* * print_graph_entry() may consume the current event, * thus @field may become invalid, so we need to save it. - * sizeof(struct ftrace_graph_ent_entry) is very small, - * it can be safely saved at the stack. + * This function is shared by ftrace_graph_ent_entry and + * fgraph_retaddr_ent_entry, the size of the latter one + * is larger, but it is very small and can be safely saved + * at the stack. */ struct ftrace_graph_ent_entry *entry; - u8 save_buf[sizeof(*entry) + FTRACE_REGS_MAX_ARGS * sizeof(long)]; + struct fgraph_retaddr_ent_entry *rentry; + u8 save_buf[sizeof(*rentry) + FTRACE_REGS_MAX_ARGS * sizeof(long)]; /* The ent_size is expected to be as big as the entry */ if (iter->ent_size > sizeof(save_buf)) @@ -1431,12 +1466,17 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) } #ifdef CONFIG_FUNCTION_GRAPH_RETADDR case TRACE_GRAPH_RETADDR_ENT: { - struct fgraph_retaddr_ent_entry saved; + /* + * ftrace_graph_ent_entry and fgraph_retaddr_ent_entry have + * similar functions and memory layouts. The only difference + * is that the latter one has an extra retaddr member, so + * they can share most of the logic. 
+ */ struct fgraph_retaddr_ent_entry *rfield; trace_assign_type(rfield, entry); - saved = *rfield; - return print_graph_entry((struct ftrace_graph_ent_entry *)&saved, s, iter, flags); + return print_graph_entry((struct ftrace_graph_ent_entry *)rfield, + s, iter, flags); } #endif case TRACE_GRAPH_RET: { @@ -1459,7 +1499,8 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) static enum print_line_t print_graph_function(struct trace_iterator *iter) { - return print_graph_function_flags(iter, tracer_flags.val); + struct trace_array *tr = iter->tr; + return print_graph_function_flags(iter, tr->current_trace_flags->val); } static enum print_line_t @@ -1495,7 +1536,7 @@ static void print_lat_header(struct seq_file *s, u32 flags) static void __print_graph_headers_flags(struct trace_array *tr, struct seq_file *s, u32 flags) { - int lat = tr->trace_flags & TRACE_ITER_LATENCY_FMT; + int lat = tr->trace_flags & TRACE_ITER(LATENCY_FMT); if (lat) print_lat_header(s, flags); @@ -1535,7 +1576,10 @@ static void __print_graph_headers_flags(struct trace_array *tr, static void print_graph_headers(struct seq_file *s) { - print_graph_headers_flags(s, tracer_flags.val); + struct trace_iterator *iter = s->private; + struct trace_array *tr = iter->tr; + + print_graph_headers_flags(s, tr->current_trace_flags->val); } void print_graph_headers_flags(struct seq_file *s, u32 flags) @@ -1543,10 +1587,10 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags) struct trace_iterator *iter = s->private; struct trace_array *tr = iter->tr; - if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO)) + if (!(tr->trace_flags & TRACE_ITER(CONTEXT_INFO))) return; - if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) { + if (tr->trace_flags & TRACE_ITER(LATENCY_FMT)) { /* print nothing if the buffers are empty */ if (trace_empty(iter)) return; @@ -1613,17 +1657,56 @@ void graph_trace_close(struct trace_iterator *iter) static int func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 
bit, int set) { - if (bit == TRACE_GRAPH_PRINT_IRQS) - ftrace_graph_skip_irqs = !set; +/* + * The function profiler gets updated even if function graph + * isn't the current tracer. Handle it separately. + */ +#ifdef CONFIG_FUNCTION_PROFILER + if (bit == TRACE_GRAPH_SLEEP_TIME && (tr->flags & TRACE_ARRAY_FL_GLOBAL) && + !!set == fprofile_no_sleep_time) { + if (set) { + fgraph_no_sleep_time--; + if (WARN_ON_ONCE(fgraph_no_sleep_time < 0)) + fgraph_no_sleep_time = 0; + fprofile_no_sleep_time = false; + } else { + fgraph_no_sleep_time++; + fprofile_no_sleep_time = true; + } + } +#endif + + /* Do nothing if the current tracer is not this tracer */ + if (tr->current_trace != &graph_trace) + return 0; - if (bit == TRACE_GRAPH_SLEEP_TIME) - ftrace_graph_sleep_time_control(set); + /* Do nothing if already set. */ + if (!!set == !!(tr->current_trace_flags->val & bit)) + return 0; + + switch (bit) { + case TRACE_GRAPH_SLEEP_TIME: + if (set) { + fgraph_no_sleep_time--; + if (WARN_ON_ONCE(fgraph_no_sleep_time < 0)) + fgraph_no_sleep_time = 0; + } else { + fgraph_no_sleep_time++; + } + break; - if (bit == TRACE_GRAPH_GRAPH_TIME) - ftrace_graph_graph_time_control(set); + case TRACE_GRAPH_PRINT_IRQS: + if (set) + ftrace_graph_skip_irqs--; + else + ftrace_graph_skip_irqs++; + if (WARN_ON_ONCE(ftrace_graph_skip_irqs < 0)) + ftrace_graph_skip_irqs = 0; + break; - if (bit == TRACE_GRAPH_ARGS) + case TRACE_GRAPH_ARGS: return ftrace_graph_trace_args(tr, set); + } return 0; } @@ -1660,7 +1743,7 @@ static struct tracer graph_trace __tracer_data = { .reset = graph_trace_reset, .print_line = print_graph_function, .print_header = print_graph_headers, - .flags = &tracer_flags, + .default_flags = &tracer_flags, .set_flag = func_graph_set_flag, .allow_instances = true, #ifdef CONFIG_FTRACE_SELFTEST diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 4c45c49b06c8..17673905907c 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -63,7 +63,7 
@@ irq_trace(void) #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int irqsoff_display_graph(struct trace_array *tr, int set); -# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH) +# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER(DISPLAY_GRAPH)) #else static inline int irqsoff_display_graph(struct trace_array *tr, int set) { @@ -485,8 +485,8 @@ static int register_irqsoff_function(struct trace_array *tr, int graph, int set) { int ret; - /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ - if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION))) + /* 'set' is set if TRACE_ITER(FUNCTION) is about to be set */ + if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER(FUNCTION)))) return 0; if (graph) @@ -515,7 +515,7 @@ static void unregister_irqsoff_function(struct trace_array *tr, int graph) static int irqsoff_function_set(struct trace_array *tr, u32 mask, int set) { - if (!(mask & TRACE_ITER_FUNCTION)) + if (!(mask & TRACE_ITER(FUNCTION))) return 0; if (set) @@ -536,7 +536,7 @@ static inline int irqsoff_function_set(struct trace_array *tr, u32 mask, int set } #endif /* CONFIG_FUNCTION_TRACER */ -static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) +static int irqsoff_flag_changed(struct trace_array *tr, u64 mask, int set) { struct tracer *tracer = tr->current_trace; @@ -544,7 +544,7 @@ static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) return 0; #ifdef CONFIG_FUNCTION_GRAPH_TRACER - if (mask & TRACE_ITER_DISPLAY_GRAPH) + if (mask & TRACE_ITER(DISPLAY_GRAPH)) return irqsoff_display_graph(tr, set); #endif @@ -582,10 +582,10 @@ static int __irqsoff_tracer_init(struct trace_array *tr) save_flags = tr->trace_flags; /* non overwrite screws up the latency tracers */ - set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); - set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); + set_tracer_flag(tr, TRACE_ITER(OVERWRITE), 1); + set_tracer_flag(tr, TRACE_ITER(LATENCY_FMT), 1); /* without 
pause, we will produce garbage if another latency occurs */ - set_tracer_flag(tr, TRACE_ITER_PAUSE_ON_TRACE, 1); + set_tracer_flag(tr, TRACE_ITER(PAUSE_ON_TRACE), 1); tr->max_latency = 0; irqsoff_trace = tr; @@ -605,15 +605,15 @@ static int __irqsoff_tracer_init(struct trace_array *tr) static void __irqsoff_tracer_reset(struct trace_array *tr) { - int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; - int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; - int pause_flag = save_flags & TRACE_ITER_PAUSE_ON_TRACE; + int lat_flag = save_flags & TRACE_ITER(LATENCY_FMT); + int overwrite_flag = save_flags & TRACE_ITER(OVERWRITE); + int pause_flag = save_flags & TRACE_ITER(PAUSE_ON_TRACE); stop_irqsoff_tracer(tr, is_graph(tr)); - set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); - set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); - set_tracer_flag(tr, TRACE_ITER_PAUSE_ON_TRACE, pause_flag); + set_tracer_flag(tr, TRACE_ITER(LATENCY_FMT), lat_flag); + set_tracer_flag(tr, TRACE_ITER(OVERWRITE), overwrite_flag); + set_tracer_flag(tr, TRACE_ITER(PAUSE_ON_TRACE), pause_flag); ftrace_reset_array_ops(tr); irqsoff_busy = false; diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 896ff78b8349..b30795f34079 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -31,7 +31,7 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file) old_userobj = tr->trace_flags; /* don't look at user memory in panic mode */ - tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + tr->trace_flags &= ~TRACE_ITER(SYM_USEROBJ); kdb_printf("Dumping ftrace buffer:\n"); if (skip_entries) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ee8171b19bee..9953506370a5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1584,7 +1584,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags, trace_seq_printf(s, "%s: (", trace_probe_name(tp)); - if (!seq_print_ip_sym(s, field->ip, flags | 
TRACE_ITER_SYM_OFFSET)) + if (!seq_print_ip_sym_offset(s, field->ip, flags)) goto out; trace_seq_putc(s, ')'); @@ -1614,12 +1614,12 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, trace_seq_printf(s, "%s: (", trace_probe_name(tp)); - if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) + if (!seq_print_ip_sym_offset(s, field->ret_ip, flags)) goto out; trace_seq_puts(s, " <- "); - if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) + if (!seq_print_ip_sym_no_offset(s, field->func, flags)) goto out; trace_seq_putc(s, ')'); diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index a9962d4497e8..827104d00bc0 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -329,7 +329,7 @@ static struct osnoise_data { u64 print_stack; /* print IRQ stack if total > */ int timerlat_tracer; /* timerlat tracer */ #endif - bool tainted; /* infor users and developers about a problem */ + bool tainted; /* info users and developers about a problem */ } osnoise_data = { .sample_period = DEFAULT_SAMPLE_PERIOD, .sample_runtime = DEFAULT_SAMPLE_RUNTIME, @@ -738,7 +738,7 @@ cond_move_thread_delta_start(struct osnoise_variables *osn_var, u64 duration) /* * get_int_safe_duration - Get the duration of a window * - * The irq, softirq and thread varaibles need to have its duration without + * The irq, softirq and thread variables need to have its duration without * the interference from higher priority interrupts. Instead of keeping a * variable to discount the interrupt interference from these variables, the * starting time of these variables are pushed forward with the interrupt's @@ -1460,7 +1460,7 @@ static int run_osnoise(void) stop_in = osnoise_data.stop_tracing * NSEC_PER_USEC; /* - * Start timestemp + * Start timestamp */ start = time_get(); @@ -1881,7 +1881,7 @@ static int timerlat_main(void *data) tlat->kthread = current; osn_var->pid = current->pid; /* - * Anotate the arrival time. 
+ * Annotate the arrival time. */ tlat->abs_period = hrtimer_cb_get_time(&tlat->timer); @@ -1978,7 +1978,7 @@ static void stop_per_cpu_kthreads(void) } /* - * start_kthread - Start a workload tread + * start_kthread - Start a workload thread */ static int start_kthread(unsigned int cpu) { @@ -2705,7 +2705,7 @@ static int osnoise_create_cpu_timerlat_fd(struct dentry *top_dir) * Why not using tracing instance per_cpu/ dir? * * Because osnoise/timerlat have a single workload, having - * multiple files like these are wast of memory. + * multiple files like these are waste of memory. */ per_cpu = tracefs_create_dir("per_cpu", top_dir); if (!per_cpu) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 97db0b0ccf3e..cc2d3306bb60 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -420,7 +420,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, } mmap_read_unlock(mm); } - if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) + if (ret && ((sym_flags & TRACE_ITER(SYM_ADDR)) || !file)) trace_seq_printf(s, " <" IP_FMT ">", ip); return !trace_seq_has_overflowed(s); } @@ -433,9 +433,9 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) goto out; } - trace_seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET); + trace_seq_print_sym(s, ip, sym_flags & TRACE_ITER(SYM_OFFSET)); - if (sym_flags & TRACE_ITER_SYM_ADDR) + if (sym_flags & TRACE_ITER(SYM_ADDR)) trace_seq_printf(s, " <" IP_FMT ">", ip); out: @@ -569,7 +569,7 @@ static int lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) { struct trace_array *tr = iter->tr; - unsigned long verbose = tr->trace_flags & TRACE_ITER_VERBOSE; + unsigned long verbose = tr->trace_flags & TRACE_ITER(VERBOSE); unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; unsigned long long abs_ts = iter->ts - iter->array_buffer->time_start; unsigned long long rel_ts = next_ts - iter->ts; @@ -636,7 +636,7 @@ int 
trace_print_context(struct trace_iterator *iter) trace_seq_printf(s, "%16s-%-7d ", comm, entry->pid); - if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { + if (tr->trace_flags & TRACE_ITER(RECORD_TGID)) { unsigned int tgid = trace_find_tgid(entry->pid); if (!tgid) @@ -647,7 +647,7 @@ int trace_print_context(struct trace_iterator *iter) trace_seq_printf(s, "[%03d] ", iter->cpu); - if (tr->trace_flags & TRACE_ITER_IRQ_INFO) + if (tr->trace_flags & TRACE_ITER(IRQ_INFO)) trace_print_lat_fmt(s, entry); trace_print_time(s, iter, iter->ts); @@ -661,7 +661,7 @@ int trace_print_lat_context(struct trace_iterator *iter) struct trace_entry *entry, *next_entry; struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; - unsigned long verbose = (tr->trace_flags & TRACE_ITER_VERBOSE); + unsigned long verbose = (tr->trace_flags & TRACE_ITER(VERBOSE)); u64 next_ts; next_entry = trace_find_next_entry(iter, NULL, &next_ts); @@ -950,7 +950,9 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c int offset; int len; int ret; + int i; void *pos; + char *str; list_for_each_entry_reverse(field, head, link) { trace_seq_printf(&iter->seq, " %s=", field->name); @@ -977,8 +979,29 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c trace_seq_puts(&iter->seq, "<OVERFLOW>"); break; } - pos = (void *)iter->ent + offset; - trace_seq_printf(&iter->seq, "%.*s", len, (char *)pos); + str = (char *)iter->ent + offset; + /* Check if there's any non printable strings */ + for (i = 0; i < len; i++) { + if (str[i] && !(isascii(str[i]) && isprint(str[i]))) + break; + } + if (i < len) { + for (i = 0; i < len; i++) { + if (isascii(str[i]) && isprint(str[i])) + trace_seq_putc(&iter->seq, str[i]); + else + trace_seq_putc(&iter->seq, '.'); + } + trace_seq_puts(&iter->seq, " ("); + for (i = 0; i < len; i++) { + if (i) + trace_seq_putc(&iter->seq, ':'); + trace_seq_printf(&iter->seq, "%02x", str[i]); + } + trace_seq_putc(&iter->seq, ')'); + } 
else { + trace_seq_printf(&iter->seq, "%.*s", len, str); + } break; case FILTER_PTR_STRING: if (!iter->fmt_size) @@ -1127,7 +1150,7 @@ static void print_fn_trace(struct trace_seq *s, unsigned long ip, if (args) print_function_args(s, args, ip); - if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) { + if ((flags & TRACE_ITER(PRINT_PARENT)) && parent_ip) { trace_seq_puts(s, " <-"); seq_print_ip_sym(s, parent_ip, flags); } @@ -1417,7 +1440,7 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, trace_seq_puts(s, "<user stack trace>\n"); - if (tr->trace_flags & TRACE_ITER_SYM_USEROBJ) { + if (tr->trace_flags & TRACE_ITER(SYM_USEROBJ)) { struct task_struct *task; /* * we do the lookup on the thread group leader, @@ -1467,12 +1490,12 @@ trace_hwlat_print(struct trace_iterator *iter, int flags, trace_assign_type(field, entry); - trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%lld.%09ld count:%d", + trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ptSp count:%d", field->seqnum, field->duration, field->outer_duration, - (long long)field->timestamp.tv_sec, - field->timestamp.tv_nsec, field->count); + &field->timestamp, + field->count); if (field->nmi_count) { /* diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 2e305364f2a9..99b676733d46 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -16,6 +16,17 @@ extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); +static inline int seq_print_ip_sym_offset(struct trace_seq *s, unsigned long ip, + unsigned long sym_flags) +{ + return seq_print_ip_sym(s, ip, sym_flags | TRACE_ITER(SYM_OFFSET)); +} +static inline int seq_print_ip_sym_no_offset(struct trace_seq *s, unsigned long ip, + unsigned long sym_flags) +{ + return seq_print_ip_sym(s, ip, sym_flags & ~TRACE_ITER(SYM_OFFSET)); +} + extern void trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset); extern int 
trace_print_context(struct trace_iterator *iter); extern int trace_print_lat_context(struct trace_iterator *iter); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 5cbdc423afeb..2f571083ce9e 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -156,7 +156,7 @@ fail: static struct trace_probe_log trace_probe_log; extern struct mutex dyn_event_ops_mutex; -void trace_probe_log_init(const char *subsystem, int argc, const char **argv) +const char *trace_probe_log_init(const char *subsystem, int argc, const char **argv) { lockdep_assert_held(&dyn_event_ops_mutex); @@ -164,6 +164,7 @@ void trace_probe_log_init(const char *subsystem, int argc, const char **argv) trace_probe_log.argc = argc; trace_probe_log.argv = argv; trace_probe_log.index = 0; + return subsystem; } void trace_probe_log_clear(void) @@ -214,7 +215,7 @@ void __trace_probe_log_err(int offset, int err_type) p = command; for (i = 0; i < trace_probe_log.argc; i++) { len = strlen(trace_probe_log.argv[i]); - strcpy(p, trace_probe_log.argv[i]); + memcpy(p, trace_probe_log.argv[i], len); p[len] = ' '; p += len + 1; } @@ -516,7 +517,7 @@ static void clear_btf_context(struct traceprobe_parse_context *ctx) } } -/* Return 1 if the field separater is arrow operator ('->') */ +/* Return 1 if the field separator is arrow operator ('->') */ static int split_next_field(char *varname, char **next_field, struct traceprobe_parse_context *ctx) { diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 08b5bda24da2..9fc56c937130 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -578,11 +578,13 @@ struct trace_probe_log { int index; }; -void trace_probe_log_init(const char *subsystem, int argc, const char **argv); +const char *trace_probe_log_init(const char *subsystem, int argc, const char **argv); void trace_probe_log_set_index(int index); void trace_probe_log_clear(void); void __trace_probe_log_err(int offset, int err); 
+DEFINE_FREE(trace_probe_log_clear, const char *, if (_T) trace_probe_log_clear()) + #define trace_probe_log_err(offs, err) \ __trace_probe_log_err(offs, TP_ERR_##err) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index e3f2e4f56faa..8faa73d3bba1 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -41,7 +41,7 @@ static void stop_func_tracer(struct trace_array *tr, int graph); static int save_flags; #ifdef CONFIG_FUNCTION_GRAPH_TRACER -# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH) +# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER(DISPLAY_GRAPH)) #else # define is_graph(tr) false #endif @@ -247,8 +247,8 @@ static int register_wakeup_function(struct trace_array *tr, int graph, int set) { int ret; - /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ - if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION))) + /* 'set' is set if TRACE_ITER(FUNCTION) is about to be set */ + if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER(FUNCTION)))) return 0; if (graph) @@ -277,7 +277,7 @@ static void unregister_wakeup_function(struct trace_array *tr, int graph) static int wakeup_function_set(struct trace_array *tr, u32 mask, int set) { - if (!(mask & TRACE_ITER_FUNCTION)) + if (!(mask & TRACE_ITER(FUNCTION))) return 0; if (set) @@ -324,7 +324,7 @@ __trace_function(struct trace_array *tr, trace_function(tr, ip, parent_ip, trace_ctx, NULL); } -static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) +static int wakeup_flag_changed(struct trace_array *tr, u64 mask, int set) { struct tracer *tracer = tr->current_trace; @@ -332,7 +332,7 @@ static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) return 0; #ifdef CONFIG_FUNCTION_GRAPH_TRACER - if (mask & TRACE_ITER_DISPLAY_GRAPH) + if (mask & TRACE_ITER(DISPLAY_GRAPH)) return wakeup_display_graph(tr, set); #endif @@ -681,8 +681,8 @@ static int 
__wakeup_tracer_init(struct trace_array *tr) save_flags = tr->trace_flags; /* non overwrite screws up the latency tracers */ - set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); - set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); + set_tracer_flag(tr, TRACE_ITER(OVERWRITE), 1); + set_tracer_flag(tr, TRACE_ITER(LATENCY_FMT), 1); tr->max_latency = 0; wakeup_trace = tr; @@ -725,15 +725,15 @@ static int wakeup_dl_tracer_init(struct trace_array *tr) static void wakeup_tracer_reset(struct trace_array *tr) { - int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; - int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; + int lat_flag = save_flags & TRACE_ITER(LATENCY_FMT); + int overwrite_flag = save_flags & TRACE_ITER(OVERWRITE); stop_wakeup_tracer(tr); /* make sure we put back any tasks we are tracing */ wakeup_reset(tr); - set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); - set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); + set_tracer_flag(tr, TRACE_ITER(LATENCY_FMT), lat_flag); + set_tracer_flag(tr, TRACE_ITER(OVERWRITE), overwrite_flag); ftrace_reset_array_ops(tr); wakeup_busy = false; } diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index c158d65a8a88..32684ef4fb9d 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -15,7 +15,7 @@ * * A write to the buffer will either succeed or fail. That is, unlike * sprintf() there will not be a partial write (well it may write into - * the buffer but it wont update the pointers). This allows users to + * the buffer but it won't update the pointers). This allows users to * try to write something into the trace_seq buffer and if it fails * they can flush it and try again. 
* diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 0f932b22f9ec..e96d0063cbcf 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <trace/syscall.h> #include <trace/events/syscalls.h> +#include <linux/kernel_stat.h> #include <linux/syscalls.h> #include <linux/slab.h> #include <linux/kernel.h> @@ -123,6 +124,119 @@ const char *get_syscall_name(int syscall) return entry->name; } +/* Added to user strings or arrays when max limit is reached */ +#define EXTRA "..." + +static void get_dynamic_len_ptr(struct syscall_trace_enter *trace, + struct syscall_metadata *entry, + int *offset_p, int *len_p, unsigned char **ptr_p) +{ + unsigned char *ptr; + int offset = *offset_p; + int val; + + /* This arg points to a user space string */ + ptr = (void *)trace->args + sizeof(long) * entry->nb_args + offset; + val = *(int *)ptr; + + /* The value is a dynamic string (len << 16 | offset) */ + ptr = (void *)trace + (val & 0xffff); + *len_p = val >> 16; + offset += 4; + + *ptr_p = ptr; + *offset_p = offset; +} + +static enum print_line_t +sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadata *entry, + struct trace_seq *s, struct trace_event *event) +{ + unsigned char *ptr; + int offset = 0; + int bits, len; + bool done = false; + static const struct trace_print_flags __flags[] = + { + { O_TMPFILE, "O_TMPFILE" }, + { O_WRONLY, "O_WRONLY" }, + { O_RDWR, "O_RDWR" }, + { O_CREAT, "O_CREAT" }, + { O_EXCL, "O_EXCL" }, + { O_NOCTTY, "O_NOCTTY" }, + { O_TRUNC, "O_TRUNC" }, + { O_APPEND, "O_APPEND" }, + { O_NONBLOCK, "O_NONBLOCK" }, + { O_DSYNC, "O_DSYNC" }, + { O_DIRECT, "O_DIRECT" }, + { O_LARGEFILE, "O_LARGEFILE" }, + { O_DIRECTORY, "O_DIRECTORY" }, + { O_NOFOLLOW, "O_NOFOLLOW" }, + { O_NOATIME, "O_NOATIME" }, + { O_CLOEXEC, "O_CLOEXEC" }, + { -1, NULL } + }; + + trace_seq_printf(s, "%s(", entry->name); + + for (int i = 0; !done && i < 
entry->nb_args; i++) { + + if (trace_seq_has_overflowed(s)) + goto end; + + if (i) + trace_seq_puts(s, ", "); + + switch (i) { + case 2: + bits = trace->args[2]; + + trace_seq_puts(s, "flags: "); + + /* No need to show mode when not creating the file */ + if (!(bits & (O_CREAT|O_TMPFILE))) + done = true; + + if (!(bits & O_ACCMODE)) { + if (!bits) { + trace_seq_puts(s, "O_RDONLY"); + continue; + } + trace_seq_puts(s, "O_RDONLY|"); + } + + trace_print_flags_seq(s, "|", bits, __flags); + /* + * trace_print_flags_seq() adds a '\0' to the + * buffer, but this needs to append more to the seq. + */ + if (!trace_seq_has_overflowed(s)) + trace_seq_pop(s); + + continue; + case 3: + trace_seq_printf(s, "%s: 0%03o", entry->args[i], + (unsigned int)trace->args[i]); + continue; + } + + trace_seq_printf(s, "%s: %lu", entry->args[i], + trace->args[i]); + + if (!(BIT(i) & entry->user_mask)) + continue; + + get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr); + trace_seq_printf(s, " \"%.*s\"", len, ptr); + } + + trace_seq_putc(s, ')'); +end: + trace_seq_putc(s, '\n'); + + return trace_handle_return(s); +} + static enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -132,7 +246,9 @@ print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_entry *ent = iter->ent; struct syscall_trace_enter *trace; struct syscall_metadata *entry; - int i, syscall; + int i, syscall, val, len; + unsigned char *ptr; + int offset = 0; trace = (typeof(trace))ent; syscall = trace->nr; @@ -146,9 +262,20 @@ print_syscall_enter(struct trace_iterator *iter, int flags, goto end; } + switch (entry->syscall_nr) { + case __NR_openat: + if (!tr || !(tr->trace_flags & TRACE_ITER(VERBOSE))) + return sys_enter_openat_print(trace, entry, s, event); + break; + default: + break; + } + trace_seq_printf(s, "%s(", entry->name); for (i = 0; i < entry->nb_args; i++) { + bool printable = false; + char *str; if (trace_seq_has_overflowed(s)) goto end; @@ 
-157,7 +284,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags, trace_seq_puts(s, ", "); /* parameter types */ - if (tr && tr->trace_flags & TRACE_ITER_VERBOSE) + if (tr && tr->trace_flags & TRACE_ITER(VERBOSE)) trace_seq_printf(s, "%s ", entry->types[i]); /* parameter values */ @@ -167,6 +294,48 @@ print_syscall_enter(struct trace_iterator *iter, int flags, else trace_seq_printf(s, "%s: 0x%lx", entry->args[i], trace->args[i]); + + if (!(BIT(i) & entry->user_mask)) + continue; + + get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr); + + if (entry->user_arg_size < 0 || entry->user_arg_is_str) { + trace_seq_printf(s, " \"%.*s\"", len, ptr); + continue; + } + + val = trace->args[entry->user_arg_size]; + + str = ptr; + trace_seq_puts(s, " ("); + for (int x = 0; x < len; x++, ptr++) { + if (isascii(*ptr) && isprint(*ptr)) + printable = true; + if (x) + trace_seq_putc(s, ':'); + trace_seq_printf(s, "%02x", *ptr); + } + if (len < val) + trace_seq_printf(s, ", %s", EXTRA); + + trace_seq_putc(s, ')'); + + /* If nothing is printable, don't bother printing anything */ + if (!printable) + continue; + + trace_seq_puts(s, " \""); + for (int x = 0; x < len; x++) { + if (isascii(str[x]) && isprint(str[x])) + trace_seq_putc(s, str[x]); + else + trace_seq_putc(s, '.'); + } + if (len < val) + trace_seq_printf(s, "\"%s", EXTRA); + else + trace_seq_putc(s, '"'); } trace_seq_putc(s, ')'); @@ -212,26 +381,107 @@ print_syscall_exit(struct trace_iterator *iter, int flags, .size = sizeof(_type), .align = __alignof__(_type), \ .is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER } +/* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? 
len - pos : 0) + +static int __init +sys_enter_openat_print_fmt(struct syscall_metadata *entry, char *buf, int len) +{ + int pos = 0; + + pos += snprintf(buf + pos, LEN_OR_ZERO, + "\"dfd: 0x%%08lx, filename: 0x%%08lx \\\"%%s\\\", flags: %%s%%s, mode: 0%%03o\","); + pos += snprintf(buf + pos, LEN_OR_ZERO, + " ((unsigned long)(REC->dfd)),"); + pos += snprintf(buf + pos, LEN_OR_ZERO, + " ((unsigned long)(REC->filename)),"); + pos += snprintf(buf + pos, LEN_OR_ZERO, + " __get_str(__filename_val),"); + pos += snprintf(buf + pos, LEN_OR_ZERO, + " (REC->flags & ~3) && !(REC->flags & 3) ? \"O_RDONLY|\" : \"\", "); + pos += snprintf(buf + pos, LEN_OR_ZERO, + " REC->flags ? __print_flags(REC->flags, \"|\", "); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_WRONLY\" }, ", O_WRONLY); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_RDWR\" }, ", O_RDWR); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_CREAT\" }, ", O_CREAT); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_EXCL\" }, ", O_EXCL); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_NOCTTY\" }, ", O_NOCTTY); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_TRUNC\" }, ", O_TRUNC); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_APPEND\" }, ", O_APPEND); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_NONBLOCK\" }, ", O_NONBLOCK); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_DSYNC\" }, ", O_DSYNC); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_DIRECT\" }, ", O_DIRECT); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_LARGEFILE\" }, ", O_LARGEFILE); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_DIRECTORY\" }, ", O_DIRECTORY); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_NOFOLLOW\" }, ", O_NOFOLLOW); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_NOATIME\" }, ", O_NOATIME); + pos += snprintf(buf + pos, LEN_OR_ZERO, + "{ 0x%x, \"O_CLOEXEC\" }) : \"O_RDONLY\", ", O_CLOEXEC); + + pos += 
snprintf(buf + pos, LEN_OR_ZERO, + " ((unsigned long)(REC->mode))"); + return pos; +} + static int __init __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) { + bool is_string = entry->user_arg_is_str; int i; int pos = 0; - /* When len=0, we just calculate the needed length */ -#define LEN_OR_ZERO (len ? len - pos : 0) + switch (entry->syscall_nr) { + case __NR_openat: + return sys_enter_openat_print_fmt(entry, buf, len); + default: + break; + } pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); for (i = 0; i < entry->nb_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s", - entry->args[i], sizeof(unsigned long), - i == entry->nb_args - 1 ? "" : ", "); + if (i) + pos += snprintf(buf + pos, LEN_OR_ZERO, ", "); + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx", + entry->args[i], sizeof(unsigned long)); + + if (!(BIT(i) & entry->user_mask)) + continue; + + /* Add the format for the user space string or array */ + if (entry->user_arg_size < 0 || is_string) + pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\""); + else + pos += snprintf(buf + pos, LEN_OR_ZERO, " (%%s)"); } pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); for (i = 0; i < entry->nb_args; i++) { pos += snprintf(buf + pos, LEN_OR_ZERO, ", ((unsigned long)(REC->%s))", entry->args[i]); + if (!(BIT(i) & entry->user_mask)) + continue; + /* The user space data for arg has name __<arg>_val */ + if (entry->user_arg_size < 0 || is_string) { + pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)", + entry->args[i]); + } else { + pos += snprintf(buf + pos, LEN_OR_ZERO, ", __print_dynamic_array(__%s_val, 1)", + entry->args[i]); + } } #undef LEN_OR_ZERO @@ -277,8 +527,11 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call) { struct syscall_trace_enter trace; struct syscall_metadata *meta = call->data; + unsigned long mask; + char *arg; int offset = offsetof(typeof(trace), args); int ret = 0; + int len; int i; for (i = 0; i < 
meta->nb_args; i++) { @@ -291,9 +544,320 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call) offset += sizeof(unsigned long); } + if (ret || !meta->user_mask) + return ret; + + mask = meta->user_mask; + + while (mask) { + int idx = ffs(mask) - 1; + mask &= ~BIT(idx); + + /* + * User space data is faulted into a temporary buffer and then + * added as a dynamic string or array to the end of the event. + * The user space data name for the arg pointer is + * "__<arg>_val". + */ + len = strlen(meta->args[idx]) + sizeof("___val"); + arg = kmalloc(len, GFP_KERNEL); + if (WARN_ON_ONCE(!arg)) { + meta->user_mask = 0; + return -ENOMEM; + } + + snprintf(arg, len, "__%s_val", meta->args[idx]); + + ret = trace_define_field(call, "__data_loc char[]", + arg, offset, sizeof(int), 0, + FILTER_OTHER); + if (ret) { + kfree(arg); + break; + } + offset += 4; + } return ret; } +/* + * Create a per CPU temporary buffer to copy user space pointers into. + * + * SYSCALL_FAULT_USER_MAX is the amount to copy from user space. + * (defined in kernel/trace/trace.h) + + * SYSCALL_FAULT_ARG_SZ is the amount to copy from user space plus the + * nul terminating byte and possibly appended EXTRA (4 bytes). + * + * SYSCALL_FAULT_BUF_SZ holds the size of the per CPU buffer to use + * to copy memory from user space addresses into that will hold + * 3 args as only 3 args are allowed to be copied from system calls. 
+ */ +#define SYSCALL_FAULT_ARG_SZ (SYSCALL_FAULT_USER_MAX + 1 + 4) +#define SYSCALL_FAULT_MAX_CNT 3 +#define SYSCALL_FAULT_BUF_SZ (SYSCALL_FAULT_ARG_SZ * SYSCALL_FAULT_MAX_CNT) + +/* Use the tracing per CPU buffer infrastructure to copy from user space */ +struct syscall_user_buffer { + struct trace_user_buf_info buf; + struct rcu_head rcu; +}; + +static struct syscall_user_buffer *syscall_buffer; + +static int syscall_fault_buffer_enable(void) +{ + struct syscall_user_buffer *sbuf; + int ret; + + lockdep_assert_held(&syscall_trace_lock); + + if (syscall_buffer) { + trace_user_fault_get(&syscall_buffer->buf); + return 0; + } + + sbuf = kmalloc(sizeof(*sbuf), GFP_KERNEL); + if (!sbuf) + return -ENOMEM; + + ret = trace_user_fault_init(&sbuf->buf, SYSCALL_FAULT_BUF_SZ); + if (ret < 0) { + kfree(sbuf); + return ret; + } + + WRITE_ONCE(syscall_buffer, sbuf); + + return 0; +} + +static void rcu_free_syscall_buffer(struct rcu_head *rcu) +{ + struct syscall_user_buffer *sbuf = + container_of(rcu, struct syscall_user_buffer, rcu); + + trace_user_fault_destroy(&sbuf->buf); + kfree(sbuf); +} + + +static void syscall_fault_buffer_disable(void) +{ + struct syscall_user_buffer *sbuf = syscall_buffer; + + lockdep_assert_held(&syscall_trace_lock); + + if (trace_user_fault_put(&sbuf->buf)) + return; + + WRITE_ONCE(syscall_buffer, NULL); + call_rcu_tasks_trace(&sbuf->rcu, rcu_free_syscall_buffer); +} + +struct syscall_args { + char *ptr_array[SYSCALL_FAULT_MAX_CNT]; + int read[SYSCALL_FAULT_MAX_CNT]; + int uargs; +}; + +static int syscall_copy_user(char *buf, const char __user *ptr, + size_t size, void *data) +{ + struct syscall_args *args = data; + int ret; + + for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) { + ptr = (char __user *)args->ptr_array[i]; + ret = strncpy_from_user(buf, ptr, size); + args->read[i] = ret; + } + return 0; +} + +static int syscall_copy_user_array(char *buf, const char __user *ptr, + size_t size, void *data) +{ + struct syscall_args 
*args = data; + int ret; + + for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) { + ptr = (char __user *)args->ptr_array[i]; + ret = __copy_from_user(buf, ptr, size); + args->read[i] = ret ? -1 : size; + } + return 0; +} + +static char *sys_fault_user(unsigned int buf_size, + struct syscall_metadata *sys_data, + struct syscall_user_buffer *sbuf, + unsigned long *args, + unsigned int data_size[SYSCALL_FAULT_MAX_CNT]) +{ + trace_user_buf_copy syscall_copy = syscall_copy_user; + unsigned long mask = sys_data->user_mask; + unsigned long size = SYSCALL_FAULT_ARG_SZ - 1; + struct syscall_args sargs; + bool array = false; + char *buffer; + char *buf; + int ret; + int i = 0; + + /* The extra is appended to the user data in the buffer */ + BUILD_BUG_ON(SYSCALL_FAULT_USER_MAX + sizeof(EXTRA) >= + SYSCALL_FAULT_ARG_SZ); + + /* + * If this system call event has a size argument, use + * it to define how much of user space memory to read, + * and read it as an array and not a string. + */ + if (sys_data->user_arg_size >= 0) { + array = true; + size = args[sys_data->user_arg_size]; + if (size > SYSCALL_FAULT_ARG_SZ - 1) + size = SYSCALL_FAULT_ARG_SZ - 1; + syscall_copy = syscall_copy_user_array; + } + + while (mask) { + int idx = ffs(mask) - 1; + mask &= ~BIT(idx); + + if (WARN_ON_ONCE(i == SYSCALL_FAULT_MAX_CNT)) + break; + + /* Get the pointer to user space memory to read */ + sargs.ptr_array[i++] = (char *)args[idx]; + } + + sargs.uargs = i; + + /* Clear the values that are not used */ + for (; i < SYSCALL_FAULT_MAX_CNT; i++) { + data_size[i] = -1; /* Denotes no pointer */ + } + + /* A zero size means do not even try */ + if (!buf_size) + return NULL; + + buffer = trace_user_fault_read(&sbuf->buf, NULL, size, + syscall_copy, &sargs); + if (!buffer) + return NULL; + + buf = buffer; + for (i = 0; i < sargs.uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) { + + ret = sargs.read[i]; + if (ret < 0) + continue; + buf[ret] = '\0'; + + /* For strings, replace any non-printable 
characters with '.' */ + if (!array) { + for (int x = 0; x < ret; x++) { + if (!isprint(buf[x])) + buf[x] = '.'; + } + + size = min(buf_size, SYSCALL_FAULT_USER_MAX); + + /* + * If the text was truncated due to our max limit, + * add "..." to the string. + */ + if (ret > size) { + strscpy(buf + size, EXTRA, sizeof(EXTRA)); + ret = size + sizeof(EXTRA); + } else { + buf[ret++] = '\0'; + } + } else { + ret = min((unsigned int)ret, buf_size); + } + data_size[i] = ret; + } + + return buffer; +} + +static int +syscall_get_data(struct syscall_metadata *sys_data, unsigned long *args, + char **buffer, int *size, int *user_sizes, int *uargs, + int buf_size) +{ + struct syscall_user_buffer *sbuf; + int i; + + /* If the syscall_buffer is NULL, tracing is being shutdown */ + sbuf = READ_ONCE(syscall_buffer); + if (!sbuf) + return -1; + + *buffer = sys_fault_user(buf_size, sys_data, sbuf, args, user_sizes); + /* + * user_size is the amount of data to append. + * Need to add 4 for the meta field that points to + * the user memory at the end of the event and also + * stores its size. + */ + for (i = 0; i < SYSCALL_FAULT_MAX_CNT; i++) { + if (user_sizes[i] < 0) + break; + *size += user_sizes[i] + 4; + } + /* Save the number of user read arguments of this syscall */ + *uargs = i; + return 0; +} + +static void syscall_put_data(struct syscall_metadata *sys_data, + struct syscall_trace_enter *entry, + char *buffer, int size, int *user_sizes, int uargs) +{ + char *buf = buffer; + void *ptr; + int val; + + /* + * Set the pointer to point to the meta data of the event + * that has information about the stored user space memory. + */ + ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args; + + /* + * The meta data will store the offset of the user data from + * the beginning of the event. That is after the static arguments + * and the meta data fields. 
+ */ + val = (ptr - (void *)entry) + 4 * uargs; + + for (int i = 0; i < uargs; i++) { + + if (i) + val += user_sizes[i - 1]; + + /* Store the offset and the size into the meta data */ + *(int *)ptr = val | (user_sizes[i] << 16); + + /* Skip the meta data */ + ptr += 4; + } + + for (int i = 0; i < uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) { + /* Nothing to do if the user space was empty or faulted */ + if (!user_sizes[i]) + continue; + + memcpy(ptr, buf, user_sizes[i]); + ptr += user_sizes[i]; + } +} + static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) { struct trace_array *tr = data; @@ -302,15 +866,18 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct trace_event_buffer fbuffer; unsigned long args[6]; + char *user_ptr; + int user_sizes[SYSCALL_FAULT_MAX_CNT] = {}; int syscall_nr; - int size; + int size = 0; + int uargs = 0; + bool mayfault; /* * Syscall probe called with preemption enabled, but the ring * buffer and per-cpu data require preemption to be disabled. 
*/ might_fault(); - guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) @@ -327,7 +894,20 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) if (!sys_data) return; - size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; + /* Check if this syscall event faults in user space memory */ + mayfault = sys_data->user_mask != 0; + + guard(preempt_notrace)(); + + syscall_get_arguments(current, regs, args); + + if (mayfault) { + if (syscall_get_data(sys_data, args, &user_ptr, + &size, user_sizes, &uargs, tr->syscall_buf_sz) < 0) + return; + } + + size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; entry = trace_event_buffer_reserve(&fbuffer, trace_file, size); if (!entry) @@ -335,9 +915,12 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) entry = ring_buffer_event_data(fbuffer.event); entry->nr = syscall_nr; - syscall_get_arguments(current, regs, args); + memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args); + if (mayfault) + syscall_put_data(sys_data, entry, user_ptr, size, user_sizes, uargs); + trace_event_buffer_commit(&fbuffer); } @@ -386,39 +969,50 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) static int reg_event_syscall_enter(struct trace_event_file *file, struct trace_event_call *call) { + struct syscall_metadata *sys_data = call->data; struct trace_array *tr = file->tr; int ret = 0; int num; - num = ((struct syscall_metadata *)call->data)->syscall_nr; + num = sys_data->syscall_nr; if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return -ENOSYS; - mutex_lock(&syscall_trace_lock); - if (!tr->sys_refcount_enter) + guard(mutex)(&syscall_trace_lock); + if (sys_data->user_mask) { + ret = syscall_fault_buffer_enable(); + if (ret < 0) + return ret; + } + if (!tr->sys_refcount_enter) { ret = register_trace_sys_enter(ftrace_syscall_enter, tr); - if (!ret) { - 
WRITE_ONCE(tr->enter_syscall_files[num], file); - tr->sys_refcount_enter++; + if (ret < 0) { + if (sys_data->user_mask) + syscall_fault_buffer_disable(); + return ret; + } } - mutex_unlock(&syscall_trace_lock); - return ret; + WRITE_ONCE(tr->enter_syscall_files[num], file); + tr->sys_refcount_enter++; + return 0; } static void unreg_event_syscall_enter(struct trace_event_file *file, struct trace_event_call *call) { + struct syscall_metadata *sys_data = call->data; struct trace_array *tr = file->tr; int num; - num = ((struct syscall_metadata *)call->data)->syscall_nr; + num = sys_data->syscall_nr; if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return; - mutex_lock(&syscall_trace_lock); + guard(mutex)(&syscall_trace_lock); tr->sys_refcount_enter--; WRITE_ONCE(tr->enter_syscall_files[num], NULL); if (!tr->sys_refcount_enter) unregister_trace_sys_enter(ftrace_syscall_enter, tr); - mutex_unlock(&syscall_trace_lock); + if (sys_data->user_mask) + syscall_fault_buffer_disable(); } static int reg_event_syscall_exit(struct trace_event_file *file, @@ -459,6 +1053,215 @@ static void unreg_event_syscall_exit(struct trace_event_file *file, mutex_unlock(&syscall_trace_lock); } +/* + * For system calls that reference user space memory that can + * be recorded into the event, set the system call meta data's user_mask + * to the "args" index that points to the user space memory to retrieve. 
+ */ +static void check_faultable_syscall(struct trace_event_call *call, int nr) +{ + struct syscall_metadata *sys_data = call->data; + unsigned long mask; + + /* Only work on entry */ + if (sys_data->enter_event != call) + return; + + sys_data->user_arg_size = -1; + + switch (nr) { + /* user arg 1 with size arg at 2 */ + case __NR_write: +#ifdef __NR_mq_timedsend + case __NR_mq_timedsend: +#endif + case __NR_pwrite64: + sys_data->user_mask = BIT(1); + sys_data->user_arg_size = 2; + break; + /* user arg 0 with size arg at 1 as string */ + case __NR_setdomainname: + case __NR_sethostname: + sys_data->user_mask = BIT(0); + sys_data->user_arg_size = 1; + sys_data->user_arg_is_str = 1; + break; +#ifdef __NR_kexec_file_load + /* user arg 4 with size arg at 3 as string */ + case __NR_kexec_file_load: + sys_data->user_mask = BIT(4); + sys_data->user_arg_size = 3; + sys_data->user_arg_is_str = 1; + break; +#endif + /* user arg at position 0 */ +#ifdef __NR_access + case __NR_access: +#endif + case __NR_acct: + case __NR_chdir: +#ifdef __NR_chown + case __NR_chown: +#endif +#ifdef __NR_chmod + case __NR_chmod: +#endif + case __NR_chroot: +#ifdef __NR_creat + case __NR_creat: +#endif + case __NR_delete_module: + case __NR_execve: + case __NR_fsopen: +#ifdef __NR_lchown + case __NR_lchown: +#endif +#ifdef __NR_open + case __NR_open: +#endif + case __NR_memfd_create: +#ifdef __NR_mkdir + case __NR_mkdir: +#endif +#ifdef __NR_mknod + case __NR_mknod: +#endif + case __NR_mq_open: + case __NR_mq_unlink: +#ifdef __NR_readlink + case __NR_readlink: +#endif +#ifdef __NR_rmdir + case __NR_rmdir: +#endif + case __NR_shmdt: +#ifdef __NR_statfs + case __NR_statfs: +#endif + case __NR_swapon: + case __NR_swapoff: +#ifdef __NR_truncate + case __NR_truncate: +#endif +#ifdef __NR_unlink + case __NR_unlink: +#endif + case __NR_umount2: +#ifdef __NR_utime + case __NR_utime: +#endif +#ifdef __NR_utimes + case __NR_utimes: +#endif + sys_data->user_mask = BIT(0); + break; + /* user arg at 
position 1 */ + case __NR_execveat: + case __NR_faccessat: + case __NR_faccessat2: + case __NR_finit_module: + case __NR_fchmodat: + case __NR_fchmodat2: + case __NR_fchownat: + case __NR_fgetxattr: + case __NR_flistxattr: + case __NR_fsetxattr: + case __NR_fspick: + case __NR_fremovexattr: +#ifdef __NR_futimesat + case __NR_futimesat: +#endif + case __NR_inotify_add_watch: + case __NR_mkdirat: + case __NR_mknodat: + case __NR_mount_setattr: + case __NR_name_to_handle_at: +#ifdef __NR_newfstatat + case __NR_newfstatat: +#endif + case __NR_openat: + case __NR_openat2: + case __NR_open_tree: + case __NR_open_tree_attr: + case __NR_readlinkat: + case __NR_quotactl: + case __NR_syslog: + case __NR_statx: + case __NR_unlinkat: +#ifdef __NR_utimensat + case __NR_utimensat: +#endif + sys_data->user_mask = BIT(1); + break; + /* user arg at position 2 */ + case __NR_init_module: + case __NR_fsconfig: + sys_data->user_mask = BIT(2); + break; + /* user arg at position 4 */ + case __NR_fanotify_mark: + sys_data->user_mask = BIT(4); + break; + /* 2 user args, 0 and 1 */ + case __NR_add_key: + case __NR_getxattr: + case __NR_lgetxattr: + case __NR_lremovexattr: +#ifdef __NR_link + case __NR_link: +#endif + case __NR_listxattr: + case __NR_llistxattr: + case __NR_lsetxattr: + case __NR_pivot_root: + case __NR_removexattr: +#ifdef __NR_rename + case __NR_rename: +#endif + case __NR_request_key: + case __NR_setxattr: +#ifdef __NR_symlink + case __NR_symlink: +#endif + sys_data->user_mask = BIT(0) | BIT(1); + break; + /* 2 user args, 0 and 2 */ + case __NR_symlinkat: + sys_data->user_mask = BIT(0) | BIT(2); + break; + /* 2 user args, 1 and 3 */ + case __NR_getxattrat: + case __NR_linkat: + case __NR_listxattrat: + case __NR_move_mount: +#ifdef __NR_renameat + case __NR_renameat: +#endif + case __NR_renameat2: + case __NR_removexattrat: + case __NR_setxattrat: + sys_data->user_mask = BIT(1) | BIT(3); + break; + case __NR_mount: /* Just dev_name and dir_name, TODO add type */ + 
sys_data->user_mask = BIT(0) | BIT(1) | BIT(2); + break; + default: + sys_data->user_mask = 0; + return; + } + + if (sys_data->user_arg_size < 0) + return; + + /* + * The user_arg_size can only be used when the system call + * is reading only a single address from user space. + */ + mask = sys_data->user_mask; + if (WARN_ON(mask & (mask - 1))) + sys_data->user_arg_size = -1; +} + static int __init init_syscall_trace(struct trace_event_call *call) { int id; @@ -471,6 +1274,8 @@ static int __init init_syscall_trace(struct trace_event_call *call) return -ENOSYS; } + check_faultable_syscall(call, num); + if (set_syscall_print_fmt(call) < 0) return -ENOMEM; @@ -598,9 +1403,14 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) struct hlist_head *head; unsigned long args[6]; bool valid_prog_array; + bool mayfault; + char *user_ptr; + int user_sizes[SYSCALL_FAULT_MAX_CNT] = {}; + int buf_size = CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT; int syscall_nr; int rctx; - int size; + int size = 0; + int uargs = 0; /* * Syscall probe called with preemption enabled, but the ring @@ -619,13 +1429,24 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) if (!sys_data) return; + syscall_get_arguments(current, regs, args); + + /* Check if this syscall event faults in user space memory */ + mayfault = sys_data->user_mask != 0; + + if (mayfault) { + if (syscall_get_data(sys_data, args, &user_ptr, + &size, user_sizes, &uargs, buf_size) < 0) + return; + } + head = this_cpu_ptr(sys_data->enter_event->perf_events); valid_prog_array = bpf_prog_array_valid(sys_data->enter_event); if (!valid_prog_array && hlist_empty(head)) return; /* get the size after alignment with the u32 buffer size field */ - size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); + size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); @@ -634,9 +1455,11 @@ static void 
perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) return; rec->nr = syscall_nr; - syscall_get_arguments(current, regs, args); memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args); + if (mayfault) + syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs); + if ((valid_prog_array && !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) || hlist_empty(head)) { @@ -651,36 +1474,46 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) static int perf_sysenter_enable(struct trace_event_call *call) { - int ret = 0; + struct syscall_metadata *sys_data = call->data; int num; + int ret; - num = ((struct syscall_metadata *)call->data)->syscall_nr; + num = sys_data->syscall_nr; - mutex_lock(&syscall_trace_lock); - if (!sys_perf_refcount_enter) + guard(mutex)(&syscall_trace_lock); + if (sys_data->user_mask) { + ret = syscall_fault_buffer_enable(); + if (ret < 0) + return ret; + } + if (!sys_perf_refcount_enter) { ret = register_trace_sys_enter(perf_syscall_enter, NULL); - if (ret) { - pr_info("event trace: Could not activate syscall entry trace point"); - } else { - set_bit(num, enabled_perf_enter_syscalls); - sys_perf_refcount_enter++; + if (ret) { + pr_info("event trace: Could not activate syscall entry trace point"); + if (sys_data->user_mask) + syscall_fault_buffer_disable(); + return ret; + } } - mutex_unlock(&syscall_trace_lock); - return ret; + set_bit(num, enabled_perf_enter_syscalls); + sys_perf_refcount_enter++; + return 0; } static void perf_sysenter_disable(struct trace_event_call *call) { + struct syscall_metadata *sys_data = call->data; int num; - num = ((struct syscall_metadata *)call->data)->syscall_nr; + num = sys_data->syscall_nr; - mutex_lock(&syscall_trace_lock); + guard(mutex)(&syscall_trace_lock); sys_perf_refcount_enter--; clear_bit(num, enabled_perf_enter_syscalls); if (!sys_perf_refcount_enter) unregister_trace_sys_enter(perf_syscall_enter, NULL); - 
mutex_unlock(&syscall_trace_lock); + if (sys_data->user_mask) + syscall_fault_buffer_disable(); } static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs, @@ -757,22 +1590,21 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) static int perf_sysexit_enable(struct trace_event_call *call) { - int ret = 0; int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - mutex_lock(&syscall_trace_lock); - if (!sys_perf_refcount_exit) - ret = register_trace_sys_exit(perf_syscall_exit, NULL); - if (ret) { - pr_info("event trace: Could not activate syscall exit trace point"); - } else { - set_bit(num, enabled_perf_exit_syscalls); - sys_perf_refcount_exit++; + guard(mutex)(&syscall_trace_lock); + if (!sys_perf_refcount_exit) { + int ret = register_trace_sys_exit(perf_syscall_exit, NULL); + if (ret) { + pr_info("event trace: Could not activate syscall exit trace point"); + return ret; + } } - mutex_unlock(&syscall_trace_lock); - return ret; + set_bit(num, enabled_perf_exit_syscalls); + sys_perf_refcount_exit++; + return 0; } static void perf_sysexit_disable(struct trace_event_call *call) @@ -781,12 +1613,11 @@ static void perf_sysexit_disable(struct trace_event_call *call) num = ((struct syscall_metadata *)call->data)->syscall_nr; - mutex_lock(&syscall_trace_lock); + guard(mutex)(&syscall_trace_lock); sys_perf_refcount_exit--; clear_bit(num, enabled_perf_exit_syscalls); if (!sys_perf_refcount_exit) unregister_trace_sys_exit(perf_syscall_exit, NULL); - mutex_unlock(&syscall_trace_lock); } #endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 430d09c49462..1b4f32e2b9bd 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -533,21 +533,26 @@ static int register_trace_uprobe(struct trace_uprobe *tu) return ret; } +DEFINE_FREE(free_trace_uprobe, struct trace_uprobe *, if (_T) free_trace_uprobe(_T)) + /* * Argument syntax: * - Add uprobe: 
p|r[:[GRP/][EVENT]] PATH:OFFSET[%return][(REF)] [FETCHARGS] */ static int __trace_uprobe_create(int argc, const char **argv) { + struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL; + struct trace_uprobe *tu __free(free_trace_uprobe) = NULL; + const char *trlog __free(trace_probe_log_clear) = NULL; const char *event = NULL, *group = UPROBE_EVENT_SYSTEM; - char *arg, *filename, *rctr, *rctr_end, *tmp; + struct path path __free(path_put) = {}; unsigned long offset, ref_ctr_offset; + char *filename __free(kfree) = NULL; + char *arg, *rctr, *rctr_end, *tmp; char *gbuf __free(kfree) = NULL; char *buf __free(kfree) = NULL; enum probe_print_type ptype; - struct trace_uprobe *tu; bool is_return = false; - struct path path; int i, ret; ref_ctr_offset = 0; @@ -565,7 +570,7 @@ static int __trace_uprobe_create(int argc, const char **argv) if (argc < 2) return -ECANCELED; - trace_probe_log_init("trace_uprobe", argc, argv); + trlog = trace_probe_log_init("trace_uprobe", argc, argv); if (argc - 2 > MAX_TRACE_ARGS) { trace_probe_log_set_index(2); @@ -585,10 +590,8 @@ static int __trace_uprobe_create(int argc, const char **argv) /* Find the last occurrence, in case the path contains ':' too. */ arg = strrchr(filename, ':'); - if (!arg || !isdigit(arg[1])) { - kfree(filename); + if (!arg || !isdigit(arg[1])) return -ECANCELED; - } trace_probe_log_set_index(1); /* filename is the 2nd argument */ @@ -596,14 +599,11 @@ static int __trace_uprobe_create(int argc, const char **argv) ret = kern_path(filename, LOOKUP_FOLLOW, &path); if (ret) { trace_probe_log_err(0, FILE_NOT_FOUND); - kfree(filename); - trace_probe_log_clear(); return ret; } if (!d_is_reg(path.dentry)) { trace_probe_log_err(0, NO_REGULAR_FILE); - ret = -EINVAL; - goto fail_address_parse; + return -EINVAL; } /* Parse reference counter offset if specified. 
*/ @@ -611,16 +611,14 @@ static int __trace_uprobe_create(int argc, const char **argv) if (rctr) { rctr_end = strchr(rctr, ')'); if (!rctr_end) { - ret = -EINVAL; rctr_end = rctr + strlen(rctr); trace_probe_log_err(rctr_end - filename, REFCNT_OPEN_BRACE); - goto fail_address_parse; + return -EINVAL; } else if (rctr_end[1] != '\0') { - ret = -EINVAL; trace_probe_log_err(rctr_end + 1 - filename, BAD_REFCNT_SUFFIX); - goto fail_address_parse; + return -EINVAL; } *rctr++ = '\0'; @@ -628,7 +626,7 @@ static int __trace_uprobe_create(int argc, const char **argv) ret = kstrtoul(rctr, 0, &ref_ctr_offset); if (ret) { trace_probe_log_err(rctr - filename, BAD_REFCNT); - goto fail_address_parse; + return ret; } } @@ -640,8 +638,7 @@ static int __trace_uprobe_create(int argc, const char **argv) is_return = true; } else { trace_probe_log_err(tmp - filename, BAD_ADDR_SUFFIX); - ret = -EINVAL; - goto fail_address_parse; + return -EINVAL; } } @@ -649,7 +646,7 @@ static int __trace_uprobe_create(int argc, const char **argv) ret = kstrtoul(arg, 0, &offset); if (ret) { trace_probe_log_err(arg - filename, BAD_UPROBE_OFFS); - goto fail_address_parse; + return ret; } /* setup a probe */ @@ -657,12 +654,12 @@ static int __trace_uprobe_create(int argc, const char **argv) if (event) { gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); if (!gbuf) - goto fail_mem; + return -ENOMEM; ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) - goto fail_address_parse; + return ret; } if (!event) { @@ -671,7 +668,7 @@ static int __trace_uprobe_create(int argc, const char **argv) tail = kstrdup(kbasename(filename), GFP_KERNEL); if (!tail) - goto fail_mem; + return -ENOMEM; ptr = strpbrk(tail, ".-_"); if (ptr) @@ -679,7 +676,7 @@ static int __trace_uprobe_create(int argc, const char **argv) buf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); if (!buf) - goto fail_mem; + return -ENOMEM; snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset); event = buf; kfree(tail); @@ 
-693,51 +690,36 @@ static int __trace_uprobe_create(int argc, const char **argv) ret = PTR_ERR(tu); /* This must return -ENOMEM otherwise there is a bug */ WARN_ON_ONCE(ret != -ENOMEM); - goto fail_address_parse; + return ret; } tu->offset = offset; tu->ref_ctr_offset = ref_ctr_offset; tu->path = path; - tu->filename = filename; + /* Clear @path so that it will not be freed by path_put() */ + memset(&path, 0, sizeof(path)); + tu->filename = no_free_ptr(filename); + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + ctx->flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER; /* parse arguments */ for (i = 0; i < argc; i++) { - struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) - = kzalloc(sizeof(*ctx), GFP_KERNEL); - - if (!ctx) { - ret = -ENOMEM; - goto error; - } - ctx->flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER; trace_probe_log_set_index(i + 2); ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], ctx); if (ret) - goto error; + return ret; } ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; ret = traceprobe_set_print_fmt(&tu->tp, ptype); if (ret < 0) - goto error; + return ret; ret = register_trace_uprobe(tu); if (!ret) - goto out; - -error: - free_trace_uprobe(tu); -out: - trace_probe_log_clear(); - return ret; - -fail_mem: - ret = -ENOMEM; - -fail_address_parse: - trace_probe_log_clear(); - path_put(&path); - kfree(filename); + tu = NULL; return ret; } diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c index e066d31d08f8..fe9bf8db1922 100644 --- a/kernel/vmcore_info.c +++ b/kernel/vmcore_info.c @@ -31,6 +31,13 @@ u32 *vmcoreinfo_note; /* trusted vmcoreinfo, e.g.
we can make a copy in the crash memory */ static unsigned char *vmcoreinfo_data_safecopy; +struct hwerr_info { + atomic_t count; + time64_t timestamp; +}; + +static struct hwerr_info hwerr_data[HWERR_RECOV_MAX]; + Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len) { @@ -118,6 +125,16 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void) } EXPORT_SYMBOL(paddr_vmcoreinfo_note); +void hwerr_log_error_type(enum hwerr_error_type src) +{ + if (src < 0 || src >= HWERR_RECOV_MAX) + return; + + atomic_inc(&hwerr_data[src].count); + WRITE_ONCE(hwerr_data[src].timestamp, ktime_get_real_seconds()); +} +EXPORT_SYMBOL_GPL(hwerr_log_error_type); + static int __init crash_save_vmcoreinfo_init(void) { vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5b62d1002783..0685e3a8aa0a 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -25,6 +25,7 @@ #include <linux/stop_machine.h> #include <linux/sysctl.h> #include <linux/tick.h> +#include <linux/sys_info.h> #include <linux/sched/clock.h> #include <linux/sched/debug.h> @@ -65,6 +66,13 @@ int __read_mostly sysctl_hardlockup_all_cpu_backtrace; unsigned int __read_mostly hardlockup_panic = IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC); +/* + * bitmasks to control what kinds of system info to be printed when + * hard lockup is detected, it could be task, memory, lock etc. + * Refer include/linux/sys_info.h for detailed bit definition. + */ +static unsigned long hardlockup_si_mask; + #ifdef CONFIG_SYSFS static unsigned int hardlockup_count; @@ -178,11 +186,15 @@ static void watchdog_hardlockup_kick(void) void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) { + int hardlockup_all_cpu_backtrace; + if (per_cpu(watchdog_hardlockup_touched, cpu)) { per_cpu(watchdog_hardlockup_touched, cpu) = false; return; } + hardlockup_all_cpu_backtrace = (hardlockup_si_mask & SYS_INFO_ALL_BT) ? 
+ 1 : sysctl_hardlockup_all_cpu_backtrace; /* * Check for a hardlockup by making sure the CPU's timer * interrupt is incrementing. The timer interrupt should have @@ -196,6 +208,15 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) #ifdef CONFIG_SYSFS ++hardlockup_count; #endif + /* + * A poorly behaving BPF scheduler can trigger hard lockup by + * e.g. putting numerous affinitized tasks in a single queue and + * directing all CPUs at it. The following call can return true + * only once when sched_ext is enabled and will immediately + * abort the BPF scheduler and print out a warning message. + */ + if (scx_hardlockup(cpu)) + return; /* Only print hardlockups once. */ if (per_cpu(watchdog_hardlockup_warned, cpu)) @@ -205,7 +226,7 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) * Prevent multiple hard-lockup reports if one cpu is already * engaged in dumping all cpu back traces. */ - if (sysctl_hardlockup_all_cpu_backtrace) { + if (hardlockup_all_cpu_backtrace) { if (test_and_set_bit_lock(0, &hard_lockup_nmi_warn)) return; } @@ -234,12 +255,13 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) trigger_single_cpu_backtrace(cpu); } - if (sysctl_hardlockup_all_cpu_backtrace) { + if (hardlockup_all_cpu_backtrace) { trigger_allbutcpu_cpu_backtrace(cpu); if (!hardlockup_panic) clear_bit_unlock(0, &hard_lockup_nmi_warn); } + sys_info(hardlockup_si_mask & ~SYS_INFO_ALL_BT); if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); @@ -330,6 +352,13 @@ static void lockup_detector_update_enable(void) int __read_mostly sysctl_softlockup_all_cpu_backtrace; #endif +/* + * bitmasks to control what kinds of system info to be printed when + * soft lockup is detected, it could be task, memory, lock etc. + * Refer include/linux/sys_info.h for detailed bit definition. 
+ */ +static unsigned long softlockup_si_mask; + static struct cpumask watchdog_allowed_mask __read_mostly; /* Global variables, exported for sysctl */ @@ -746,7 +775,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) unsigned long touch_ts, period_ts, now; struct pt_regs *regs = get_irq_regs(); int duration; - int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; + int softlockup_all_cpu_backtrace; unsigned long flags; if (!watchdog_enabled) @@ -758,6 +787,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (panic_in_progress()) return HRTIMER_NORESTART; + softlockup_all_cpu_backtrace = (softlockup_si_mask & SYS_INFO_ALL_BT) ? + 1 : sysctl_softlockup_all_cpu_backtrace; + watchdog_hardlockup_kick(); /* kick the softlockup detector */ @@ -846,6 +878,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) } add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); + sys_info(softlockup_si_mask & ~SYS_INFO_ALL_BT); if (softlockup_panic) panic("softlockup: hung tasks"); } @@ -1197,6 +1230,13 @@ static const struct ctl_table watchdog_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "softlockup_sys_info", + .data = &softlockup_si_mask, + .maxlen = sizeof(softlockup_si_mask), + .mode = 0644, + .proc_handler = sysctl_sys_info_handler, + }, #ifdef CONFIG_SMP { .procname = "softlockup_all_cpu_backtrace", @@ -1219,6 +1259,13 @@ static const struct ctl_table watchdog_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "hardlockup_sys_info", + .data = &hardlockup_si_mask, + .maxlen = sizeof(hardlockup_si_mask), + .mode = 0644, + .proc_handler = sysctl_sys_info_handler, + }, #ifdef CONFIG_SMP { .procname = "hardlockup_all_cpu_backtrace", @@ -1231,14 +1278,11 @@ static const struct ctl_table watchdog_sysctls[] = { }, #endif /* CONFIG_SMP */ #endif -}; - -static struct ctl_table watchdog_hardlockup_sysctl[] = { { .procname = "nmi_watchdog", 
.data = &watchdog_hardlockup_user_enabled, .maxlen = sizeof(int), - .mode = 0444, + .mode = 0644, .proc_handler = proc_nmi_watchdog, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, @@ -1248,10 +1292,6 @@ static struct ctl_table watchdog_hardlockup_sysctl[] = { static void __init watchdog_sysctl_init(void) { register_sysctl_init("kernel", watchdog_sysctls); - - if (watchdog_hardlockup_available) - watchdog_hardlockup_sysctl[0].mode = 0644; - register_sysctl_init("kernel", watchdog_hardlockup_sysctl); } #else diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 45320e27a16c..253311af47c6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -541,12 +541,6 @@ static void show_one_worker_pool(struct worker_pool *pool); !lockdep_is_held(&wq_pool_mutex), \ "RCU or wq_pool_mutex should be held") -#define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ - RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() && \ - !lockdep_is_held(&wq->mutex) && \ - !lockdep_is_held(&wq_pool_mutex), \ - "RCU, wq->mutex or wq_pool_mutex should be held") - #define for_each_bh_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(bh_worker_pools, cpu)[0]; \ (pool) < &per_cpu(bh_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ @@ -3443,6 +3437,27 @@ sleep: goto woke_up; } +static bool assign_rescuer_work(struct pool_workqueue *pwq, struct worker *rescuer) +{ + struct worker_pool *pool = pwq->pool; + struct work_struct *work, *n; + + /* need rescue? */ + if (!pwq->nr_active || !need_to_create_worker(pool)) + return false; + + /* + * Slurp in all works issued via this workqueue and + * process'em. 
+ */ + list_for_each_entry_safe(work, n, &pool->worklist, entry) { + if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n)) + pwq->stats[PWQ_STAT_RESCUED]++; + } + + return !list_empty(&rescuer->scheduled); +} + /** * rescuer_thread - the rescuer thread function * @__rescuer: self @@ -3497,7 +3512,6 @@ repeat: struct pool_workqueue *pwq = list_first_entry(&wq->maydays, struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; - struct work_struct *work, *n; __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); @@ -3508,18 +3522,9 @@ repeat: raw_spin_lock_irq(&pool->lock); - /* - * Slurp in all works issued via this workqueue and - * process'em. - */ WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); - list_for_each_entry_safe(work, n, &pool->worklist, entry) { - if (get_work_pwq(work) == pwq && - assign_work(work, rescuer, &n)) - pwq->stats[PWQ_STAT_RESCUED]++; - } - if (!list_empty(&rescuer->scheduled)) { + if (assign_rescuer_work(pwq, rescuer)) { process_scheduled_works(rescuer); /* @@ -3534,10 +3539,9 @@ repeat: if (pwq->nr_active && need_to_create_worker(pool)) { raw_spin_lock(&wq_mayday_lock); /* - * Queue iff we aren't racing destruction - * and somebody else hasn't queued it already. + * Queue iff somebody else hasn't queued it already. 
*/ - if (wq->rescuer && list_empty(&pwq->mayday_node)) { + if (list_empty(&pwq->mayday_node)) { get_pwq(pwq); list_add_tail(&pwq->mayday_node, &wq->maydays); } @@ -5376,11 +5380,6 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) /* update node_nr_active->max */ wq_update_node_max_active(ctx->wq, -1); - /* rescuer needs to respect wq cpumask changes */ - if (ctx->wq->rescuer) - set_cpus_allowed_ptr(ctx->wq->rescuer->task, - unbound_effective_cpumask(ctx->wq)); - mutex_unlock(&ctx->wq->mutex); } @@ -5614,10 +5613,13 @@ static int init_rescuer(struct workqueue_struct *wq) } wq->rescuer = rescuer; - if (wq->flags & WQ_UNBOUND) - kthread_bind_mask(rescuer->task, unbound_effective_cpumask(wq)); + + /* initial cpumask is consistent with the detached rescuer and unbind_worker() */ + if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) + kthread_bind_mask(rescuer->task, wq_unbound_cpumask); else kthread_bind_mask(rescuer->task, cpu_possible_mask); + wake_up_process(rescuer->task); return 0; @@ -5902,16 +5904,10 @@ void destroy_workqueue(struct workqueue_struct *wq) /* kill rescuer, if sanity checks fail, leave it w/o rescuer */ if (wq->rescuer) { - struct worker *rescuer = wq->rescuer; - - /* this prevents new queueing */ - raw_spin_lock_irq(&wq_mayday_lock); - wq->rescuer = NULL; - raw_spin_unlock_irq(&wq_mayday_lock); - /* rescuer will empty maydays list before exiting */ - kthread_stop(rescuer->task); - kfree(rescuer); + kthread_stop(wq->rescuer->task); + kfree(wq->rescuer); + wq->rescuer = NULL; } /* @@ -6937,8 +6933,26 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) } if (!ret) { + int cpu; + struct worker_pool *pool; + struct worker *worker; + mutex_lock(&wq_pool_attach_mutex); cpumask_copy(wq_unbound_cpumask, unbound_cpumask); + /* rescuer needs to respect cpumask changes when it is not attached */ + list_for_each_entry(wq, &workqueues, list) { + if (wq->rescuer && !wq->rescuer->pool) + 
unbind_worker(wq->rescuer); + } + /* DISASSOCIATED worker needs to respect wq_unbound_cpumask */ + for_each_possible_cpu(cpu) { + for_each_cpu_worker_pool(pool, cpu) { + if (!(pool->flags & POOL_DISASSOCIATED)) + continue; + for_each_pool_worker(worker, pool) + unbind_worker(worker); + } + } mutex_unlock(&wq_pool_attach_mutex); } return ret; |
