From 7bae563c0dbe0039d80a103601f64dcdb48b1481 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 15 Sep 2024 18:21:54 +0200 Subject: bpf: Constify struct btf_kind_operations struct btf_kind_operations are not modified in BTF. Constifying these structures moves some data to a read-only section, which increases overall security, especially when the structure holds some function pointers. On x86_64, with allmodconfig: Before: ====== text data bss dec hex filename 184320 7091 548 191959 2edd7 kernel/bpf/btf.o After: ===== text data bss dec hex filename 184896 6515 548 191959 2edd7 kernel/bpf/btf.o Signed-off-by: Christophe JAILLET Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/9192ab72b2e9c66aefd6520f359a20297186327f.1726417289.git.christophe.jaillet@wanadoo.fr Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 75e4fe83c509..13dd1fa1d1b9 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2808,7 +2808,7 @@ static void btf_ref_type_log(struct btf_verifier_env *env, btf_verifier_log(env, "type_id=%u", t->type); } -static struct btf_kind_operations modifier_ops = { +static const struct btf_kind_operations modifier_ops = { .check_meta = btf_ref_type_check_meta, .resolve = btf_modifier_resolve, .check_member = btf_modifier_check_member, @@ -2817,7 +2817,7 @@ static struct btf_kind_operations modifier_ops = { .show = btf_modifier_show, }; -static struct btf_kind_operations ptr_ops = { +static const struct btf_kind_operations ptr_ops = { .check_meta = btf_ref_type_check_meta, .resolve = btf_ptr_resolve, .check_member = btf_ptr_check_member, @@ -2858,7 +2858,7 @@ static void btf_fwd_type_log(struct btf_verifier_env *env, btf_verifier_log(env, "%s", btf_type_kflag(t) ?
"union" : "struct"); } -static struct btf_kind_operations fwd_ops = { +static const struct btf_kind_operations fwd_ops = { .check_meta = btf_fwd_check_meta, .resolve = btf_df_resolve, .check_member = btf_df_check_member, @@ -3109,7 +3109,7 @@ static void btf_array_show(const struct btf *btf, const struct btf_type *t, __btf_array_show(btf, t, type_id, data, bits_offset, show); } -static struct btf_kind_operations array_ops = { +static const struct btf_kind_operations array_ops = { .check_meta = btf_array_check_meta, .resolve = btf_array_resolve, .check_member = btf_array_check_member, @@ -4185,7 +4185,7 @@ static void btf_struct_show(const struct btf *btf, const struct btf_type *t, __btf_struct_show(btf, t, type_id, data, bits_offset, show); } -static struct btf_kind_operations struct_ops = { +static const struct btf_kind_operations struct_ops = { .check_meta = btf_struct_check_meta, .resolve = btf_struct_resolve, .check_member = btf_struct_check_member, @@ -4353,7 +4353,7 @@ static void btf_enum_show(const struct btf *btf, const struct btf_type *t, btf_show_end_type(show); } -static struct btf_kind_operations enum_ops = { +static const struct btf_kind_operations enum_ops = { .check_meta = btf_enum_check_meta, .resolve = btf_df_resolve, .check_member = btf_enum_check_member, @@ -4456,7 +4456,7 @@ static void btf_enum64_show(const struct btf *btf, const struct btf_type *t, btf_show_end_type(show); } -static struct btf_kind_operations enum64_ops = { +static const struct btf_kind_operations enum64_ops = { .check_meta = btf_enum64_check_meta, .resolve = btf_df_resolve, .check_member = btf_enum_check_member, @@ -4534,7 +4534,7 @@ done: btf_verifier_log(env, ")"); } -static struct btf_kind_operations func_proto_ops = { +static const struct btf_kind_operations func_proto_ops = { .check_meta = btf_func_proto_check_meta, .resolve = btf_df_resolve, /* @@ -4592,7 +4592,7 @@ static int btf_func_resolve(struct btf_verifier_env *env, return 0; } -static struct btf_kind_operations func_ops = { +static const struct btf_kind_operations func_ops = { .check_meta = btf_func_check_meta, .resolve = btf_func_resolve, .check_member = btf_df_check_member, -- cgit From 40f34d6f12e292875b8027ec66038cabb5a317f6 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Thu, 26 Sep 2024 13:30:42 +0200 Subject: bpf: Call kfree(obj) only once in free_one() A kfree() call is always used at the end of this function implementation. Thus specify such a function call only once instead of duplicating it in a previous if branch. This issue was detected by using the Coccinelle software. 
Signed-off-by: Markus Elfring Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/08987123-668c-40f3-a8ee-c3038d94f069@web.de Signed-off-by: Alexei Starovoitov --- kernel/bpf/memalloc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index b3858a76e0b3..1a1b4458114c 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -252,11 +252,8 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic) static void free_one(void *obj, bool percpu) { - if (percpu) { + if (percpu) free_percpu(((void __percpu **)obj)[1]); - kfree(obj); - return; - } kfree(obj); } -- cgit From da7d71bcb0637b7aa18934628fdb5a55f2db49a6 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 16 Sep 2024 02:17:11 -0700 Subject: bpf: Use KF_FASTCALL to mark kfuncs supporting fastcall contract In order to allow pahole to add btf_decl_tag("bpf_fastcall") for kfuncs supporting bpf_fastcall, mark such functions with KF_FASTCALL in id_set8 objects. Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240916091712.2929279-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 4 ++-- kernel/bpf/verifier.c | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1a43d06eab28..4053f279ed4c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3052,8 +3052,8 @@ BTF_ID(func, bpf_cgroup_release_dtor) #endif BTF_KFUNCS_START(common_btf_ids) -BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx) -BTF_ID_FLAGS(func, bpf_rdonly_cast) +BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx, KF_FASTCALL) +BTF_ID_FLAGS(func, bpf_rdonly_cast, KF_FASTCALL) BTF_ID_FLAGS(func, bpf_rcu_read_lock) BTF_ID_FLAGS(func, bpf_rcu_read_unlock) BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9a7ed527e47e..7d9b38ffd220 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16176,10 +16176,7 @@ static u32 kfunc_fastcall_clobber_mask(struct bpf_kfunc_call_arg_meta *meta) /* Same as verifier_inlines_helper_call() but for kfuncs, see comment above */ static bool is_fastcall_kfunc_call(struct bpf_kfunc_call_arg_meta *meta) { - if (meta->btf == btf_vmlinux) - return meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || - meta->func_id == special_kfunc_list[KF_bpf_rdonly_cast]; - return false; + return meta->kfunc_flags & KF_FASTCALL; } /* LLVM define a bpf_fastcall function attribute. -- cgit From 5bd48a3a14df4b3ee1be0757efcc0f40d4f57b35 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Thu, 10 Oct 2024 04:56:52 +0100 Subject: bpf: fix argument type in bpf_loop documentation The `index` argument to bpf_loop() is treated as a u64. This led to a subtle verifier denial where clang cloned the argument in another register[1]. 
[1] https://github.com/systemd/systemd/pull/34650#issuecomment-2401092895 Signed-off-by: Matteo Croce Link: https://lore.kernel.org/r/20241010035652.17830-1-technoboy85@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7d9b38ffd220..cfc62e0776bf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9917,7 +9917,7 @@ static int set_loop_callback_state(struct bpf_verifier_env *env, { /* bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, * u64 flags); - * callback_fn(u32 index, void *callback_ctx); + * callback_fn(u64 index, void *callback_ctx); */ callee->regs[BPF_REG_1].type = SCALAR_VALUE; callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; -- cgit From 4971266e1595f76be3f844c834c1f9357a97dbde Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 10 Oct 2024 16:25:03 -0700 Subject: bpf: Add kmem_cache iterator The new "kmem_cache" iterator will traverse the list of slab caches and call attached BPF programs for each entry. It should check the argument (ctx.s) if it's NULL before using it. Now the iteration grabs the slab_mutex only if it traverse the list and releases the mutex when it runs the BPF program. The kmem_cache entry is protected by a refcount during the execution. Signed-off-by: Namhyung Kim Acked-by: Vlastimil Babka #slab Link: https://lore.kernel.org/r/20241010232505.1339892-2-namhyung@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/Makefile | 1 + kernel/bpf/kmem_cache_iter.c | 175 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 176 insertions(+) create mode 100644 kernel/bpf/kmem_cache_iter.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 9b9c151b5c82..105328f0b9c0 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -52,3 +52,4 @@ obj-$(CONFIG_BPF_PRELOAD) += preload/ obj-$(CONFIG_BPF_SYSCALL) += relo_core.o obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o +obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o diff --git a/kernel/bpf/kmem_cache_iter.c b/kernel/bpf/kmem_cache_iter.c new file mode 100644 index 000000000000..ebc101d7da51 --- /dev/null +++ b/kernel/bpf/kmem_cache_iter.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024 Google */ +#include +#include +#include +#include +#include + +#include "../../mm/slab.h" /* kmem_cache, slab_caches and slab_mutex */ + +struct bpf_iter__kmem_cache { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct kmem_cache *, s); +}; + +static void *kmem_cache_iter_seq_start(struct seq_file *seq, loff_t *pos) +{ + loff_t cnt = 0; + bool found = false; + struct kmem_cache *s; + + mutex_lock(&slab_mutex); + + /* Find an entry at the given position in the slab_caches list instead + * of keeping a reference (of the last visited entry, if any) out of + * slab_mutex. It might miss something if one is deleted in the middle + * while it releases the lock. But it should be rare and there's not + * much we can do about it. + */ + list_for_each_entry(s, &slab_caches, list) { + if (cnt == *pos) { + /* Make sure this entry remains in the list by getting + * a new reference count. Note that boot_cache entries + * have a negative refcount, so don't touch them. 
+ */ + if (s->refcount > 0) + s->refcount++; + found = true; + break; + } + cnt++; + } + mutex_unlock(&slab_mutex); + + if (!found) + return NULL; + + return s; +} + +static void kmem_cache_iter_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_iter__kmem_cache ctx = { + .meta = &meta, + .s = v, + }; + struct bpf_prog *prog; + bool destroy = false; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog && !ctx.s) + bpf_iter_run_prog(prog, &ctx); + + if (ctx.s == NULL) + return; + + mutex_lock(&slab_mutex); + + /* Skip kmem_cache_destroy() for active entries */ + if (ctx.s->refcount > 1) + ctx.s->refcount--; + else if (ctx.s->refcount == 1) + destroy = true; + + mutex_unlock(&slab_mutex); + + if (destroy) + kmem_cache_destroy(ctx.s); +} + +static void *kmem_cache_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct kmem_cache *s = v; + struct kmem_cache *next = NULL; + bool destroy = false; + + ++*pos; + + mutex_lock(&slab_mutex); + + if (list_last_entry(&slab_caches, struct kmem_cache, list) != s) { + next = list_next_entry(s, list); + + WARN_ON_ONCE(next->refcount == 0); + + /* boot_caches have negative refcount, don't touch them */ + if (next->refcount > 0) + next->refcount++; + } + + /* Skip kmem_cache_destroy() for active entries */ + if (s->refcount > 1) + s->refcount--; + else if (s->refcount == 1) + destroy = true; + + mutex_unlock(&slab_mutex); + + if (destroy) + kmem_cache_destroy(s); + + return next; +} + +static int kmem_cache_iter_seq_show(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_iter__kmem_cache ctx = { + .meta = &meta, + .s = v, + }; + struct bpf_prog *prog; + int ret = 0; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, false); + if (prog) + ret = bpf_iter_run_prog(prog, &ctx); + + return ret; +} + +static const struct seq_operations kmem_cache_iter_seq_ops = { + .start = kmem_cache_iter_seq_start, + .next = kmem_cache_iter_seq_next, + .stop = kmem_cache_iter_seq_stop, + .show = kmem_cache_iter_seq_show, +}; + +BTF_ID_LIST_GLOBAL_SINGLE(bpf_kmem_cache_btf_id, struct, kmem_cache) + +static const struct bpf_iter_seq_info kmem_cache_iter_seq_info = { + .seq_ops = &kmem_cache_iter_seq_ops, +}; + +static void bpf_iter_kmem_cache_show_fdinfo(const struct bpf_iter_aux_info *aux, + struct seq_file *seq) +{ + seq_puts(seq, "kmem_cache iter\n"); +} + +DEFINE_BPF_ITER_FUNC(kmem_cache, struct bpf_iter_meta *meta, + struct kmem_cache *s) + +static struct bpf_iter_reg bpf_kmem_cache_reg_info = { + .target = "kmem_cache", + .feature = BPF_ITER_RESCHED, + .show_fdinfo = bpf_iter_kmem_cache_show_fdinfo, + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__kmem_cache, s), + PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, + }, + .seq_info = &kmem_cache_iter_seq_info, +}; + +static int __init bpf_kmem_cache_iter_init(void) +{ + bpf_kmem_cache_reg_info.ctx_arg_info[0].btf_id = bpf_kmem_cache_btf_id[0]; + return bpf_iter_reg_target(&bpf_kmem_cache_reg_info); +} + +late_initcall(bpf_kmem_cache_iter_init); -- cgit From a992d7a3979120fbd7c13435d27b3da8d9ed095a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 10 Oct 2024 16:25:04 -0700 Subject: mm/bpf: Add bpf_get_kmem_cache() kfunc The bpf_get_kmem_cache() is to get a slab cache information from a virtual address like virt_to_cache(). If the address is a pointer to a slab object, it'd return a valid kmem_cache pointer, otherwise NULL is returned. 
It doesn't grab a reference count of the kmem_cache so the caller is responsible for managing the access. The returned pointer is marked as PTR_UNTRUSTED. The intended use case for now is to symbolize locks in slab objects from the lock contention tracepoints. Suggested-by: Vlastimil Babka Acked-by: Roman Gushchin (mm/*) Acked-by: Vlastimil Babka #mm/slab Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20241010232505.1339892-3-namhyung@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 1 + kernel/bpf/verifier.c | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 4053f279ed4c..aaaefefdf692 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3090,6 +3090,7 @@ BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_get_kmem_cache) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cfc62e0776bf..f514247ba8ba 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11259,6 +11259,7 @@ enum special_kfunc_type { KF_bpf_preempt_enable, KF_bpf_iter_css_task_new, KF_bpf_session_cookie, + KF_bpf_get_kmem_cache, }; BTF_SET_START(special_kfunc_set) @@ -11324,6 +11325,7 @@ BTF_ID(func, bpf_session_cookie) #else BTF_ID_UNUSED #endif +BTF_ID(func, bpf_get_kmem_cache) static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -12834,6 +12836,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].type = PTR_TO_BTF_ID; regs[BPF_REG_0].btf_id = ptr_type_id; + if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache]) + regs[BPF_REG_0].type |= PTR_UNTRUSTED; + if (is_iter_next_kfunc(&meta)) { struct bpf_reg_state *cur_iter; -- cgit From 675c3596ff32c040d1dd2e28dd57e83e634b9f60 Mon Sep 17 00:00:00 2001 From: Juntong Deng Date: Mon, 14 Oct 2024 10:21:08 +0100 Subject: bpf: Add bpf_task_from_vpid() kfunc bpf_task_from_pid() that currently exists looks up the struct task_struct corresponding to the pid in the root pid namespace (init_pid_ns). This patch adds bpf_task_from_vpid() which looks up the struct task_struct corresponding to vpid in the pid namespace of the current process. This is useful for getting information about other processes in the same pid namespace. Signed-off-by: Juntong Deng Link: https://lore.kernel.org/r/AM6PR03MB5848E50DA58F79CDE65433C399442@AM6PR03MB5848.eurprd03.prod.outlook.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index aaaefefdf692..073e6f04f4d7 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2521,6 +2521,25 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid) return p; } +/** + * bpf_task_from_vpid - Find a struct task_struct from its vpid by looking it up + * in the pid namespace of the current task. If a task is returned, it must + * either be stored in a map, or released with bpf_task_release(). + * @vpid: The vpid of the task being looked up. 
+ */ +__bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid) +{ + struct task_struct *p; + + rcu_read_lock(); + p = find_task_by_vpid(vpid); + if (p) + p = bpf_task_acquire(p); + rcu_read_unlock(); + + return p; +} + /** * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data. * @p: The dynptr whose data slice to retrieve @@ -3034,6 +3053,7 @@ BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU) BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL) #endif BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_throw) BTF_KFUNCS_END(generic_btf_ids) -- cgit From d6083f040d5d8f8d748462c77e90547097df936e Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Tue, 15 Oct 2024 23:02:06 +0800 Subject: bpf: Prevent tailcall infinite loop caused by freplace There is a potential infinite loop issue that can occur when using a combination of tail calls and freplace. In an upcoming selftest, the attach target for entry_freplace of tailcall_freplace.c is subprog_tc of tc_bpf2bpf.c, while the tail call in entry_freplace leads to entry_tc. This results in an infinite loop: entry_tc -> subprog_tc -> entry_freplace --tailcall-> entry_tc. The problem arises because the tail_call_cnt in entry_freplace resets to zero each time entry_freplace is executed, causing the tail call mechanism to never terminate, eventually leading to a kernel panic. To fix this issue, the solution is twofold: 1. Prevent updating a program extended by an freplace program to a prog_array map. 2. Prevent extending a program that is already part of a prog_array map with an freplace program. This ensures that: * If a program or its subprogram has been extended by an freplace program, it can no longer be updated to a prog_array map. * If a program has been added to a prog_array map, neither it nor its subprograms can be extended by an freplace program. Moreover, an extension program should not be tailcalled. As such, return -EINVAL if the program has a type of BPF_PROG_TYPE_EXT when adding it to a prog_array map. Additionally, fix a minor code style issue by replacing eight spaces with a tab for proper formatting. Reviewed-by: Eduard Zingerman Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20241015150207.70264-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 26 ++++++++++++++++++++++++-- kernel/bpf/core.c | 1 + kernel/bpf/syscall.c | 7 ++++--- kernel/bpf/trampoline.c | 47 +++++++++++++++++++++++++++++++++++++++-------- 4 files changed, 68 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 79660e3fca4c..6cdbb4c33d31 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -947,22 +947,44 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { struct bpf_prog *prog = bpf_prog_get(fd); + bool is_extended; if (IS_ERR(prog)) return prog; - if (!bpf_prog_map_compatible(map, prog)) { + if (prog->type == BPF_PROG_TYPE_EXT || + !bpf_prog_map_compatible(map, prog)) { bpf_prog_put(prog); return ERR_PTR(-EINVAL); } + mutex_lock(&prog->aux->ext_mutex); + is_extended = prog->aux->is_extended; + if (!is_extended) + prog->aux->prog_array_member_cnt++; + mutex_unlock(&prog->aux->ext_mutex); + if (is_extended) { + /* Extended prog can not be tail callee. 
It's to prevent a + * potential infinite loop like: + * tail callee prog entry -> tail callee prog subprog -> + * freplace prog entry --tailcall-> tail callee prog entry. + */ + bpf_prog_put(prog); + return ERR_PTR(-EBUSY); + } + return prog; } static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { + struct bpf_prog *prog = ptr; + + mutex_lock(&prog->aux->ext_mutex); + prog->aux->prog_array_member_cnt--; + mutex_unlock(&prog->aux->ext_mutex); /* bpf_prog is freed after one RCU or tasks trace grace period */ - bpf_prog_put(ptr); + bpf_prog_put(prog); } static u32 prog_fd_array_sys_lookup_elem(void *ptr) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5e77c58e0601..233ea78f8f1b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -131,6 +131,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode); #endif mutex_init(&fp->aux->used_maps_mutex); + mutex_init(&fp->aux->ext_mutex); mutex_init(&fp->aux->dst_mutex); return fp; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a8f1808a1ca5..4d04d4d9c1f3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3214,7 +3214,8 @@ static void bpf_tracing_link_release(struct bpf_link *link) container_of(link, struct bpf_tracing_link, link.link); WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link, - tr_link->trampoline)); + tr_link->trampoline, + tr_link->tgt_prog)); bpf_trampoline_put(tr_link->trampoline); @@ -3354,7 +3355,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, * in prog->aux * * - if prog->aux->dst_trampoline is NULL, the program has already been - * attached to a target and its initial target was cleared (below) + * attached to a target and its initial target was cleared (below) * * - if tgt_prog != NULL, the caller specified tgt_prog_fd + * target_btf_id using the link_create API. @@ -3429,7 +3430,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, if (err) goto out_unlock; - err = bpf_trampoline_link_prog(&link->link, tr); + err = bpf_trampoline_link_prog(&link->link, tr, tgt_prog); if (err) { bpf_link_cleanup(&link_primer); link = NULL; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index f8302a5ca400..9f36c049f4c2 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -523,7 +523,27 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) } } -static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +static int bpf_freplace_check_tgt_prog(struct bpf_prog *tgt_prog) +{ + struct bpf_prog_aux *aux = tgt_prog->aux; + + guard(mutex)(&aux->ext_mutex); + if (aux->prog_array_member_cnt) + /* Program extensions can not extend target prog when the target + * prog has been updated to any prog_array map as tail callee. + * It's to prevent a potential infinite loop like: + * tgt prog entry -> tgt prog subprog -> freplace prog entry + * --tailcall-> tgt prog entry. + */ + return -EBUSY; + + aux->is_extended = true; + return 0; +} + +static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, + struct bpf_trampoline *tr, + struct bpf_prog *tgt_prog) { enum bpf_tramp_prog_type kind; struct bpf_tramp_link *link_exiting; @@ -544,6 +564,9 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_tr /* Cannot attach extension if fentry/fexit are in use. 
*/ if (cnt) return -EBUSY; + err = bpf_freplace_check_tgt_prog(tgt_prog); + if (err) + return err; tr->extension_prog = link->link.prog; return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, link->link.prog->bpf_func); @@ -570,17 +593,21 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_tr return err; } -int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +int bpf_trampoline_link_prog(struct bpf_tramp_link *link, + struct bpf_trampoline *tr, + struct bpf_prog *tgt_prog) { int err; mutex_lock(&tr->mutex); - err = __bpf_trampoline_link_prog(link, tr); + err = __bpf_trampoline_link_prog(link, tr, tgt_prog); mutex_unlock(&tr->mutex); return err; } -static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, + struct bpf_trampoline *tr, + struct bpf_prog *tgt_prog) { enum bpf_tramp_prog_type kind; int err; @@ -591,6 +618,8 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_ err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, tr->extension_prog->bpf_func, NULL); tr->extension_prog = NULL; + guard(mutex)(&tgt_prog->aux->ext_mutex); + tgt_prog->aux->is_extended = false; return err; } hlist_del_init(&link->tramp_hlist); @@ -599,12 +628,14 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_ } /* bpf_trampoline_unlink_prog() should never fail. */ -int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, + struct bpf_trampoline *tr, + struct bpf_prog *tgt_prog) { int err; mutex_lock(&tr->mutex); - err = __bpf_trampoline_unlink_prog(link, tr); + err = __bpf_trampoline_unlink_prog(link, tr, tgt_prog); mutex_unlock(&tr->mutex); return err; } @@ -619,7 +650,7 @@ static void bpf_shim_tramp_link_release(struct bpf_link *link) if (!shim_link->trampoline) return; - WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline)); + WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline, NULL)); bpf_trampoline_put(shim_link->trampoline); } @@ -733,7 +764,7 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, goto err; } - err = __bpf_trampoline_link_prog(&shim_link->link, tr); + err = __bpf_trampoline_link_prog(&shim_link->link, tr, NULL); if (err) goto err; -- cgit From 6280cf718db0c557b5fe44e2d2e8ad8e832696a7 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Wed, 16 Oct 2024 08:41:35 +0000 Subject: bpf: Implement bpf_send_signal_task() kfunc Implement bpf_send_signal_task kfunc that is similar to bpf_send_signal_thread and bpf_send_signal helpers but can be used to send signals to other threads and processes. It also supports sending a cookie with the signal similar to sigqueue(). If the receiving process establishes a handler for the signal using the SA_SIGINFO flag to sigaction(), then it can obtain this cookie via the si_value field of the siginfo_t structure passed as the second argument to the handler. 
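To make the intended usage concrete, here is a minimal, hypothetical BPF-side sketch (not part of this patch): it pairs the bpf_task_from_vpid() kfunc added earlier in this series with the new bpf_send_signal_task() kfunc. The attach point, target vpid, signal number and cookie value are illustrative assumptions.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

/* kfunc declarations as they would appear on the BPF side */
struct task_struct *bpf_task_from_vpid(s32 vpid) __ksym;
void bpf_task_release(struct task_struct *p) __ksym;
int bpf_send_signal_task(struct task_struct *task, int sig, enum pid_type type,
			 u64 value) __ksym;

SEC("tp_btf/sys_enter")
int send_cookie(void *ctx)
{
	struct task_struct *p;

	p = bpf_task_from_vpid(1234);	/* assumed target vpid */
	if (!p)
		return 0;

	/* SIGUSR1 (10 on x86) with a cookie the SA_SIGINFO handler reads
	 * from si_value; the acquired task satisfies KF_TRUSTED_ARGS.
	 */
	bpf_send_signal_task(p, 10, PIDTYPE_TGID, 0xcafe);

	bpf_task_release(p);
	return 0;
}
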
Signed-off-by: Puranjay Mohan Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241016084136.10305-2-puranjay@kernel.org --- kernel/bpf/helpers.c | 1 + kernel/trace/bpf_trace.c | 53 ++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 46 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 073e6f04f4d7..5c3fdb29c1b1 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3055,6 +3055,7 @@ BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_throw) +BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS) BTF_KFUNCS_END(generic_btf_ids) static const struct btf_kfunc_id_set generic_kfunc_set = { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a582cd25ca87..e7370a321126 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -802,6 +802,8 @@ struct send_signal_irq_work { struct task_struct *task; u32 sig; enum pid_type type; + bool has_siginfo; + struct kernel_siginfo info; }; static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work); @@ -809,27 +811,46 @@ static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work); static void do_bpf_send_signal(struct irq_work *entry) { struct send_signal_irq_work *work; + struct kernel_siginfo *siginfo; work = container_of(entry, struct send_signal_irq_work, irq_work); - group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type); + siginfo = work->has_siginfo ? &work->info : SEND_SIG_PRIV; + + group_send_sig_info(work->sig, siginfo, work->task, work->type); put_task_struct(work->task); } -static int bpf_send_signal_common(u32 sig, enum pid_type type) +static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struct *task, u64 value) { struct send_signal_irq_work *work = NULL; + struct kernel_siginfo info; + struct kernel_siginfo *siginfo; + + if (!task) { + task = current; + siginfo = SEND_SIG_PRIV; + } else { + clear_siginfo(&info); + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_KERNEL; + info.si_pid = 0; + info.si_uid = 0; + info.si_value.sival_ptr = (void *)(unsigned long)value; + siginfo = &info; + } /* Similar to bpf_probe_write_user, task needs to be * in a sound condition and kernel memory access be * permitted in order to send signal to the current * task. */ - if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING))) + if (unlikely(task->flags & (PF_KTHREAD | PF_EXITING))) return -EPERM; if (unlikely(!nmi_uaccess_okay())) return -EPERM; /* Task should not be pid=1 to avoid kernel panic. */ - if (unlikely(is_global_init(current))) + if (unlikely(is_global_init(task))) return -EPERM; if (irqs_disabled()) { @@ -847,19 +868,22 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type) * to the irq_work. The current task may change when queued * irq works get executed. 
*/ - work->task = get_task_struct(current); + work->task = get_task_struct(task); + work->has_siginfo = siginfo == &info; + if (work->has_siginfo) + copy_siginfo(&work->info, &info); work->sig = sig; work->type = type; irq_work_queue(&work->irq_work); return 0; } - return group_send_sig_info(sig, SEND_SIG_PRIV, current, type); + return group_send_sig_info(sig, siginfo, task, type); } BPF_CALL_1(bpf_send_signal, u32, sig) { - return bpf_send_signal_common(sig, PIDTYPE_TGID); + return bpf_send_signal_common(sig, PIDTYPE_TGID, NULL, 0); } static const struct bpf_func_proto bpf_send_signal_proto = { @@ -871,7 +895,7 @@ static const struct bpf_func_proto bpf_send_signal_proto = { BPF_CALL_1(bpf_send_signal_thread, u32, sig) { - return bpf_send_signal_common(sig, PIDTYPE_PID); + return bpf_send_signal_common(sig, PIDTYPE_PID, NULL, 0); } static const struct bpf_func_proto bpf_send_signal_thread_proto = { @@ -3484,3 +3508,16 @@ static int __init bpf_kprobe_multi_kfuncs_init(void) } late_initcall(bpf_kprobe_multi_kfuncs_init); + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid_type type, + u64 value) +{ + if (type != PIDTYPE_PID && type != PIDTYPE_TGID) + return -EINVAL; + + return bpf_send_signal_common(sig, type, task, value); +} + +__bpf_kfunc_end_defs(); -- cgit From 1cb80d9e93f861018fabe81a69ea0ded20f5a2d0 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 23 Oct 2024 16:47:48 -0700 Subject: bpf: Support __uptr type tag in BTF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces the "__uptr" type tag to BTF. It is to define a pointer pointing to the user space memory. This patch adds BTF logic to pass the "__uptr" type tag. btf_find_kptr() is reused for the "__uptr" tag. The "__uptr" will only be supported in the map_value of the task storage map. However, btf_parse_struct_meta() also uses btf_find_kptr() but it is not interested in "__uptr". This patch adds a "field_mask" argument to btf_find_kptr() which will return BTF_FIELD_IGNORE if the caller is not interested in a “__uptr” field. btf_parse_kptr() is also reused to parse the uptr. The btf_check_and_fixup_fields() is changed to do extra checks on the uptr to ensure that its struct size is not larger than PAGE_SIZE. It is not clear how a uptr pointing to a CO-RE supported kernel struct will be used, so it is also not allowed now. 
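On the program side, such a type tag is typically spelled the same way the existing __kptr tags are. A hedged sketch follows (not part of this kernel patch; the macro spelling mirrors the __kptr convention and the struct names are made up, only the "uptr" tag string comes from the BTF code below):

#define __uptr __attribute__((btf_type_tag("uptr")))

/* user-space object that a task storage map_value will point to */
struct user_data {
	int value;
};

/* map_value carrying a pointer to user space memory */
struct value_type {
	struct user_data __uptr *udata;
};
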
Signed-off-by: Kui-Feng Lee Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20241023234759.860539-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 34 +++++++++++++++++++++++++++++----- kernel/bpf/syscall.c | 2 ++ 2 files changed, 31 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 13dd1fa1d1b9..76cafff2d99c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3334,7 +3334,7 @@ static int btf_find_struct(const struct btf *btf, const struct btf_type *t, } static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, - u32 off, int sz, struct btf_field_info *info) + u32 off, int sz, struct btf_field_info *info, u32 field_mask) { enum btf_field_type type; u32 res_id; @@ -3358,9 +3358,14 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, type = BPF_KPTR_REF; else if (!strcmp("percpu_kptr", __btf_name_by_offset(btf, t->name_off))) type = BPF_KPTR_PERCPU; + else if (!strcmp("uptr", __btf_name_by_offset(btf, t->name_off))) + type = BPF_UPTR; else return -EINVAL; + if (!(type & field_mask)) + return BTF_FIELD_IGNORE; + /* Get the base type */ t = btf_type_skip_modifiers(btf, t->type, &res_id); /* Only pointer to struct is allowed */ @@ -3502,7 +3507,7 @@ static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_ field_mask_test_name(BPF_REFCOUNT, "bpf_refcount"); /* Only return BPF_KPTR when all other types with matchable names fail */ - if (field_mask & BPF_KPTR && !__btf_type_is_struct(var_type)) { + if (field_mask & (BPF_KPTR | BPF_UPTR) && !__btf_type_is_struct(var_type)) { type = BPF_KPTR_REF; goto end; } @@ -3535,6 +3540,7 @@ static int btf_repeat_fields(struct btf_field_info *info, case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: case BPF_LIST_HEAD: case BPF_RB_ROOT: break; @@ -3661,8 +3667,9 @@ static int btf_find_field_one(const struct btf *btf, case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: ret = btf_find_kptr(btf, var_type, off, sz, - info_cnt ? &info[0] : &tmp); + info_cnt ? &info[0] : &tmp, field_mask); if (ret < 0) return ret; break; @@ -3985,6 +3992,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]); if (ret < 0) goto end; @@ -4044,12 +4052,28 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) * Hence we only need to ensure that bpf_{list_head,rb_root} ownership * does not form cycles. 
*/ - if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_GRAPH_ROOT)) + if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & (BPF_GRAPH_ROOT | BPF_UPTR))) return 0; for (i = 0; i < rec->cnt; i++) { struct btf_struct_meta *meta; + const struct btf_type *t; u32 btf_id; + if (rec->fields[i].type == BPF_UPTR) { + /* The uptr only supports pinning one page and cannot + * point to a kernel struct + */ + if (btf_is_kernel(rec->fields[i].kptr.btf)) + return -EINVAL; + t = btf_type_by_id(rec->fields[i].kptr.btf, + rec->fields[i].kptr.btf_id); + if (!t->size) + return -EINVAL; + if (t->size > PAGE_SIZE) + return -E2BIG; + continue; + } + if (!(rec->fields[i].type & BPF_GRAPH_ROOT)) continue; btf_id = rec->fields[i].graph_root.value_btf_id; @@ -5560,7 +5584,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) goto free_aof; } - ret = btf_find_kptr(btf, t, 0, 0, &tmp); + ret = btf_find_kptr(btf, t, 0, 0, &tmp, BPF_KPTR); if (ret != BTF_FIELD_FOUND) continue; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4d04d4d9c1f3..2d2935d9c096 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -548,6 +548,7 @@ void btf_record_free(struct btf_record *rec) case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: if (rec->fields[i].kptr.module) module_put(rec->fields[i].kptr.module); if (btf_is_kernel(rec->fields[i].kptr.btf)) @@ -597,6 +598,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: if (btf_is_kernel(fields[i].kptr.btf)) btf_get(fields[i].kptr.btf); if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) { -- cgit From 99dde42e37497b3062516b1db7231f9dec744a00 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 23 Oct 2024 16:47:49 -0700 Subject: bpf: Handle BPF_UPTR in verifier This patch adds BPF_UPTR support to the verifier. Note that only the map_value will support the "__uptr" type tag. This patch enforces that only BPF_LDX is allowed on the value of an uptr. After BPF_LDX, it will mark the dst_reg as PTR_TO_MEM | PTR_MAYBE_NULL with size deduced from the field.kptr.btf_id. This makes the memory pointed to by the dst_reg readable and writable as scalars. There is a redundant "val_reg = reg_state(env, value_regno);" statement in the check_map_kptr_access(). This patch takes the opportunity to remove it as well. 
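From a program's perspective, the rules above amount to the following hedged sketch (it reuses the illustrative struct value_type/user_data declarations from the earlier sketch and the datamap definition shown later in this series; the attach point is an assumption): the load of the uptr must be NULL-checked, after which the pointed-to user memory is plain readable/writable memory, while a store to the uptr field itself is rejected.

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct value_type);
} datamap SEC(".maps");

SEC("tp_btf/sys_enter")
int touch_uptr(void *ctx)
{
	struct task_struct *task = bpf_get_current_task_btf();
	struct value_type *v;
	struct user_data *ud;

	v = bpf_task_storage_get(&datamap, task, 0, 0);
	if (!v)
		return 0;

	ud = v->udata;		/* BPF_LDX of the uptr field */
	if (!ud)		/* PTR_TO_MEM | PTR_MAYBE_NULL: check first */
		return 0;

	ud->value += 1;		/* read/write as ordinary memory of t->size bytes */
	return 0;		/* v->udata = ...; would be rejected: "store to uptr disallowed" */
}
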
Signed-off-by: Kui-Feng Lee Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20241023234759.860539-3-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f514247ba8ba..1bd0c3f41f2f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5485,6 +5485,22 @@ static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr return ret; } +static int mark_uptr_ld_reg(struct bpf_verifier_env *env, u32 regno, + struct btf_field *field) +{ + struct bpf_reg_state *reg; + const struct btf_type *t; + + t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id); + mark_reg_known_zero(env, cur_regs(env), regno); + reg = reg_state(env, regno); + reg->type = PTR_TO_MEM | PTR_MAYBE_NULL; + reg->mem_size = t->size; + reg->id = ++env->id_gen; + + return 0; +} + static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, int value_regno, int insn_idx, struct btf_field *kptr_field) @@ -5513,9 +5529,15 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, verbose(env, "store to referenced kptr disallowed\n"); return -EACCES; } + if (class != BPF_LDX && kptr_field->type == BPF_UPTR) { + verbose(env, "store to uptr disallowed\n"); + return -EACCES; + } if (class == BPF_LDX) { - val_reg = reg_state(env, value_regno); + if (kptr_field->type == BPF_UPTR) + return mark_uptr_ld_reg(env, value_regno, kptr_field); + /* We can simply mark the value_regno receiving the pointer * value from map as PTR_TO_BTF_ID, with the correct type. */ @@ -5573,21 +5595,26 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: if (src != ACCESS_DIRECT) { - verbose(env, "kptr cannot be accessed indirectly by helper\n"); + verbose(env, "%s cannot be accessed indirectly by helper\n", + btf_field_type_name(field->type)); return -EACCES; } if (!tnum_is_const(reg->var_off)) { - verbose(env, "kptr access cannot have variable offset\n"); + verbose(env, "%s access cannot have variable offset\n", + btf_field_type_name(field->type)); return -EACCES; } if (p != off + reg->var_off.value) { - verbose(env, "kptr access misaligned expected=%u off=%llu\n", + verbose(env, "%s access misaligned expected=%u off=%llu\n", + btf_field_type_name(field->type), p, off + reg->var_off.value); return -EACCES; } if (size != bpf_size_to_bytes(BPF_DW)) { - verbose(env, "kptr access size must be BPF_DW\n"); + verbose(env, "%s access size must be BPF_DW\n", + btf_field_type_name(field->type)); return -EACCES; } break; @@ -6953,7 +6980,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return err; if (tnum_is_const(reg->var_off)) kptr_field = btf_record_find(reg->map_ptr->record, - off + reg->var_off.value, BPF_KPTR); + off + reg->var_off.value, BPF_KPTR | BPF_UPTR); if (kptr_field) { err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field); } else if (t == BPF_READ && value_regno >= 0) { -- cgit From b9a5a07aeaa2a903fb1306eb422880b2fa5f937f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 23 Oct 2024 16:47:50 -0700 Subject: bpf: Add "bool swap_uptrs" arg to bpf_local_storage_update() and bpf_selem_alloc() In a later patch, the task local storage will only accept uptr from the syscall update_elem and will not accept uptr from the bpf 
prog. The reason is the bpf prog does not have a way to provide a valid user space address. bpf_local_storage_update() and bpf_selem_alloc() are used by both bpf prog bpf_task_storage_get(BPF_LOCAL_STORAGE_GET_F_CREATE) and bpf syscall update_elem. "bool swap_uptrs" arg is added to bpf_local_storage_update() and bpf_selem_alloc() to tell if it is called by the bpf prog or by the bpf syscall. When swap_uptrs==true, it is called by the syscall. The arg is named (swap_)uptrs because the later patch will swap the uptrs between the newly allocated selem and the user space provided map_value. It will make error handling easier in case map->ops->map_update_elem() fails and the caller can decide if it needs to unpin the uptr in the user space provided map_value or the bpf_local_storage_update() has already taken the uptr ownership and will take care of unpinning it also. Only swap_uptrs==false is passed now. The logic to handle the true case will be added in a later patch. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20241023234759.860539-4-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_cgrp_storage.c | 4 ++-- kernel/bpf/bpf_inode_storage.c | 4 ++-- kernel/bpf/bpf_local_storage.c | 8 ++++---- kernel/bpf/bpf_task_storage.c | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 28efd0a3f220..20f05de92e9c 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -107,7 +107,7 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, bpf_cgrp_storage_lock(); sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, - value, map_flags, GFP_ATOMIC); + value, map_flags, false, GFP_ATOMIC); bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return PTR_ERR_OR_ZERO(sdata); @@ -181,7 +181,7 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, if (!percpu_ref_is_dying(&cgroup->self.refcnt) && (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, - value, BPF_NOEXIST, gfp_flags); + value, BPF_NOEXIST, false, gfp_flags); unlock: bpf_cgrp_storage_unlock(); diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 29da6d3838f6..44ccebc745e5 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -100,7 +100,7 @@ static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, sdata = bpf_local_storage_update(file_inode(fd_file(f)), (struct bpf_local_storage_map *)map, - value, map_flags, GFP_ATOMIC); + value, map_flags, false, GFP_ATOMIC); return PTR_ERR_OR_ZERO(sdata); } @@ -154,7 +154,7 @@ BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { sdata = bpf_local_storage_update( inode, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST, gfp_flags); + BPF_NOEXIST, false, gfp_flags); return IS_ERR(sdata) ? 
(unsigned long)NULL : (unsigned long)sdata->data; } diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index c938dea5ddbf..1cf772cb26eb 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -73,7 +73,7 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, - void *value, bool charge_mem, gfp_t gfp_flags) + void *value, bool charge_mem, bool swap_uptrs, gfp_t gfp_flags) { struct bpf_local_storage_elem *selem; @@ -524,7 +524,7 @@ uncharge: */ struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, - void *value, u64 map_flags, gfp_t gfp_flags) + void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags) { struct bpf_local_storage_data *old_sdata = NULL; struct bpf_local_storage_elem *alloc_selem, *selem = NULL; @@ -550,7 +550,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (err) return ERR_PTR(err); - selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); + selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags); if (!selem) return ERR_PTR(-ENOMEM); @@ -584,7 +584,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, /* A lookup has just been done before and concluded a new selem is * needed. The chance of an unnecessary alloc is unlikely. */ - alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); + alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags); if (!alloc_selem) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index adf6dfe0ba68..45dc3ca334d3 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -147,7 +147,7 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, bpf_task_storage_lock(); sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, map_flags, - GFP_ATOMIC); + false, GFP_ATOMIC); bpf_task_storage_unlock(); err = PTR_ERR_OR_ZERO(sdata); @@ -219,7 +219,7 @@ static void *__bpf_task_storage_get(struct bpf_map *map, (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) { sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST, gfp_flags); + BPF_NOEXIST, false, gfp_flags); return IS_ERR(sdata) ? NULL : sdata->data; } -- cgit From 5bd5bab76669b1e1551f03f5fcbc165f3fa8d269 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 23 Oct 2024 16:47:51 -0700 Subject: bpf: Postpone bpf_selem_free() in bpf_selem_unlink_storage_nolock() In a later patch, bpf_selem_free() will call unpin_user_page() through bpf_obj_free_fields(). unpin_user_page() may take spin_lock. However, some bpf_selem_free() call paths have held a raw_spin_lock. Like this: raw_spin_lock_irqsave() bpf_selem_unlink_storage_nolock() bpf_selem_free() unpin_user_page() spin_lock() To avoid spinlock nested in raw_spinlock, bpf_selem_free() should be done after releasing the raw_spinlock. The "bool reuse_now" arg is replaced with "struct hlist_head *free_selem_list" in bpf_selem_unlink_storage_nolock(). The bpf_selem_unlink_storage_nolock() will append the to-be-free selem at the free_selem_list. The caller of bpf_selem_unlink_storage_nolock() will need to call the new bpf_selem_free_list(free_selem_list, reuse_now) to free the selem after releasing the raw_spinlock. 
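As the next paragraph describes, the element gains a free_node that shares its storage with the rcu_head. The struct-side change lives in include/linux/bpf_local_storage.h, outside the kernel/ diffstat shown here; a hedged, abridged sketch of the implied layout (field order and the remaining members are assumptions):

struct bpf_local_storage_elem {
	struct hlist_node map_node;	/* linked into the bpf_local_storage_map bucket */
	struct hlist_node snode;	/* linked into the owner's bpf_local_storage */
	union {
		struct rcu_head rcu;		/* used by the RCU free paths */
		struct hlist_node free_node;	/* only used once snode is unlinked */
	};
	/* ... */
	struct bpf_local_storage_data sdata;
};
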
Note that the selem->snode cannot be reused for linking to the free_selem_list because the selem->snode is protected by the raw_spinlock that we want to avoid holding. A new "struct hlist_node free_node;" is union-ized with the rcu_head. Only the first one successfully hlist_del_init_rcu(&selem->snode) will be able to use the free_node. After succeeding hlist_del_init_rcu(&selem->snode), the free_node and rcu_head usage is serialized such that they can share the 16 bytes in a union. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20241023234759.860539-5-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_local_storage.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 1cf772cb26eb..09a67dff2336 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -246,13 +246,30 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, } } +static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) +{ + struct bpf_local_storage_elem *selem; + struct bpf_local_storage_map *smap; + struct hlist_node *n; + + /* The "_safe" iteration is needed. + * The loop is not removing the selem from the list + * but bpf_selem_free will use the selem->rcu_head + * which is union-ized with the selem->free_node. + */ + hlist_for_each_entry_safe(selem, n, list, free_node) { + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); + bpf_selem_free(selem, smap, reuse_now); + } +} + /* local_storage->lock must be held and selem->local_storage == local_storage. * The caller must ensure selem->smap is still valid to be * dereferenced for its smap->elem_size and smap->cache_idx. 
*/ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem, - bool uncharge_mem, bool reuse_now) + bool uncharge_mem, struct hlist_head *free_selem_list) { struct bpf_local_storage_map *smap; bool free_local_storage; @@ -296,7 +313,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor SDATA(selem)) RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); - bpf_selem_free(selem, smap, reuse_now); + hlist_add_head(&selem->free_node, free_selem_list); if (rcu_access_pointer(local_storage->smap) == smap) RCU_INIT_POINTER(local_storage->smap, NULL); @@ -345,6 +362,7 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, struct bpf_local_storage_map *storage_smap; struct bpf_local_storage *local_storage; bool bpf_ma, free_local_storage = false; + HLIST_HEAD(selem_free_list); unsigned long flags; if (unlikely(!selem_linked_to_storage_lockless(selem))) @@ -360,9 +378,11 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, raw_spin_lock_irqsave(&local_storage->lock, flags); if (likely(selem_linked_to_storage(selem))) free_local_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, true, reuse_now); + local_storage, selem, true, &selem_free_list); raw_spin_unlock_irqrestore(&local_storage->lock, flags); + bpf_selem_free_list(&selem_free_list, reuse_now); + if (free_local_storage) bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now); } @@ -529,6 +549,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, struct bpf_local_storage_data *old_sdata = NULL; struct bpf_local_storage_elem *alloc_selem, *selem = NULL; struct bpf_local_storage *local_storage; + HLIST_HEAD(old_selem_free_list); unsigned long flags; int err; @@ -624,11 +645,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (old_sdata) { bpf_selem_unlink_map(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), - true, false); + true, &old_selem_free_list); } unlock: raw_spin_unlock_irqrestore(&local_storage->lock, flags); + bpf_selem_free_list(&old_selem_free_list, false); if (alloc_selem) { mem_uncharge(smap, owner, smap->elem_size); bpf_selem_free(alloc_selem, smap, true); @@ -706,6 +728,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) struct bpf_local_storage_map *storage_smap; struct bpf_local_storage_elem *selem; bool bpf_ma, free_storage = false; + HLIST_HEAD(free_selem_list); struct hlist_node *n; unsigned long flags; @@ -734,10 +757,12 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) * of the loop will set the free_cgroup_storage to true. */ free_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, true, true); + local_storage, selem, true, &free_selem_list); } raw_spin_unlock_irqrestore(&local_storage->lock, flags); + bpf_selem_free_list(&free_selem_list, true); + if (free_storage) bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true); } -- cgit From 9bac675e6368b96f448289010caba4ee3320ab24 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 23 Oct 2024 16:47:52 -0700 Subject: bpf: Postpone bpf_obj_free_fields to the rcu callback A later patch will enable the uptr usage in the task_local_storage map. This will require the unpin_user_page() to be done after the rcu task trace gp for the cases that the uptr may still be used by a bpf prog. 
The bpf_obj_free_fields() will be the one doing unpin_user_page(), so this patch is to postpone calling bpf_obj_free_fields() to the rcu callback. The bpf_obj_free_fields() is only required to be done in the rcu callback when bpf->bpf_ma==true and reuse_now==false. bpf->bpf_ma==true case is because uptr will only be enabled in task storage which has already been moved to bpf_mem_alloc. The bpf->bpf_ma==false case can be supported in the future also if there is a need. reuse_now==false when the selem (aka storage) is deleted by bpf prog (bpf_task_storage_delete) or by syscall delete_elem(). In both cases, bpf_obj_free_fields() needs to wait for rcu gp. A few words on reuse_now==true. reuse_now==true when the storage's owner (i.e. the task_struct) is destructing or the map itself is doing map_free(). In both cases, no bpf prog should have a hold on the selem and its uptrs, so there is no need to postpone bpf_obj_free_fields(). reuse_now==true should be the common case for local storage usage where the storage exists throughout the lifetime of its owner (task_struct). The bpf_obj_free_fields() needs to use the map->record. Doing bpf_obj_free_fields() in a rcu callback will require the bpf_local_storage_map_free() to wait for rcu_barrier. An optimization could be only waiting for rcu_barrier when the map has uptr in its map_value. This will require either yet another rcu callback function or adding a bool in the selem to flag if the SDATA(selem)->smap is still valid. This patch chooses to keep it simple and wait for rcu_barrier for maps that use bpf_mem_alloc. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20241023234759.860539-6-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_local_storage.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 09a67dff2336..ca871be1c42d 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -209,8 +209,12 @@ static void __bpf_selem_free(struct bpf_local_storage_elem *selem, static void bpf_selem_free_rcu(struct rcu_head *rcu) { struct bpf_local_storage_elem *selem; + struct bpf_local_storage_map *smap; selem = container_of(rcu, struct bpf_local_storage_elem, rcu); + /* The bpf_local_storage_map_free will wait for rcu_barrier */ + smap = rcu_dereference_check(SDATA(selem)->smap, 1); + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); bpf_mem_cache_raw_free(selem); } @@ -226,16 +230,25 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, struct bpf_local_storage_map *smap, bool reuse_now) { - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); - if (!smap->bpf_ma) { + /* Only task storage has uptrs and task storage + * has moved to bpf_mem_alloc. Meaning smap->bpf_ma == true + * for task storage, so this bpf_obj_free_fields() won't unpin + * any uptr. + */ + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); __bpf_selem_free(selem, reuse_now); return; } - if (!reuse_now) { - call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu); - } else { + if (reuse_now) { + /* reuse_now == true only happens when the storage owner + * (e.g. task_struct) is being destructed or the map itself + * is being destructed (ie map_free). In both cases, + * no bpf prog can have a hold on the selem. It is + * safe to unpin the uptrs and free the selem now. 
+ */ + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); /* Instead of using the vanilla call_rcu(), * bpf_mem_cache_free will be able to reuse selem * immediately. @@ -243,7 +256,10 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, migrate_disable(); bpf_mem_cache_free(&smap->selem_ma, selem); migrate_enable(); + return; } + + call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu); } static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) @@ -908,6 +924,9 @@ void bpf_local_storage_map_free(struct bpf_map *map, synchronize_rcu(); if (smap->bpf_ma) { + rcu_barrier_tasks_trace(); + if (!rcu_trace_implies_rcu_gp()) + rcu_barrier(); bpf_mem_alloc_destroy(&smap->selem_ma); bpf_mem_alloc_destroy(&smap->storage_ma); } -- cgit From ba512b00e5efbf7e19cfb7fa9f66ce82669b7077 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 23 Oct 2024 16:47:53 -0700 Subject: bpf: Add uptr support in the map_value of the task local storage. This patch adds uptr support in the map_value of the task local storage. struct map_value { struct user_data __uptr *uptr; }; struct { __uint(type, BPF_MAP_TYPE_TASK_STORAGE); __uint(map_flags, BPF_F_NO_PREALLOC); __type(key, int); __type(value, struct value_type); } datamap SEC(".maps"); A new bpf_obj_pin_uptrs() is added to pin the user page and also stores the kernel address back to the uptr for the bpf prog to use later. It currently does not support the uptr pointing to a user struct across two pages. It also excludes PageHighMem support to keep it simple. As of now, the 32bit bpf jit is missing other more crucial bpf features. For example, many important bpf features depend on bpf kfunc now but so far only one arch (x86-32) supports it which was added by me as an example when kfunc was first introduced to bpf. The uptr can only be stored to the task local storage by the syscall update_elem. Meaning the uptr will not be considered if it is provided by the bpf prog through bpf_task_storage_get(BPF_LOCAL_STORAGE_GET_F_CREATE). This is enforced by only calling bpf_local_storage_update(swap_uptrs==true) in bpf_pid_task_storage_update_elem. Everywhere else will have swap_uptrs==false. This will pump down to bpf_selem_alloc(swap_uptrs==true). It is the only case that bpf_selem_alloc() will take the uptr value when updating the newly allocated selem. bpf_obj_swap_uptrs() is added to swap the uptr between the SDATA(selem)->data and the user provided map_value in "void *value". bpf_obj_swap_uptrs() makes the SDATA(selem)->data takes the ownership of the uptr and the user space provided map_value will have NULL in the uptr. The bpf_obj_unpin_uptrs() is called after map->ops->map_update_elem() returning error. If the map->ops->map_update_elem has reached a state that the local storage has taken the uptr ownership, the bpf_obj_unpin_uptrs() will be a no op because the uptr is NULL. A "__"bpf_obj_unpin_uptrs is added to make this error path unpin easier such that it does not have to check the map->record is NULL or not. BPF_F_LOCK is not supported when the map_value has uptr. This can be revisited later if there is a use case. A similar swap_uptrs idea can be considered. The final bit is to do unpin_user_page in the bpf_obj_free_fields(). The earlier patch has ensured that the bpf_obj_free_fields() has gone through the rcu gp when needed. 
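To make the intended flow concrete, here is a minimal sketch of the BPF side, assuming the __uptr tag macro provided by the accompanying libbpf change and a made-up struct user_data. User space stores a user-space address into the value and installs it with the bpf_map_update_elem() syscall, which pins the page and rewrites the field to the kernel-mapped address that the program dereferences below.

  #include <vmlinux.h>
  #include <bpf/bpf_helpers.h>

  #ifndef __uptr
  /* assumption: the btf_type_tag consumed by the BPF_UPTR field parser */
  #define __uptr __attribute__((btf_type_tag("uptr")))
  #endif

  struct user_data {
      int counter;
  };

  struct value_type {
      struct user_data __uptr *udata;
  };

  struct {
      __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
      __uint(map_flags, BPF_F_NO_PREALLOC);
      __type(key, int);
      __type(value, struct value_type);
  } datamap SEC(".maps");

  SEC("tp_btf/sys_enter")
  int count_syscalls(u64 *ctx)
  {
      struct value_type *v;

      /* lookup only: the uptr is installed via the update_elem syscall,
       * never via BPF_LOCAL_STORAGE_GET_F_CREATE from the program
       */
      v = bpf_task_storage_get(&datamap, bpf_get_current_task_btf(), 0, 0);
      if (v && v->udata)
          v->udata->counter++;
      return 0;
  }

  char _license[] SEC("license") = "GPL";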
Cc: linux-mm@kvack.org Cc: Shakeel Butt Signed-off-by: Martin KaFai Lau Acked-by: Shakeel Butt Link: https://lore.kernel.org/r/20241023234759.860539-7-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_local_storage.c | 7 ++- kernel/bpf/bpf_task_storage.c | 5 +- kernel/bpf/syscall.c | 106 +++++++++++++++++++++++++++++++++++++++-- 3 files changed, 111 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index ca871be1c42d..7e6a0af0afc1 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -99,9 +99,12 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, } if (selem) { - if (value) + if (value) { + /* No need to call check_and_init_map_value as memory is zero init */ copy_map_value(&smap->map, SDATA(selem)->data, value); - /* No need to call check_and_init_map_value as memory is zero init */ + if (swap_uptrs) + bpf_obj_swap_uptrs(smap->map.record, SDATA(selem)->data, value); + } return selem; } diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 45dc3ca334d3..09705f9988f3 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -129,6 +129,9 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, struct pid *pid; int fd, err; + if ((map_flags & BPF_F_LOCK) && btf_record_has_field(map->record, BPF_UPTR)) + return -EOPNOTSUPP; + fd = *(int *)key; pid = pidfd_get_pid(fd, &f_flags); if (IS_ERR(pid)) @@ -147,7 +150,7 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, bpf_task_storage_lock(); sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, map_flags, - false, GFP_ATOMIC); + true, GFP_ATOMIC); bpf_task_storage_unlock(); err = PTR_ERR_OR_ZERO(sdata); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2d2935d9c096..426a52e5c7da 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -155,6 +155,89 @@ static void maybe_wait_bpf_programs(struct bpf_map *map) synchronize_rcu(); } +static void unpin_uptr_kaddr(void *kaddr) +{ + if (kaddr) + unpin_user_page(virt_to_page(kaddr)); +} + +static void __bpf_obj_unpin_uptrs(struct btf_record *rec, u32 cnt, void *obj) +{ + const struct btf_field *field; + void **uptr_addr; + int i; + + for (i = 0, field = rec->fields; i < cnt; i++, field++) { + if (field->type != BPF_UPTR) + continue; + + uptr_addr = obj + field->offset; + unpin_uptr_kaddr(*uptr_addr); + } +} + +static void bpf_obj_unpin_uptrs(struct btf_record *rec, void *obj) +{ + if (!btf_record_has_field(rec, BPF_UPTR)) + return; + + __bpf_obj_unpin_uptrs(rec, rec->cnt, obj); +} + +static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj) +{ + const struct btf_field *field; + const struct btf_type *t; + unsigned long start, end; + struct page *page; + void **uptr_addr; + int i, err; + + if (!btf_record_has_field(rec, BPF_UPTR)) + return 0; + + for (i = 0, field = rec->fields; i < rec->cnt; i++, field++) { + if (field->type != BPF_UPTR) + continue; + + uptr_addr = obj + field->offset; + start = *(unsigned long *)uptr_addr; + if (!start) + continue; + + t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id); + /* t->size was checked for zero before */ + if (check_add_overflow(start, t->size - 1, &end)) { + err = -EFAULT; + goto unpin_all; + } + + /* The uptr's struct cannot span across two pages */ + if ((start & PAGE_MASK) != (end & PAGE_MASK)) { + err = -EOPNOTSUPP; + goto unpin_all; + } + + err = 
pin_user_pages_fast(start, 1, FOLL_LONGTERM | FOLL_WRITE, &page); + if (err != 1) + goto unpin_all; + + if (PageHighMem(page)) { + err = -EOPNOTSUPP; + unpin_user_page(page); + goto unpin_all; + } + + *uptr_addr = page_address(page) + offset_in_page(start); + } + + return 0; + +unpin_all: + __bpf_obj_unpin_uptrs(rec, i, obj); + return err; +} + static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, void *key, void *value, __u64 flags) { @@ -199,9 +282,14 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { err = map->ops->map_push_elem(map, value, flags); } else { - rcu_read_lock(); - err = map->ops->map_update_elem(map, key, value, flags); - rcu_read_unlock(); + err = bpf_obj_pin_uptrs(map->record, value); + if (!err) { + rcu_read_lock(); + err = map->ops->map_update_elem(map, key, value, flags); + rcu_read_unlock(); + if (err) + bpf_obj_unpin_uptrs(map->record, value); + } } bpf_enable_instrumentation(); @@ -716,6 +804,10 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) field->kptr.dtor(xchgd_field); } break; + case BPF_UPTR: + /* The caller ensured that no one is using the uptr */ + unpin_uptr_kaddr(*(void **)field_ptr); + break; case BPF_LIST_HEAD: if (WARN_ON_ONCE(rec->spin_lock_off < 0)) continue; @@ -1107,7 +1199,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | - BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE, + BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; @@ -1163,6 +1255,12 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, goto free_map_tab; } break; + case BPF_UPTR: + if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + break; case BPF_LIST_HEAD: case BPF_RB_ROOT: if (map->map_type != BPF_MAP_TYPE_HASH && -- cgit From 2e9a548009c2d804e55cdd5b0e9903756cf7d9b3 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 30 Oct 2024 15:28:18 -0700 Subject: bpf: Add open coded version of kmem_cache iterator Add a new open coded iterator for kmem_cache which can be called from a BPF program like below. It doesn't take any argument and traverses all kmem_cache entries. struct kmem_cache *pos; bpf_for_each(kmem_cache, pos) { ... } As it needs to grab slab_mutex, it should be called from sleepable BPF programs only. Also update the existing iterator code to use the open coded version internally as suggested by Andrii. 
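For illustration, the bpf_for_each(kmem_cache, pos) form above expands to roughly the explicit new/next/destroy sequence sketched below. The iterator struct and extern kfunc declarations mirror what the selftests provide and are repeated here only to keep the example self-contained.

  #include <vmlinux.h>
  #include <bpf/bpf_helpers.h>

  struct bpf_iter_kmem_cache {
      __u64 __opaque[1];
  } __attribute__((aligned(8)));

  extern int bpf_iter_kmem_cache_new(struct bpf_iter_kmem_cache *it) __weak __ksym;
  extern struct kmem_cache *bpf_iter_kmem_cache_next(struct bpf_iter_kmem_cache *it) __weak __ksym;
  extern void bpf_iter_kmem_cache_destroy(struct bpf_iter_kmem_cache *it) __weak __ksym;

  long kmem_cache_cnt;

  /* syscall programs are sleepable, which these kfuncs require */
  SEC("syscall")
  int count_kmem_caches(void *ctx)
  {
      struct bpf_iter_kmem_cache it;
      struct kmem_cache *pos;

      bpf_iter_kmem_cache_new(&it);
      while ((pos = bpf_iter_kmem_cache_next(&it)))
          kmem_cache_cnt++;
      bpf_iter_kmem_cache_destroy(&it);

      return 0;
  }

  char _license[] SEC("license") = "GPL";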
Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20241030222819.1800667-1-namhyung@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 3 + kernel/bpf/kmem_cache_iter.c | 151 ++++++++++++++++++++++++++++++------------- 2 files changed, 110 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 2e82f8d3a76f..395221e53832 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3112,6 +3112,9 @@ BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_get_kmem_cache) +BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/kmem_cache_iter.c b/kernel/bpf/kmem_cache_iter.c index ebc101d7da51..3ae2158d767f 100644 --- a/kernel/bpf/kmem_cache_iter.c +++ b/kernel/bpf/kmem_cache_iter.c @@ -8,16 +8,116 @@ #include "../../mm/slab.h" /* kmem_cache, slab_caches and slab_mutex */ +/* open-coded version */ +struct bpf_iter_kmem_cache { + __u64 __opaque[1]; +} __attribute__((aligned(8))); + +struct bpf_iter_kmem_cache_kern { + struct kmem_cache *pos; +} __attribute__((aligned(8))); + +#define KMEM_CACHE_POS_START ((void *)1L) + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_iter_kmem_cache_new(struct bpf_iter_kmem_cache *it) +{ + struct bpf_iter_kmem_cache_kern *kit = (void *)it; + + BUILD_BUG_ON(sizeof(*kit) > sizeof(*it)); + BUILD_BUG_ON(__alignof__(*kit) != __alignof__(*it)); + + kit->pos = KMEM_CACHE_POS_START; + return 0; +} + +__bpf_kfunc struct kmem_cache *bpf_iter_kmem_cache_next(struct bpf_iter_kmem_cache *it) +{ + struct bpf_iter_kmem_cache_kern *kit = (void *)it; + struct kmem_cache *prev = kit->pos; + struct kmem_cache *next; + bool destroy = false; + + if (!prev) + return NULL; + + mutex_lock(&slab_mutex); + + if (list_empty(&slab_caches)) { + mutex_unlock(&slab_mutex); + return NULL; + } + + if (prev == KMEM_CACHE_POS_START) + next = list_first_entry(&slab_caches, struct kmem_cache, list); + else if (list_last_entry(&slab_caches, struct kmem_cache, list) == prev) + next = NULL; + else + next = list_next_entry(prev, list); + + /* boot_caches have negative refcount, don't touch them */ + if (next && next->refcount > 0) + next->refcount++; + + /* Skip kmem_cache_destroy() for active entries */ + if (prev && prev != KMEM_CACHE_POS_START) { + if (prev->refcount > 1) + prev->refcount--; + else if (prev->refcount == 1) + destroy = true; + } + + mutex_unlock(&slab_mutex); + + if (destroy) + kmem_cache_destroy(prev); + + kit->pos = next; + return next; +} + +__bpf_kfunc void bpf_iter_kmem_cache_destroy(struct bpf_iter_kmem_cache *it) +{ + struct bpf_iter_kmem_cache_kern *kit = (void *)it; + struct kmem_cache *s = kit->pos; + bool destroy = false; + + if (s == NULL || s == KMEM_CACHE_POS_START) + return; + + mutex_lock(&slab_mutex); + + /* Skip kmem_cache_destroy() for active entries */ + if (s->refcount > 1) + s->refcount--; + else if (s->refcount == 1) + destroy = true; + + mutex_unlock(&slab_mutex); + + if (destroy) + kmem_cache_destroy(s); +} + +__bpf_kfunc_end_defs(); + struct bpf_iter__kmem_cache { __bpf_md_ptr(struct bpf_iter_meta *, meta); 
__bpf_md_ptr(struct kmem_cache *, s); }; +union kmem_cache_iter_priv { + struct bpf_iter_kmem_cache it; + struct bpf_iter_kmem_cache_kern kit; +}; + static void *kmem_cache_iter_seq_start(struct seq_file *seq, loff_t *pos) { loff_t cnt = 0; bool found = false; struct kmem_cache *s; + union kmem_cache_iter_priv *p = seq->private; mutex_lock(&slab_mutex); @@ -43,8 +143,9 @@ static void *kmem_cache_iter_seq_start(struct seq_file *seq, loff_t *pos) mutex_unlock(&slab_mutex); if (!found) - return NULL; + s = NULL; + p->kit.pos = s; return s; } @@ -55,63 +156,24 @@ static void kmem_cache_iter_seq_stop(struct seq_file *seq, void *v) .meta = &meta, .s = v, }; + union kmem_cache_iter_priv *p = seq->private; struct bpf_prog *prog; - bool destroy = false; meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog && !ctx.s) bpf_iter_run_prog(prog, &ctx); - if (ctx.s == NULL) - return; - - mutex_lock(&slab_mutex); - - /* Skip kmem_cache_destroy() for active entries */ - if (ctx.s->refcount > 1) - ctx.s->refcount--; - else if (ctx.s->refcount == 1) - destroy = true; - - mutex_unlock(&slab_mutex); - - if (destroy) - kmem_cache_destroy(ctx.s); + bpf_iter_kmem_cache_destroy(&p->it); } static void *kmem_cache_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct kmem_cache *s = v; - struct kmem_cache *next = NULL; - bool destroy = false; + union kmem_cache_iter_priv *p = seq->private; ++*pos; - mutex_lock(&slab_mutex); - - if (list_last_entry(&slab_caches, struct kmem_cache, list) != s) { - next = list_next_entry(s, list); - - WARN_ON_ONCE(next->refcount == 0); - - /* boot_caches have negative refcount, don't touch them */ - if (next->refcount > 0) - next->refcount++; - } - - /* Skip kmem_cache_destroy() for active entries */ - if (s->refcount > 1) - s->refcount--; - else if (s->refcount == 1) - destroy = true; - - mutex_unlock(&slab_mutex); - - if (destroy) - kmem_cache_destroy(s); - - return next; + return bpf_iter_kmem_cache_next(&p->it); } static int kmem_cache_iter_seq_show(struct seq_file *seq, void *v) @@ -143,6 +205,7 @@ BTF_ID_LIST_GLOBAL_SINGLE(bpf_kmem_cache_btf_id, struct, kmem_cache) static const struct bpf_iter_seq_info kmem_cache_iter_seq_info = { .seq_ops = &kmem_cache_iter_seq_ops, + .seq_priv_size = sizeof(union kmem_cache_iter_priv), }; static void bpf_iter_kmem_cache_show_fdinfo(const struct bpf_iter_aux_info *aux, -- cgit From 46f7ed32f7a873d6675ea72e1d6317df41a55f81 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sun, 3 Nov 2024 14:59:38 -0800 Subject: bpf: Tighten tail call checks for lingering locks, RCU, preempt_disable There are three situations when a program logically exits and transfers control to the kernel or another program: bpf_throw, BPF_EXIT, and tail calls. The former two check for any lingering locks and references, but tail calls currently do not. Expand the checks to check for spin locks, RCU read sections and preempt disabled sections. Spin locks are indirectly preventing tail calls as function calls are disallowed, but the checks for preemption and RCU are more relaxed, hence ensure tail calls are prevented in their presence. 
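For illustration, a minimal program of the kind the tightened check now rejects; the map and section names are made up, and before this change a program like it was only checked for lingering references.

  #include <vmlinux.h>
  #include <bpf/bpf_helpers.h>

  extern void bpf_rcu_read_lock(void) __ksym;
  extern void bpf_rcu_read_unlock(void) __ksym;

  struct {
      __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
      __uint(max_entries, 1);
      __type(key, __u32);
      __type(value, __u32);
  } jmp_table SEC(".maps");

  SEC("tc")
  int now_rejected(struct __sk_buff *skb)
  {
      bpf_rcu_read_lock();
      /* verifier: "tail_call cannot be used inside bpf_rcu_read_lock-ed region" */
      bpf_tail_call(skb, &jmp_table, 0);
      bpf_rcu_read_unlock();
      return 0;
  }

  char _license[] SEC("license") = "GPL";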
Fixes: 9bb00b2895cb ("bpf: Add kfunc bpf_rcu_read_lock/unlock()") Fixes: fc7566ad0a82 ("bpf: Introduce bpf_preempt_[disable,enable] kfuncs") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241103225940.1408302-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 797cf3ed32e0..0844b4383ff3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10620,11 +10620,26 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn switch (func_id) { case BPF_FUNC_tail_call: + if (env->cur_state->active_lock.ptr) { + verbose(env, "tail_call cannot be used inside bpf_spin_lock-ed region\n"); + return -EINVAL; + } + err = check_reference_leak(env, false); if (err) { verbose(env, "tail_call would lead to reference leak\n"); return err; } + + if (env->cur_state->active_rcu_lock) { + verbose(env, "tail_call cannot be used inside bpf_rcu_read_lock-ed region\n"); + return -EINVAL; + } + + if (env->cur_state->active_preempt_lock) { + verbose(env, "tail_call cannot be used inside bpf_preempt_disable-ed region\n"); + return -EINVAL; + } break; case BPF_FUNC_get_local_storage: /* check that flags argument in get_local_storage(map, flags) is 0, -- cgit From d402755ced2ea8fc1f0513136f074002d509bfa0 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sun, 3 Nov 2024 14:59:39 -0800 Subject: bpf: Unify resource leak checks There are similar checks for covering locks, references, RCU read sections and preempt_disable sections in 3 places in the verifer, i.e. for tail calls, bpf_ld_[abs, ind], and exit path (for BPF_EXIT and bpf_throw). Unify all of these into a common check_resource_leak function to avoid code duplication. Also update the error strings in selftests to the new ones in the same change to ensure clean bisection. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241103225940.1408302-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 90 +++++++++++++++++++-------------------------------- 1 file changed, 34 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0844b4383ff3..ba800c7611e3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10352,6 +10352,34 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi return refs_lingering ? 
-EINVAL : 0; } +static int check_resource_leak(struct bpf_verifier_env *env, bool exception_exit, bool check_lock, const char *prefix) +{ + int err; + + if (check_lock && env->cur_state->active_lock.ptr) { + verbose(env, "%s cannot be used inside bpf_spin_lock-ed region\n", prefix); + return -EINVAL; + } + + err = check_reference_leak(env, exception_exit); + if (err) { + verbose(env, "%s would lead to reference leak\n", prefix); + return err; + } + + if (check_lock && env->cur_state->active_rcu_lock) { + verbose(env, "%s cannot be used inside bpf_rcu_read_lock-ed region\n", prefix); + return -EINVAL; + } + + if (check_lock && env->cur_state->active_preempt_lock) { + verbose(env, "%s cannot be used inside bpf_preempt_disable-ed region\n", prefix); + return -EINVAL; + } + + return 0; +} + static int check_bpf_snprintf_call(struct bpf_verifier_env *env, struct bpf_reg_state *regs) { @@ -10620,26 +10648,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn switch (func_id) { case BPF_FUNC_tail_call: - if (env->cur_state->active_lock.ptr) { - verbose(env, "tail_call cannot be used inside bpf_spin_lock-ed region\n"); - return -EINVAL; - } - - err = check_reference_leak(env, false); - if (err) { - verbose(env, "tail_call would lead to reference leak\n"); + err = check_resource_leak(env, false, true, "tail_call"); + if (err) return err; - } - - if (env->cur_state->active_rcu_lock) { - verbose(env, "tail_call cannot be used inside bpf_rcu_read_lock-ed region\n"); - return -EINVAL; - } - - if (env->cur_state->active_preempt_lock) { - verbose(env, "tail_call cannot be used inside bpf_preempt_disable-ed region\n"); - return -EINVAL; - } break; case BPF_FUNC_get_local_storage: /* check that flags argument in get_local_storage(map, flags) is 0, @@ -15801,26 +15812,9 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) * gen_ld_abs() may terminate the program at runtime, leading to * reference leak. */ - err = check_reference_leak(env, false); - if (err) { - verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n"); + err = check_resource_leak(env, false, true, "BPF_LD_[ABS|IND]"); + if (err) return err; - } - - if (env->cur_state->active_lock.ptr) { - verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n"); - return -EINVAL; - } - - if (env->cur_state->active_rcu_lock) { - verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_rcu_read_lock-ed region\n"); - return -EINVAL; - } - - if (env->cur_state->active_preempt_lock) { - verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_preempt_disable-ed region\n"); - return -EINVAL; - } if (regs[ctx_reg].type != PTR_TO_CTX) { verbose(env, @@ -18606,30 +18600,14 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } process_bpf_exit_full: - if (env->cur_state->active_lock.ptr && !env->cur_state->curframe) { - verbose(env, "bpf_spin_unlock is missing\n"); - return -EINVAL; - } - - if (env->cur_state->active_rcu_lock && !env->cur_state->curframe) { - verbose(env, "bpf_rcu_read_unlock is missing\n"); - return -EINVAL; - } - - if (env->cur_state->active_preempt_lock && !env->cur_state->curframe) { - verbose(env, "%d bpf_preempt_enable%s missing\n", - env->cur_state->active_preempt_lock, - env->cur_state->active_preempt_lock == 1 ? 
" is" : "(s) are"); - return -EINVAL; - } - /* We must do check_reference_leak here before * prepare_func_exit to handle the case when * state->curframe > 0, it may be a callback * function, for which reference_state must * match caller reference state when it exits. */ - err = check_reference_leak(env, exception_exit); + err = check_resource_leak(env, exception_exit, !env->cur_state->curframe, + "BPF_EXIT instruction"); if (err) return err; -- cgit From cb4158ce8ec8a5bb528cc1693356a5eb8058094d Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 4 Nov 2024 09:19:57 -0800 Subject: bpf: Mark raw_tp arguments with PTR_MAYBE_NULL Arguments to a raw tracepoint are tagged as trusted, which carries the semantics that the pointer will be non-NULL. However, in certain cases, a raw tracepoint argument may end up being NULL. More context about this issue is available in [0]. Thus, there is a discrepancy between the reality, that raw_tp arguments can actually be NULL, and the verifier's knowledge, that they are never NULL, causing explicit NULL checks to be deleted, and accesses to such pointers potentially crashing the kernel. To fix this, mark raw_tp arguments as PTR_MAYBE_NULL, and then special case the dereference and pointer arithmetic to permit it, and allow passing them into helpers/kfuncs; these exceptions are made for raw_tp programs only. Ensure that we don't do this when ref_obj_id > 0, as in that case this is an acquired object and doesn't need such adjustment. The reason we do mask_raw_tp_trusted_reg logic is because other will recheck in places whether the register is a trusted_reg, and then consider our register as untrusted when detecting the presence of the PTR_MAYBE_NULL flag. To allow safe dereference, we enable PROBE_MEM marking when we see loads into trusted pointers with PTR_MAYBE_NULL. While trusted raw_tp arguments can also be passed into helpers or kfuncs where such broken assumption may cause issues, a future patch set will tackle their case separately, as PTR_TO_BTF_ID (without PTR_TRUSTED) can already be passed into helpers and causes similar problems. Thus, they are left alone for now. It is possible that these checks also permit passing non-raw_tp args that are trusted PTR_TO_BTF_ID with null marking. In such a case, allowing dereference when pointer is NULL expands allowed behavior, so won't regress existing programs, and the case of passing these into helpers is the same as above and will be dealt with later. Also update the failure case in tp_btf_nullable selftest to capture the new behavior, as the verifier will no longer cause an error when directly dereference a raw tracepoint argument marked as __nullable. 
[0]: https://lore.kernel.org/bpf/ZrCZS6nisraEqehw@jlelli-thinkpadt14gen4.remote.csb Reviewed-by: Jiri Olsa Reported-by: Juri Lelli Tested-by: Juri Lelli Fixes: 3f00c5239344 ("bpf: Allow trusted pointers to be passed to KF_TRUSTED_ARGS kfuncs") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241104171959.2938862-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 5 +++- kernel/bpf/verifier.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 76 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ed3219da7181..e7a59e6462a9 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6588,7 +6588,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (prog_args_trusted(prog)) info->reg_type |= PTR_TRUSTED; - if (btf_param_match_suffix(btf, &args[arg], "__nullable")) + /* Raw tracepoint arguments always get marked as maybe NULL */ + if (bpf_prog_is_raw_tp(prog)) + info->reg_type |= PTR_MAYBE_NULL; + else if (btf_param_match_suffix(btf, &args[arg], "__nullable")) info->reg_type |= PTR_MAYBE_NULL; if (tgt_prog) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ba800c7611e3..7958d6ff6b73 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -418,6 +418,25 @@ static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg) return rec; } +static bool mask_raw_tp_reg_cond(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) { + return reg->type == (PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL) && + bpf_prog_is_raw_tp(env->prog) && !reg->ref_obj_id; +} + +static bool mask_raw_tp_reg(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + if (!mask_raw_tp_reg_cond(env, reg)) + return false; + reg->type &= ~PTR_MAYBE_NULL; + return true; +} + +static void unmask_raw_tp_reg(struct bpf_reg_state *reg, bool result) +{ + if (result) + reg->type |= PTR_MAYBE_NULL; +} + static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog) { struct bpf_func_info_aux *aux = env->prog->aux->func_info_aux; @@ -6622,6 +6641,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, const char *field_name = NULL; enum bpf_type_flag flag = 0; u32 btf_id = 0; + bool mask; int ret; if (!env->allow_ptr_leaks) { @@ -6693,7 +6713,21 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (ret < 0) return ret; - + /* For raw_tp progs, we allow dereference of PTR_MAYBE_NULL + * trusted PTR_TO_BTF_ID, these are the ones that are possibly + * arguments to the raw_tp. Since internal checks in for trusted + * reg in check_ptr_to_btf_access would consider PTR_MAYBE_NULL + * modifier as problematic, mask it out temporarily for the + * check. Don't apply this to pointers with ref_obj_id > 0, as + * those won't be raw_tp args. + * + * We may end up applying this relaxation to other trusted + * PTR_TO_BTF_ID with maybe null flag, since we cannot + * distinguish PTR_MAYBE_NULL tagged for arguments vs normal + * tagging, but that should expand allowed behavior, and not + * cause regression for existing behavior. 
+ */ + mask = mask_raw_tp_reg(env, reg); if (ret != PTR_TO_BTF_ID) { /* just mark; */ @@ -6754,8 +6788,13 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, clear_trusted_flags(&flag); } - if (atype == BPF_READ && value_regno >= 0) + if (atype == BPF_READ && value_regno >= 0) { mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); + /* We've assigned a new type to regno, so don't undo masking. */ + if (regno == value_regno) + mask = false; + } + unmask_raw_tp_reg(reg, mask); return 0; } @@ -7140,7 +7179,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (base_type(reg->type) == PTR_TO_BTF_ID && - !type_may_be_null(reg->type)) { + (mask_raw_tp_reg_cond(env, reg) || !type_may_be_null(reg->type))) { err = check_ptr_to_btf_access(env, regs, regno, off, size, t, value_regno); } else if (reg->type == CONST_PTR_TO_MAP) { @@ -8833,6 +8872,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, enum bpf_reg_type type = reg->type; u32 *arg_btf_id = NULL; int err = 0; + bool mask; if (arg_type == ARG_DONTCARE) return 0; @@ -8873,11 +8913,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK) arg_btf_id = fn->arg_btf_id[arg]; + mask = mask_raw_tp_reg(env, reg); err = check_reg_type(env, regno, arg_type, arg_btf_id, meta); - if (err) - return err; - err = check_func_arg_reg_off(env, reg, regno, arg_type); + err = err ?: check_func_arg_reg_off(env, reg, regno, arg_type); + unmask_raw_tp_reg(reg, mask); if (err) return err; @@ -9672,14 +9712,17 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, return ret; } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { struct bpf_call_arg_meta meta; + bool mask; int err; if (register_is_null(reg) && type_may_be_null(arg->arg_type)) continue; memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */ + mask = mask_raw_tp_reg(env, reg); err = check_reg_type(env, regno, arg->arg_type, &arg->btf_id, &meta); err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type); + unmask_raw_tp_reg(reg, mask); if (err) return err; } else { @@ -12007,6 +12050,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ enum bpf_arg_type arg_type = ARG_DONTCARE; u32 regno = i + 1, ref_id, type_size; bool is_ret_buf_sz = false; + bool mask = false; int kf_arg_type; t = btf_type_skip_modifiers(btf, args[i].type, NULL); @@ -12065,12 +12109,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EINVAL; } + mask = mask_raw_tp_reg(env, reg); if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) && (register_is_null(reg) || type_may_be_null(reg->type)) && !is_kfunc_arg_nullable(meta->btf, &args[i])) { verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); + unmask_raw_tp_reg(reg, mask); return -EACCES; } + unmask_raw_tp_reg(reg, mask); if (reg->ref_obj_id) { if (is_kfunc_release(meta) && meta->ref_obj_id) { @@ -12128,16 +12175,24 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta)) break; + /* Allow passing maybe NULL raw_tp arguments to + * kfuncs for compatibility. Don't apply this to + * arguments with ref_obj_id > 0. 
+ */ + mask = mask_raw_tp_reg(env, reg); if (!is_trusted_reg(reg)) { if (!is_kfunc_rcu(meta)) { verbose(env, "R%d must be referenced or trusted\n", regno); + unmask_raw_tp_reg(reg, mask); return -EINVAL; } if (!is_rcu_reg(reg)) { verbose(env, "R%d must be a rcu pointer\n", regno); + unmask_raw_tp_reg(reg, mask); return -EINVAL; } } + unmask_raw_tp_reg(reg, mask); fallthrough; case KF_ARG_PTR_TO_CTX: case KF_ARG_PTR_TO_DYNPTR: @@ -12160,7 +12215,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_release(meta) && reg->ref_obj_id) arg_type |= OBJ_RELEASE; + mask = mask_raw_tp_reg(env, reg); ret = check_func_arg_reg_off(env, reg, regno, arg_type); + unmask_raw_tp_reg(reg, mask); if (ret < 0) return ret; @@ -12337,6 +12394,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ ref_tname = btf_name_by_offset(btf, ref_t->name_off); fallthrough; case KF_ARG_PTR_TO_BTF_ID: + mask = mask_raw_tp_reg(env, reg); /* Only base_type is checked, further checks are done here */ if ((base_type(reg->type) != PTR_TO_BTF_ID || (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) && @@ -12345,9 +12403,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "expected %s or socket\n", reg_type_str(env, base_type(reg->type) | (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS))); + unmask_raw_tp_reg(reg, mask); return -EINVAL; } ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i); + unmask_raw_tp_reg(reg, mask); if (ret < 0) return ret; break; @@ -13320,7 +13380,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env, */ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn, - const struct bpf_reg_state *ptr_reg, + struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg) { struct bpf_verifier_state *vstate = env->cur_state; @@ -13334,6 +13394,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, struct bpf_sanitize_info info = {}; u8 opcode = BPF_OP(insn->code); u32 dst = insn->dst_reg; + bool mask; int ret; dst_reg = ®s[dst]; @@ -13360,11 +13421,14 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; } + mask = mask_raw_tp_reg(env, ptr_reg); if (ptr_reg->type & PTR_MAYBE_NULL) { verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n", dst, reg_type_str(env, ptr_reg->type)); + unmask_raw_tp_reg(ptr_reg, mask); return -EACCES; } + unmask_raw_tp_reg(ptr_reg, mask); switch (base_type(ptr_reg->type)) { case PTR_TO_CTX: @@ -19866,6 +19930,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) * for this case. */ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED: + case PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL: if (type == BPF_READ) { if (BPF_MODE(insn->code) == BPF_MEM) insn->code = BPF_LDX | BPF_PROBE_MEM | -- cgit From 17c4b65a24938c6dd79496cce5df15f70d9c253c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Nov 2024 14:45:32 +0100 Subject: bpf: Allow return values 0 and 1 for kprobe session The kprobe session program can return only 0 or 1, instruct verifier to check for that. 
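For example, a session program that stays within the newly enforced range; the bpf_session_is_return() declaration follows the selftests and the probed symbol is arbitrary.

  #include <vmlinux.h>
  #include <bpf/bpf_helpers.h>

  extern bool bpf_session_is_return(void) __weak __ksym;

  int skip_return_probe;

  SEC("kprobe.session/do_sys_openat2")
  int handle(struct pt_regs *ctx)
  {
      if (bpf_session_is_return())
          return 0;    /* return value of the exit leg is not used */

      /* entry leg: 1 means "do not run the return probe for this hit";
       * anything outside [0, 1] is now rejected at load time
       */
      return skip_return_probe ? 1 : 0;
  }

  char _license[] SEC("license") = "GPL";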
Fixes: 535a3692ba72 ("bpf: Add support for kprobe session attach") Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241108134544.480660-2-jolsa@kernel.org --- kernel/bpf/verifier.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7958d6ff6b73..7d8ed377b35d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16024,6 +16024,15 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char return -ENOTSUPP; } break; + case BPF_PROG_TYPE_KPROBE: + switch (env->prog->expected_attach_type) { + case BPF_TRACE_KPROBE_SESSION: + range = retval_range(0, 1); + break; + default: + return 0; + } + break; case BPF_PROG_TYPE_SK_LOOKUP: range = retval_range(SK_DROP, SK_PASS); break; -- cgit From f505005bc7426f4309880da94cfbfc37efa225bd Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Nov 2024 14:45:33 +0100 Subject: bpf: Force uprobe bpf program to always return 0 As suggested by Andrii make uprobe multi bpf programs to always return 0, so they can't force uprobe removal. Keeping the int return type for uprobe_prog_run, because it will be used in following session changes. Fixes: 89ae89f53d20 ("bpf: Add multi uprobe link") Suggested-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241108134544.480660-3-jolsa@kernel.org --- kernel/trace/bpf_trace.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 88fd628850ca..db9e2792b42b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3229,7 +3229,6 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe, struct bpf_prog *prog = link->link.prog; bool sleepable = prog->sleepable; struct bpf_run_ctx *old_run_ctx; - int err = 0; if (link->task && !same_thread_group(current, link->task)) return 0; @@ -3242,7 +3241,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe, migrate_disable(); old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); - err = bpf_prog_run(link->link.prog, regs); + bpf_prog_run(link->link.prog, regs); bpf_reset_run_ctx(old_run_ctx); migrate_enable(); @@ -3251,7 +3250,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe, rcu_read_unlock_trace(); else rcu_read_unlock(); - return err; + return 0; } static bool -- cgit From d920179b3d4842a0e27cae54fdddbe5ef3977e73 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Nov 2024 14:45:34 +0100 Subject: bpf: Add support for uprobe multi session attach Adding support to attach BPF program for entry and return probe of the same function. This is common use case which at the moment requires to create two uprobe multi links. Adding new BPF_TRACE_UPROBE_SESSION attach type that instructs kernel to attach single link program to both entry and exit probe. It's possible to control execution of the BPF program on return probe simply by returning zero or non zero from the entry BPF program execution to execute or not the BPF program on return probe respectively. 
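A minimal sketch of the program side follows; attachment is done from user space, for example with bpf_program__attach_uprobe_multi(), so no target is embedded in the section name here.

  #include <vmlinux.h>
  #include <bpf/bpf_helpers.h>

  long hits;

  /* one program, attached to both the entry and the return probe */
  SEC("uprobe.session")
  int handle(struct pt_regs *ctx)
  {
      hits++;

      /* On the entry invocation, a non-zero return value tells the
       * kernel to skip the matching return-probe invocation; the
       * return value of the return invocation itself is ignored.
       */
      return 0;
  }

  char _license[] SEC("license") = "GPL";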
Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241108134544.480660-4-jolsa@kernel.org --- kernel/bpf/syscall.c | 9 +++++++-- kernel/bpf/verifier.c | 1 + kernel/trace/bpf_trace.c | 36 +++++++++++++++++++++++++++--------- 3 files changed, 35 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8254b2973157..58190ca724a2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4103,10 +4103,14 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI && attach_type != BPF_TRACE_UPROBE_MULTI) return -EINVAL; + if (prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION && + attach_type != BPF_TRACE_UPROBE_SESSION) + return -EINVAL; if (attach_type != BPF_PERF_EVENT && attach_type != BPF_TRACE_KPROBE_MULTI && attach_type != BPF_TRACE_KPROBE_SESSION && - attach_type != BPF_TRACE_UPROBE_MULTI) + attach_type != BPF_TRACE_UPROBE_MULTI && + attach_type != BPF_TRACE_UPROBE_SESSION) return -EINVAL; return 0; case BPF_PROG_TYPE_SCHED_CLS: @@ -5359,7 +5363,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI || attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION) ret = bpf_kprobe_multi_link_attach(attr, prog); - else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI) + else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI || + attr->link_create.attach_type == BPF_TRACE_UPROBE_SESSION) ret = bpf_uprobe_multi_link_attach(attr, prog); break; default: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7d8ed377b35d..132fc172961f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16027,6 +16027,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char case BPF_PROG_TYPE_KPROBE: switch (env->prog->expected_attach_type) { case BPF_TRACE_KPROBE_SESSION: + case BPF_TRACE_UPROBE_SESSION: range = retval_range(0, 1); break; default: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index db9e2792b42b..9c04b1364de2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1581,6 +1581,17 @@ static inline bool is_kprobe_session(const struct bpf_prog *prog) return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; } +static inline bool is_uprobe_multi(const struct bpf_prog *prog) +{ + return prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI || + prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION; +} + +static inline bool is_uprobe_session(const struct bpf_prog *prog) +{ + return prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION; +} + static const struct bpf_func_proto * kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1598,13 +1609,13 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_func_ip: if (is_kprobe_multi(prog)) return &bpf_get_func_ip_proto_kprobe_multi; - if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI) + if (is_uprobe_multi(prog)) return &bpf_get_func_ip_proto_uprobe_multi; return &bpf_get_func_ip_proto_kprobe; case BPF_FUNC_get_attach_cookie: if (is_kprobe_multi(prog)) return &bpf_get_attach_cookie_proto_kmulti; - if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI) + if (is_uprobe_multi(prog)) return &bpf_get_attach_cookie_proto_umulti; return &bpf_get_attach_cookie_proto_trace; default: @@ 
-3096,6 +3107,7 @@ struct bpf_uprobe { u64 cookie; struct uprobe *uprobe; struct uprobe_consumer consumer; + bool session; }; struct bpf_uprobe_multi_link { @@ -3267,9 +3279,13 @@ uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs, __u64 *data) { struct bpf_uprobe *uprobe; + int ret; uprobe = container_of(con, struct bpf_uprobe, consumer); - return uprobe_prog_run(uprobe, instruction_pointer(regs), regs); + ret = uprobe_prog_run(uprobe, instruction_pointer(regs), regs); + if (uprobe->session) + return ret ? UPROBE_HANDLER_IGNORE : 0; + return 0; } static int @@ -3279,7 +3295,8 @@ uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, s struct bpf_uprobe *uprobe; uprobe = container_of(con, struct bpf_uprobe, consumer); - return uprobe_prog_run(uprobe, func, regs); + uprobe_prog_run(uprobe, func, regs); + return 0; } static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) @@ -3318,7 +3335,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; - if (prog->expected_attach_type != BPF_TRACE_UPROBE_MULTI) + if (!is_uprobe_multi(prog)) return -EINVAL; flags = attr->link_create.uprobe_multi.flags; @@ -3394,11 +3411,12 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr uprobes[i].link = link; - if (flags & BPF_F_UPROBE_MULTI_RETURN) - uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler; - else + if (!(flags & BPF_F_UPROBE_MULTI_RETURN)) uprobes[i].consumer.handler = uprobe_multi_link_handler; - + if (flags & BPF_F_UPROBE_MULTI_RETURN || is_uprobe_session(prog)) + uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler; + if (is_uprobe_session(prog)) + uprobes[i].session = true; if (pid) uprobes[i].consumer.filter = uprobe_multi_link_filter; } -- cgit From 99b403d2060d3e2604958a0ec3a7f37b18256d6b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Nov 2024 14:45:35 +0100 Subject: bpf: Add support for uprobe multi session context Placing bpf_session_run_ctx layer in between bpf_run_ctx and bpf_uprobe_multi_run_ctx, so the session data can be retrieved from uprobe_multi link. Plus granting session kfuncs access to uprobe session programs. 
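With that in place, entry and return legs of a uprobe session program can share state through the session cookie, as in the hedged sketch below; the kfunc declarations follow the selftests and the attachment target is left to user space.

  #include <vmlinux.h>
  #include <bpf/bpf_helpers.h>

  extern bool bpf_session_is_return(void) __weak __ksym;
  extern __u64 *bpf_session_cookie(void) __weak __ksym;

  __u64 total_ns;

  SEC("uprobe.session")
  int measure(struct pt_regs *ctx)
  {
      __u64 *cookie = bpf_session_cookie();

      if (!bpf_session_is_return()) {
          *cookie = bpf_ktime_get_ns();    /* stash entry timestamp */
          return 0;                        /* run the return leg */
      }

      total_ns += bpf_ktime_get_ns() - *cookie;
      return 0;
  }

  char _license[] SEC("license") = "GPL";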
Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241108134544.480660-5-jolsa@kernel.org --- kernel/trace/bpf_trace.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9c04b1364de2..949a3870946c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3120,7 +3120,7 @@ struct bpf_uprobe_multi_link { }; struct bpf_uprobe_multi_run_ctx { - struct bpf_run_ctx run_ctx; + struct bpf_session_run_ctx session_ctx; unsigned long entry_ip; struct bpf_uprobe *uprobe; }; @@ -3231,16 +3231,22 @@ static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { static int uprobe_prog_run(struct bpf_uprobe *uprobe, unsigned long entry_ip, - struct pt_regs *regs) + struct pt_regs *regs, + bool is_return, void *data) { struct bpf_uprobe_multi_link *link = uprobe->link; struct bpf_uprobe_multi_run_ctx run_ctx = { + .session_ctx = { + .is_return = is_return, + .data = data, + }, .entry_ip = entry_ip, .uprobe = uprobe, }; struct bpf_prog *prog = link->link.prog; bool sleepable = prog->sleepable; struct bpf_run_ctx *old_run_ctx; + int err; if (link->task && !same_thread_group(current, link->task)) return 0; @@ -3252,8 +3258,8 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe, migrate_disable(); - old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); - bpf_prog_run(link->link.prog, regs); + old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx); + err = bpf_prog_run(link->link.prog, regs); bpf_reset_run_ctx(old_run_ctx); migrate_enable(); @@ -3262,7 +3268,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe, rcu_read_unlock_trace(); else rcu_read_unlock(); - return 0; + return err; } static bool @@ -3282,7 +3288,7 @@ uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs, int ret; uprobe = container_of(con, struct bpf_uprobe, consumer); - ret = uprobe_prog_run(uprobe, instruction_pointer(regs), regs); + ret = uprobe_prog_run(uprobe, instruction_pointer(regs), regs, false, data); if (uprobe->session) return ret ? 
UPROBE_HANDLER_IGNORE : 0; return 0; @@ -3295,7 +3301,7 @@ uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, s struct bpf_uprobe *uprobe; uprobe = container_of(con, struct bpf_uprobe, consumer); - uprobe_prog_run(uprobe, func, regs); + uprobe_prog_run(uprobe, func, regs, true, data); return 0; } @@ -3303,7 +3309,8 @@ static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) { struct bpf_uprobe_multi_run_ctx *run_ctx; - run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx); + run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, + session_ctx.run_ctx); return run_ctx->entry_ip; } @@ -3311,7 +3318,8 @@ static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx) { struct bpf_uprobe_multi_run_ctx *run_ctx; - run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx); + run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, + session_ctx.run_ctx); return run_ctx->uprobe->cookie; } @@ -3505,7 +3513,7 @@ static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id) if (!btf_id_set8_contains(&kprobe_multi_kfunc_set_ids, kfunc_id)) return 0; - if (!is_kprobe_session(prog)) + if (!is_kprobe_session(prog) && !is_uprobe_session(prog)) return -EACCES; return 0; -- cgit From b9e9ed90b10c82a4e9d4d70a2890f06bfcdd3b78 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 6 Nov 2024 14:35:40 +0800 Subject: bpf: Call free_htab_elem() after htab_unlock_bucket() For htab of maps, when the map is removed from the htab, it may hold the last reference of the map. bpf_map_fd_put_ptr() will invoke bpf_map_free_id() to free the id of the removed map element. However, bpf_map_fd_put_ptr() is invoked while holding a bucket lock (raw_spin_lock_t), and bpf_map_free_id() attempts to acquire map_idr_lock (spinlock_t), triggering the following lockdep warning: ============================= [ BUG: Invalid wait context ] 6.11.0-rc4+ #49 Not tainted ----------------------------- test_maps/4881 is trying to lock: ffffffff84884578 (map_idr_lock){+...}-{3:3}, at: bpf_map_free_id.part.0+0x21/0x70 other info that might help us debug this: context-{5:5} 2 locks held by test_maps/4881: #0: ffffffff846caf60 (rcu_read_lock){....}-{1:3}, at: bpf_fd_htab_map_update_elem+0xf9/0x270 #1: ffff888149ced148 (&htab->lockdep_key#2){....}-{2:2}, at: htab_map_update_elem+0x178/0xa80 stack backtrace: CPU: 0 UID: 0 PID: 4881 Comm: test_maps Not tainted 6.11.0-rc4+ #49 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), ... Call Trace: dump_stack_lvl+0x6e/0xb0 dump_stack+0x10/0x20 __lock_acquire+0x73e/0x36c0 lock_acquire+0x182/0x450 _raw_spin_lock_irqsave+0x43/0x70 bpf_map_free_id.part.0+0x21/0x70 bpf_map_put+0xcf/0x110 bpf_map_fd_put_ptr+0x9a/0xb0 free_htab_elem+0x69/0xe0 htab_map_update_elem+0x50f/0xa80 bpf_fd_htab_map_update_elem+0x131/0x270 htab_map_update_elem+0x50f/0xa80 bpf_fd_htab_map_update_elem+0x131/0x270 bpf_map_update_value+0x266/0x380 __sys_bpf+0x21bb/0x36b0 __x64_sys_bpf+0x45/0x60 x64_sys_call+0x1b2a/0x20d0 do_syscall_64+0x5d/0x100 entry_SYSCALL_64_after_hwframe+0x76/0x7e One way to fix the lockdep warning is using raw_spinlock_t for map_idr_lock as well. However, bpf_map_alloc_id() invokes idr_alloc_cyclic() after acquiring map_idr_lock, it will trigger a similar lockdep warning because the slab's lock (s->cpu_slab->lock) is still a spinlock. Instead of changing map_idr_lock's type, fix the issue by invoking htab_put_fd_value() after htab_unlock_bucket(). 
However, only deferring the invocation of htab_put_fd_value() is not enough, because the old map pointers in htab of maps can not be saved during batched deletion. Therefore, also defer the invocation of free_htab_elem(), so these to-be-freed elements could be linked together similar to lru map. There are four callers for ->map_fd_put_ptr: (1) alloc_htab_elem() (through htab_put_fd_value()) It invokes ->map_fd_put_ptr() under a raw_spinlock_t. The invocation of htab_put_fd_value() can not simply move after htab_unlock_bucket(), because the old element has already been stashed in htab->extra_elems. It may be reused immediately after htab_unlock_bucket() and the invocation of htab_put_fd_value() after htab_unlock_bucket() may release the newly-added element incorrectly. Therefore, saving the map pointer of the old element for htab of maps before unlocking the bucket and releasing the map_ptr after unlock. Beside the map pointer in the old element, should do the same thing for the special fields in the old element as well. (2) free_htab_elem() (through htab_put_fd_value()) Its caller includes __htab_map_lookup_and_delete_elem(), htab_map_delete_elem() and __htab_map_lookup_and_delete_batch(). For htab_map_delete_elem(), simply invoke free_htab_elem() after htab_unlock_bucket(). For __htab_map_lookup_and_delete_batch(), just like lru map, linking the to-be-freed element into node_to_free list and invoking free_htab_elem() for these element after unlock. It is safe to reuse batch_flink as the link for node_to_free, because these elements have been removed from the hash llist. Because htab of maps doesn't support lookup_and_delete operation, __htab_map_lookup_and_delete_elem() doesn't have the problem, so kept it as is. (3) fd_htab_map_free() It invokes ->map_fd_put_ptr without raw_spinlock_t. (4) bpf_fd_htab_map_update_elem() It invokes ->map_fd_put_ptr without raw_spinlock_t. After moving free_htab_elem() outside htab bucket lock scope, using pcpu_freelist_push() instead of __pcpu_freelist_push() to disable the irq before freeing elements, and protecting the invocations of bpf_mem_cache_free() with migrate_{disable|enable} pair. 
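Schematically, the delete path moves from free-under-lock to the shape below; this is a kernel C sketch with made-up toy_* names standing in for lookup_elem_raw()/free_htab_elem(), and only the ordering matters.

  #include <linux/errno.h>
  #include <linux/rculist.h>
  #include <linux/spinlock.h>

  struct toy_elem { struct hlist_node node; };
  struct toy_htab { raw_spinlock_t bucket_lock; struct hlist_head head; };

  struct toy_elem *toy_lookup(struct toy_htab *htab, void *key);
  void toy_free_elem(struct toy_htab *htab, struct toy_elem *elem);

  static long toy_delete(struct toy_htab *htab, void *key)
  {
      struct toy_elem *elem;
      unsigned long flags;

      raw_spin_lock_irqsave(&htab->bucket_lock, flags);
      elem = toy_lookup(htab, key);
      if (elem)
          hlist_del_rcu(&elem->node);    /* unlink only, under the raw spinlock */
      raw_spin_unlock_irqrestore(&htab->bucket_lock, flags);

      if (elem)
          toy_free_elem(htab, elem);     /* may now take a spinlock_t (e.g. map_idr_lock) */

      return elem ? 0 : -ENOENT;
  }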
Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20241106063542.357743-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- kernel/bpf/hashtab.c | 56 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b14b87463ee0..3ec941a0ea41 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -896,9 +896,12 @@ find_first_elem: static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { check_and_free_fields(htab, l); + + migrate_disable(); if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr); bpf_mem_cache_free(&htab->ma, l); + migrate_enable(); } static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) @@ -948,7 +951,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) if (htab_is_prealloc(htab)) { bpf_map_dec_elem_count(&htab->map); check_and_free_fields(htab, l); - __pcpu_freelist_push(&htab->freelist, &l->fnode); + pcpu_freelist_push(&htab->freelist, &l->fnode); } else { dec_elem_count(htab); htab_elem_free(htab, l); @@ -1018,7 +1021,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, */ pl_new = this_cpu_ptr(htab->extra_elems); l_new = *pl_new; - htab_put_fd_value(htab, old_elem); *pl_new = old_elem; } else { struct pcpu_freelist_node *l; @@ -1105,6 +1107,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, struct htab_elem *l_new = NULL, *l_old; struct hlist_nulls_head *head; unsigned long flags; + void *old_map_ptr; struct bucket *b; u32 key_size, hash; int ret; @@ -1183,12 +1186,27 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, hlist_nulls_add_head_rcu(&l_new->hash_node, head); if (l_old) { hlist_nulls_del_rcu(&l_old->hash_node); + + /* l_old has already been stashed in htab->extra_elems, free + * its special fields before it is available for reuse. Also + * save the old map pointer in htab of maps before unlock + * and release it after unlock. + */ + old_map_ptr = NULL; + if (htab_is_prealloc(htab)) { + if (map->ops->map_fd_put_ptr) + old_map_ptr = fd_htab_map_get_ptr(map, l_old); + check_and_free_fields(htab, l_old); + } + } + htab_unlock_bucket(htab, b, hash, flags); + if (l_old) { + if (old_map_ptr) + map->ops->map_fd_put_ptr(map, old_map_ptr, true); if (!htab_is_prealloc(htab)) free_htab_elem(htab, l_old); - else - check_and_free_fields(htab, l_old); } - ret = 0; + return 0; err: htab_unlock_bucket(htab, b, hash, flags); return ret; @@ -1432,15 +1450,15 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key) return ret; l = lookup_elem_raw(head, hash, key, key_size); - - if (l) { + if (l) hlist_nulls_del_rcu(&l->hash_node); - free_htab_elem(htab, l); - } else { + else ret = -ENOENT; - } htab_unlock_bucket(htab, b, hash, flags); + + if (l) + free_htab_elem(htab, l); return ret; } @@ -1853,13 +1871,14 @@ again_nocopy: * may cause deadlock. See comments in function * prealloc_lru_pop(). Let us do bpf_lru_push_free() * after releasing the bucket lock. + * + * For htab of maps, htab_put_fd_value() in + * free_htab_elem() may acquire a spinlock with bucket + * lock being held and it violates the lock rule, so + * invoke free_htab_elem() after unlock as well. 
*/ - if (is_lru_map) { - l->batch_flink = node_to_free; - node_to_free = l; - } else { - free_htab_elem(htab, l); - } + l->batch_flink = node_to_free; + node_to_free = l; } dst_key += key_size; dst_val += value_size; @@ -1871,7 +1890,10 @@ again_nocopy: while (node_to_free) { l = node_to_free; node_to_free = node_to_free->batch_flink; - htab_lru_push_free(htab, l); + if (is_lru_map) + htab_lru_push_free(htab, l); + else + free_htab_elem(htab, l); } next_batch: -- cgit From f6b9a69a9e56b2083aca8a925fc1a28eb698e3ed Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 9 Nov 2024 15:14:29 -0800 Subject: bpf: Refactor active lock management When bpf_spin_lock was introduced originally, there was deliberation on whether to use an array of lock IDs, but since bpf_spin_lock is limited to holding a single lock at any given time, we've been using a single ID to identify the held lock. In preparation for introducing spin locks that can be taken multiple times, introduce support for acquiring multiple lock IDs. For this purpose, reuse the acquired_refs array and store both lock and pointer references. We tag the entry with REF_TYPE_PTR or REF_TYPE_LOCK to disambiguate and find the relevant entry. The ptr field is used to track the map_ptr or btf (for bpf_obj_new allocations) to ensure locks can be matched with protected fields within the same "allocation", i.e. bpf_obj_new object or map value. The struct active_lock is changed to an int as the state is part of the acquired_refs array, and we only need active_lock as a cheap way of detecting lock presence. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241109231430.2475236-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- kernel/bpf/verifier.c | 146 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 107 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 132fc172961f..d55ca27dc031 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1284,6 +1284,7 @@ static int copy_reference_state(struct bpf_func_state *dst, const struct bpf_fun if (!dst->refs) return -ENOMEM; + dst->active_locks = src->active_locks; dst->acquired_refs = src->acquired_refs; return 0; } @@ -1354,6 +1355,7 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx) if (err) return err; id = ++env->id_gen; + state->refs[new_ofs].type = REF_TYPE_PTR; state->refs[new_ofs].id = id; state->refs[new_ofs].insn_idx = insn_idx; state->refs[new_ofs].callback_ref = state->in_callback_fn ? state->frameno : 0; @@ -1361,6 +1363,25 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx) return id; } +static int acquire_lock_state(struct bpf_verifier_env *env, int insn_idx, enum ref_state_type type, + int id, void *ptr) +{ + struct bpf_func_state *state = cur_func(env); + int new_ofs = state->acquired_refs; + int err; + + err = resize_reference_state(state, state->acquired_refs + 1); + if (err) + return err; + state->refs[new_ofs].type = type; + state->refs[new_ofs].id = id; + state->refs[new_ofs].insn_idx = insn_idx; + state->refs[new_ofs].ptr = ptr; + + state->active_locks++; + return 0; +} + /* release function corresponding to acquire_reference_state(). Idempotent. 
*/ static int release_reference_state(struct bpf_func_state *state, int ptr_id) { @@ -1368,6 +1389,8 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id) last_idx = state->acquired_refs - 1; for (i = 0; i < state->acquired_refs; i++) { + if (state->refs[i].type != REF_TYPE_PTR) + continue; if (state->refs[i].id == ptr_id) { /* Cannot release caller references in callbacks */ if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno) @@ -1383,6 +1406,45 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id) return -EINVAL; } +static int release_lock_state(struct bpf_func_state *state, int type, int id, void *ptr) +{ + int i, last_idx; + + last_idx = state->acquired_refs - 1; + for (i = 0; i < state->acquired_refs; i++) { + if (state->refs[i].type != type) + continue; + if (state->refs[i].id == id && state->refs[i].ptr == ptr) { + if (last_idx && i != last_idx) + memcpy(&state->refs[i], &state->refs[last_idx], + sizeof(*state->refs)); + memset(&state->refs[last_idx], 0, sizeof(*state->refs)); + state->acquired_refs--; + state->active_locks--; + return 0; + } + } + return -EINVAL; +} + +static struct bpf_reference_state *find_lock_state(struct bpf_verifier_env *env, enum ref_state_type type, + int id, void *ptr) +{ + struct bpf_func_state *state = cur_func(env); + int i; + + for (i = 0; i < state->acquired_refs; i++) { + struct bpf_reference_state *s = &state->refs[i]; + + if (s->type == REF_TYPE_PTR || s->type != type) + continue; + + if (s->id == id && s->ptr == ptr) + return s; + } + return NULL; +} + static void free_func_state(struct bpf_func_state *state) { if (!state) @@ -1453,8 +1515,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->active_preempt_lock = src->active_preempt_lock; dst_state->in_sleepable = src->in_sleepable; dst_state->curframe = src->curframe; - dst_state->active_lock.ptr = src->active_lock.ptr; - dst_state->active_lock.id = src->active_lock.id; dst_state->branches = src->branches; dst_state->parent = src->parent; dst_state->first_insn_idx = src->first_insn_idx; @@ -5442,7 +5502,7 @@ static bool in_sleepable(struct bpf_verifier_env *env) static bool in_rcu_cs(struct bpf_verifier_env *env) { return env->cur_state->active_rcu_lock || - env->cur_state->active_lock.ptr || + cur_func(env)->active_locks || !in_sleepable(env); } @@ -7724,19 +7784,20 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg * Since only one bpf_spin_lock is allowed the checks are simpler than * reg_is_refcounted() logic. The verifier needs to remember only * one spin_lock instead of array of acquired_refs. - * cur_state->active_lock remembers which map value element or allocated + * cur_func(env)->active_locks remembers which map value element or allocated * object got locked and clears it after bpf_spin_unlock. 
*/ static int process_spin_lock(struct bpf_verifier_env *env, int regno, bool is_lock) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - struct bpf_verifier_state *cur = env->cur_state; bool is_const = tnum_is_const(reg->var_off); + struct bpf_func_state *cur = cur_func(env); u64 val = reg->var_off.value; struct bpf_map *map = NULL; struct btf *btf = NULL; struct btf_record *rec; + int err; if (!is_const) { verbose(env, @@ -7768,16 +7829,23 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, return -EINVAL; } if (is_lock) { - if (cur->active_lock.ptr) { + void *ptr; + + if (map) + ptr = map; + else + ptr = btf; + + if (cur->active_locks) { verbose(env, "Locking two bpf_spin_locks are not allowed\n"); return -EINVAL; } - if (map) - cur->active_lock.ptr = map; - else - cur->active_lock.ptr = btf; - cur->active_lock.id = reg->id; + err = acquire_lock_state(env, env->insn_idx, REF_TYPE_LOCK, reg->id, ptr); + if (err < 0) { + verbose(env, "Failed to acquire lock state\n"); + return err; + } } else { void *ptr; @@ -7786,20 +7854,17 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, else ptr = btf; - if (!cur->active_lock.ptr) { + if (!cur->active_locks) { verbose(env, "bpf_spin_unlock without taking a lock\n"); return -EINVAL; } - if (cur->active_lock.ptr != ptr || - cur->active_lock.id != reg->id) { + + if (release_lock_state(cur_func(env), REF_TYPE_LOCK, reg->id, ptr)) { verbose(env, "bpf_spin_unlock of different lock\n"); return -EINVAL; } invalidate_non_owning_refs(env); - - cur->active_lock.ptr = NULL; - cur->active_lock.id = 0; } return 0; } @@ -9861,7 +9926,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, const char *sub_name = subprog_name(env, subprog); /* Only global subprogs cannot be called with a lock held. 
*/ - if (env->cur_state->active_lock.ptr) { + if (cur_func(env)->active_locks) { verbose(env, "global function calls are not allowed while holding a lock,\n" "use static function instead\n"); return -EINVAL; @@ -10386,6 +10451,8 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi return 0; for (i = 0; i < state->acquired_refs; i++) { + if (state->refs[i].type != REF_TYPE_PTR) + continue; if (!exception_exit && state->in_callback_fn && state->refs[i].callback_ref != state->frameno) continue; verbose(env, "Unreleased reference id=%d alloc_insn=%d\n", @@ -10399,7 +10466,7 @@ static int check_resource_leak(struct bpf_verifier_env *env, bool exception_exit { int err; - if (check_lock && env->cur_state->active_lock.ptr) { + if (check_lock && cur_func(env)->active_locks) { verbose(env, "%s cannot be used inside bpf_spin_lock-ed region\n", prefix); return -EINVAL; } @@ -11620,10 +11687,9 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { - struct bpf_verifier_state *state = env->cur_state; struct btf_record *rec = reg_btf_record(reg); - if (!state->active_lock.ptr) { + if (!cur_func(env)->active_locks) { verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n"); return -EFAULT; } @@ -11720,6 +11786,7 @@ static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_o */ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { + struct bpf_reference_state *s; void *ptr; u32 id; @@ -11736,10 +11803,10 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_ } id = reg->id; - if (!env->cur_state->active_lock.ptr) + if (!cur_func(env)->active_locks) return -EINVAL; - if (env->cur_state->active_lock.ptr != ptr || - env->cur_state->active_lock.id != id) { + s = find_lock_state(env, REF_TYPE_LOCK, id, ptr); + if (!s) { verbose(env, "held lock and object are not in the same allocation\n"); return -EINVAL; } @@ -17635,8 +17702,22 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur, return false; for (i = 0; i < old->acquired_refs; i++) { - if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap)) + if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) || + old->refs[i].type != cur->refs[i].type) + return false; + switch (old->refs[i].type) { + case REF_TYPE_PTR: + if (old->refs[i].callback_ref != cur->refs[i].callback_ref) + return false; + break; + case REF_TYPE_LOCK: + if (old->refs[i].ptr != cur->refs[i].ptr) + return false; + break; + default: + WARN_ONCE(1, "Unhandled enum type for reference state: %d\n", old->refs[i].type); return false; + } } return true; @@ -17714,19 +17795,6 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->speculative && !cur->speculative) return false; - if (old->active_lock.ptr != cur->active_lock.ptr) - return false; - - /* Old and cur active_lock's have to be either both present - * or both absent. 
- */ - if (!!old->active_lock.id != !!cur->active_lock.id) - return false; - - if (old->active_lock.id && - !check_ids(old->active_lock.id, cur->active_lock.id, &env->idmap_scratch)) - return false; - if (old->active_rcu_lock != cur->active_rcu_lock) return false; @@ -18625,7 +18693,7 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - if (env->cur_state->active_lock.ptr) { + if (cur_func(env)->active_locks) { if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) || (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && (insn->off != 0 || !is_bpf_graph_api_kfunc(insn->imm)))) { -- cgit From ae6e3a273f590a2b64f14a9fab3546c3a8f44ed4 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 9 Nov 2024 15:14:30 -0800 Subject: bpf: Drop special callback reference handling Logic to prevent callbacks from acquiring new references for the program (i.e. leaving acquired references), and releasing caller references (i.e. those acquired in parent frames) was introduced in commit 9d9d00ac29d0 ("bpf: Fix reference state management for synchronous callbacks"). This was necessary because back then, the verifier simulated each callback once (that could potentially be executed N times, where N can be zero). This meant that callbacks that left lingering resources or cleared caller resources could do it more than once, operating on undefined state or leaking memory. With the fixes to callback verification in commit ab5cfac139ab ("bpf: verify callbacks as if they are called unknown number of times"), all of this extra logic is no longer necessary. Hence, drop it as part of this commit. Cc: Eduard Zingerman Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241109231430.2475236-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- kernel/bpf/verifier.c | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d55ca27dc031..9f5de8d4fbd0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1358,7 +1358,6 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx) state->refs[new_ofs].type = REF_TYPE_PTR; state->refs[new_ofs].id = id; state->refs[new_ofs].insn_idx = insn_idx; - state->refs[new_ofs].callback_ref = state->in_callback_fn ? state->frameno : 0; return id; } @@ -1392,9 +1391,6 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id) if (state->refs[i].type != REF_TYPE_PTR) continue; if (state->refs[i].id == ptr_id) { - /* Cannot release caller references in callbacks */ - if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno) - return -EINVAL; if (last_idx && i != last_idx) memcpy(&state->refs[i], &state->refs[last_idx], sizeof(*state->refs)); @@ -10267,17 +10263,10 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) caller->regs[BPF_REG_0] = *r0; } - /* callback_fn frame should have released its own additions to parent's - * reference state at this point, or check_reference_leak would - * complain, hence it must be the same as the caller. There is no need - * to copy it back. 
- */ - if (!callee->in_callback_fn) { - /* Transfer references to the caller */ - err = copy_reference_state(caller, callee); - if (err) - return err; - } + /* Transfer references to the caller */ + err = copy_reference_state(caller, callee); + if (err) + return err; /* for callbacks like bpf_loop or bpf_for_each_map_elem go back to callsite, * there function call logic would reschedule callback visit. If iteration @@ -10447,14 +10436,12 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi bool refs_lingering = false; int i; - if (!exception_exit && state->frameno && !state->in_callback_fn) + if (!exception_exit && state->frameno) return 0; for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].type != REF_TYPE_PTR) continue; - if (!exception_exit && state->in_callback_fn && state->refs[i].callback_ref != state->frameno) - continue; verbose(env, "Unreleased reference id=%d alloc_insn=%d\n", state->refs[i].id, state->refs[i].insn_idx); refs_lingering = true; @@ -17707,8 +17694,6 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur, return false; switch (old->refs[i].type) { case REF_TYPE_PTR: - if (old->refs[i].callback_ref != cur->refs[i].callback_ref) - return false; break; case REF_TYPE_LOCK: if (old->refs[i].ptr != cur->refs[i].ptr) -- cgit From a76ab5731e32d50ff5b1ae97e9dc4b23f41c23f5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 Nov 2024 08:39:07 -0800 Subject: bpf: Find eligible subprogs for private stack support Private stack will be allocated with the percpu allocator at jit time. To avoid complexity at runtime, only one copy of private stack is available per cpu per prog, so a runtime recursion check is necessary to avoid stack corruption. Current private stack only supports kprobe/perf_event/tp/raw_tp, which have recursion checks in the kernel, and prog types that use the bpf trampoline recursion check. For trampoline-related prog types, currently only tracing progs have recursion checking. To avoid complexity, all async_cb subprogs use the normal kernel stack, including those subprogs used by both the main prog subtree and the async_cb subtree. Any prog having a tail call also uses the kernel stack. To avoid a jit penalty with private stack support, a subprog stack size threshold is set such that private stack is used only if the stack size is no less than the threshold. The current threshold is 64 bytes, which avoids the jit penalty when the stack usage is small. A useless 'continue' is also removed from a loop in check_max_stack_depth().
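For illustration, here is a minimal stand-alone sketch of the eligibility rule described above. The enum values and the 64-byte threshold mirror the description; the struct and helper names are made up for the example and are not part of the patch.

#include <stdbool.h>
#include <stdio.h>

enum priv_stack_mode { PRIV_STACK_UNKNOWN, NO_PRIV_STACK, PRIV_STACK_ADAPTIVE };

#define PRIV_STACK_MIN_SIZE 64	/* the 64-byte threshold from the message */

struct subprog_model {		/* simplified stand-in for per-subprog info */
	int stack_depth;	/* stack usage in bytes */
	bool has_tail_call;
	bool is_async_cb;
};

static enum priv_stack_mode pick_priv_stack_mode(const struct subprog_model *sp,
						 bool jit_supports_priv_stack,
						 bool prog_has_recursion_check)
{
	/* no jit support or no runtime recursion protection: keep the normal stack */
	if (!jit_supports_priv_stack || !prog_has_recursion_check)
		return NO_PRIV_STACK;
	/* tail calls and async callbacks always stay on the kernel stack */
	if (sp->has_tail_call || sp->is_async_cb)
		return NO_PRIV_STACK;
	/* small frames are not worth the jit overhead */
	if (sp->stack_depth < PRIV_STACK_MIN_SIZE)
		return NO_PRIV_STACK;
	return PRIV_STACK_ADAPTIVE;
}

int main(void)
{
	struct subprog_model sp = { .stack_depth = 128 };

	/* prints 2 (PRIV_STACK_ADAPTIVE) for a 128-byte frame */
	printf("%d\n", pick_priv_stack_mode(&sp, true, true));
	return 0;
}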
Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20241112163907.2223839-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 5 +++ kernel/bpf/verifier.c | 96 +++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 91 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 233ea78f8f1b..14d9288441f2 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3045,6 +3045,11 @@ bool __weak bpf_jit_supports_exceptions(void) return false; } +bool __weak bpf_jit_supports_private_stack(void) +{ + return false; +} + void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie) { } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9f5de8d4fbd0..fb23793ac53d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -194,6 +194,8 @@ struct bpf_verifier_stack_elem { #define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512 +#define BPF_PRIV_STACK_MIN_SIZE 64 + static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx); static int release_reference(struct bpf_verifier_env *env, int ref_obj_id); static void invalidate_non_owning_refs(struct bpf_verifier_env *env); @@ -6090,6 +6092,34 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, strict); } +static enum priv_stack_mode bpf_enable_priv_stack(struct bpf_prog *prog) +{ + if (!bpf_jit_supports_private_stack()) + return NO_PRIV_STACK; + + /* bpf_prog_check_recur() checks all prog types that use bpf trampoline + * while kprobe/tp/perf_event/raw_tp don't use trampoline hence checked + * explicitly. + */ + switch (prog->type) { + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_RAW_TRACEPOINT: + return PRIV_STACK_ADAPTIVE; + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_STRUCT_OPS: + if (bpf_prog_check_recur(prog)) + return PRIV_STACK_ADAPTIVE; + fallthrough; + default: + break; + } + + return NO_PRIV_STACK; +} + static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth) { if (env->prog->jit_requested) @@ -6107,17 +6137,20 @@ static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth) * Since recursion is prevented by check_cfg() this algorithm * only needs a local stack of MAX_CALL_FRAMES to remember callsites */ -static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx) +static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx, + bool priv_stack_supported) { struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; - int depth = 0, frame = 0, i, subprog_end; + int depth = 0, frame = 0, i, subprog_end, subprog_depth; bool tail_call_reachable = false; int ret_insn[MAX_CALL_FRAMES]; int ret_prog[MAX_CALL_FRAMES]; int j; i = subprog[idx].start; + if (!priv_stack_supported) + subprog[idx].priv_stack_mode = NO_PRIV_STACK; process_func: /* protect against potential stack overflow that might happen when * bpf2bpf calls get combined with tailcalls. Limit the caller's stack @@ -6144,11 +6177,31 @@ process_func: depth); return -EACCES; } - depth += round_up_stack_depth(env, subprog[idx].stack_depth); - if (depth > MAX_BPF_STACK) { - verbose(env, "combined stack size of %d calls is %d. 
Too large\n", - frame + 1, depth); - return -EACCES; + + subprog_depth = round_up_stack_depth(env, subprog[idx].stack_depth); + if (priv_stack_supported) { + /* Request private stack support only if the subprog stack + * depth is no less than BPF_PRIV_STACK_MIN_SIZE. This is to + * avoid jit penalty if the stack usage is small. + */ + if (subprog[idx].priv_stack_mode == PRIV_STACK_UNKNOWN && + subprog_depth >= BPF_PRIV_STACK_MIN_SIZE) + subprog[idx].priv_stack_mode = PRIV_STACK_ADAPTIVE; + } + + if (subprog[idx].priv_stack_mode == PRIV_STACK_ADAPTIVE) { + if (subprog_depth > MAX_BPF_STACK) { + verbose(env, "stack size of subprog %d is %d. Too large\n", + idx, subprog_depth); + return -EACCES; + } + } else { + depth += subprog_depth; + if (depth > MAX_BPF_STACK) { + verbose(env, "combined stack size of %d calls is %d. Too large\n", + frame + 1, depth); + return -EACCES; + } } continue_func: subprog_end = subprog[idx + 1].start; @@ -6205,6 +6258,8 @@ continue_func: } i = next_insn; idx = sidx; + if (!priv_stack_supported) + subprog[idx].priv_stack_mode = NO_PRIV_STACK; if (subprog[idx].has_tail_call) tail_call_reachable = true; @@ -6238,7 +6293,8 @@ continue_func: */ if (frame == 0) return 0; - depth -= round_up_stack_depth(env, subprog[idx].stack_depth); + if (subprog[idx].priv_stack_mode != PRIV_STACK_ADAPTIVE) + depth -= round_up_stack_depth(env, subprog[idx].stack_depth); frame--; i = ret_insn[frame]; idx = ret_prog[frame]; @@ -6247,16 +6303,36 @@ continue_func: static int check_max_stack_depth(struct bpf_verifier_env *env) { + enum priv_stack_mode priv_stack_mode = PRIV_STACK_UNKNOWN; struct bpf_subprog_info *si = env->subprog_info; + bool priv_stack_supported; int ret; for (int i = 0; i < env->subprog_cnt; i++) { + if (si[i].has_tail_call) { + priv_stack_mode = NO_PRIV_STACK; + break; + } + } + + if (priv_stack_mode == PRIV_STACK_UNKNOWN) + priv_stack_mode = bpf_enable_priv_stack(env->prog); + + /* All async_cb subprogs use normal kernel stack. If a particular + * subprog appears in both main prog and async_cb subtree, that + * subprog will use normal kernel stack to avoid potential nesting. + * The reverse subprog traversal ensures when main prog subtree is + * checked, the subprogs appearing in async_cb subtrees are already + * marked as using normal kernel stack, so stack size checking can + * be done properly. + */ + for (int i = env->subprog_cnt - 1; i >= 0; i--) { if (!i || si[i].is_async_cb) { - ret = check_max_stack_depth_subprog(env, i); + priv_stack_supported = !i && priv_stack_mode == PRIV_STACK_ADAPTIVE; + ret = check_max_stack_depth_subprog(env, i, priv_stack_supported); if (ret < 0) return ret; } - continue; } return 0; } -- cgit From e00931c02568dc6ac76f94b1ab471de05e6fdfe8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 Nov 2024 08:39:12 -0800 Subject: bpf: Enable private stack for eligible subprogs If private stack is used by any subprog, set that subprog prog->aux->jits_use_priv_stack to be true so later jit can allocate private stack for that subprog properly. Also set env->prog->aux->jits_use_priv_stack to be true if any subprog uses private stack. This is a use case for a single main prog (no subprogs) to use private stack, and also a use case for later struct-ops progs where env->prog->aux->jits_use_priv_stack will enable recursion check if any subprog uses private stack. 
Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20241112163912.2224007-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fb23793ac53d..176d19ad9d07 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6334,6 +6334,14 @@ static int check_max_stack_depth(struct bpf_verifier_env *env) return ret; } } + + for (int i = 0; i < env->subprog_cnt; i++) { + if (si[i].priv_stack_mode == PRIV_STACK_ADAPTIVE) { + env->prog->aux->jits_use_priv_stack = true; + break; + } + } + return 0; } @@ -20274,6 +20282,9 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->name[0] = 'F'; func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; + if (env->subprog_info[i].priv_stack_mode == PRIV_STACK_ADAPTIVE) + func[i]->aux->jits_use_priv_stack = true; + func[i]->jit_requested = 1; func[i]->blinding_requested = prog->blinding_requested; func[i]->aux->kfunc_tab = prog->aux->kfunc_tab; -- cgit From 5bd36da1e37e7a78e8b38efd287de6e1394b7d6e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 Nov 2024 08:39:33 -0800 Subject: bpf: Support private stack for struct_ops progs For struct_ops progs, whether a particular prog uses private stack depends on the prog->aux->priv_stack_requested setting before the actual insn-level verification of that prog. One possible implementation is to piggyback on struct_ops->check_member(); the next patch has an example of this. The struct_ops->check_member() sets prog->aux->priv_stack_requested to true, which enables private stack usage. The struct_ops prog then follows the same rule as kprobe/tracing progs after bpf_enable_priv_stack(). For example, even if a struct_ops prog requests private stack, it may still use the normal kernel stack if the stack size is small (< 64 bytes). Similar to tracing progs, a nested run of the same prog on the same cpu will be skipped. A field (recursion_detected()) is added to the bpf_prog_aux structure. If bpf_prog->aux->recursion_detected is implemented by the struct_ops subsystem and such a nested same-cpu/same-prog run happens, the function will be triggered to report an error, collect related info, etc.
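A minimal user-space model of the recursion guard described above; a plain counter stands in for the per-cpu prog->active counter, and the callback signature is an assumption made for the example.

#include <stdbool.h>
#include <stdio.h>

struct prog_model {
	int active;	/* stands in for the per-cpu prog->active counter */
	long misses;
	void (*recursion_detected)(struct prog_model *p);	/* optional, like prog->aux->recursion_detected */
};

/* Returns false when a nested same-prog run is detected and must be skipped. */
static bool prog_enter(struct prog_model *p)
{
	if (++p->active != 1) {
		p->misses++;
		if (p->recursion_detected)
			p->recursion_detected(p);	/* subsystem reports the error, collects info */
		return false;
	}
	return true;
}

/* Always called, even for a skipped run, to balance the counter. */
static void prog_exit(struct prog_model *p)
{
	p->active--;
}

static void report(struct prog_model *p)
{
	printf("recursion detected, misses=%ld\n", p->misses);
}

int main(void)
{
	struct prog_model p = { .recursion_detected = report };

	if (prog_enter(&p)) {		/* outer run proceeds */
		prog_enter(&p);		/* nested run on the same "cpu" is skipped and reported */
		prog_exit(&p);
		prog_exit(&p);
	}
	return 0;
}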
Acked-by: Tejun Heo Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20241112163933.2224962-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/trampoline.c | 4 ++++ kernel/bpf/verifier.c | 7 ++++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 9f36c049f4c2..a8d188b31da5 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -899,6 +899,8 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); + if (prog->aux->recursion_detected) + prog->aux->recursion_detected(prog); return 0; } return bpf_prog_start_time(); @@ -975,6 +977,8 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); + if (prog->aux->recursion_detected) + prog->aux->recursion_detected(prog); return 0; } return bpf_prog_start_time(); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 176d19ad9d07..f4c39bb50511 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6110,7 +6110,7 @@ static enum priv_stack_mode bpf_enable_priv_stack(struct bpf_prog *prog) case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_STRUCT_OPS: - if (bpf_prog_check_recur(prog)) + if (prog->aux->priv_stack_requested || bpf_prog_check_recur(prog)) return PRIV_STACK_ADAPTIVE; fallthrough; default: @@ -22053,6 +22053,11 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) } } + if (prog->aux->priv_stack_requested && !bpf_jit_supports_private_stack()) { + verbose(env, "Private stack not supported by jit\n"); + return -EACCES; + } + /* btf_ctx_access() used this to provide argument type info */ prog->aux->ctx_arg_info = st_ops_desc->arg_info[member_idx].info; -- cgit From bd9d9b48eb1814ad761cce45774a18d95c33803c Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Tue, 12 Nov 2024 22:58:47 +0800 Subject: bpf: Remove unused member rcu from bpf_struct_ops_map The rcu member in bpf_struct_ops_map is not used after commit b671c2067a04 ("bpf: Retire the struct_ops map kvalue->refcnt.") Remove it. Suggested-by: Martin KaFai Lau Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20241112145849.3436772-2-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_struct_ops.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index fda3dd2ee984..40a93e690473 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -23,7 +23,6 @@ struct bpf_struct_ops_value { struct bpf_struct_ops_map { struct bpf_map map; - struct rcu_head rcu; const struct bpf_struct_ops_desc *st_ops_desc; /* protect map_update */ struct mutex lock; -- cgit From 821a3fa32bbe3bc0fa23b3189325d3720a49a24c Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Tue, 12 Nov 2024 22:58:48 +0800 Subject: bpf: Use function pointers count as struct_ops links count Only function pointers in a struct_ops structure can be linked to bpf progs, so set the links count to the function pointers count, instead of the total members count in the structure. 
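For illustration, a small stand-alone sketch of the counting idea; struct member_desc is a made-up stand-in for BTF member information, not a kernel structure.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical member descriptors standing in for BTF member information. */
struct member_desc {
	const char *name;
	bool is_func_ptr;
};

static unsigned int count_func_ptrs_model(const struct member_desc *m, unsigned int vlen)
{
	unsigned int i, cnt = 0;

	for (i = 0; i < vlen; i++)
		if (m[i].is_func_ptr)
			cnt++;
	return cnt;
}

int main(void)
{
	/* a struct_ops-like type mixing plain data members with callbacks */
	const struct member_desc members[] = {
		{ "flags", false }, { "name", false },
		{ "ssthresh", true }, { "cong_avoid", true }, { "cong_control", true },
	};
	unsigned int vlen = sizeof(members) / sizeof(members[0]);

	/* only the function pointers need link slots: 3, not vlen (5) */
	printf("links needed: %u of %u members\n",
	       count_func_ptrs_model(members, vlen), vlen);
	return 0;
}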
Suggested-by: Martin KaFai Lau Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20241112145849.3436772-3-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_struct_ops.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 40a93e690473..ff94c8120ebb 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -31,7 +31,7 @@ struct bpf_struct_ops_map { * (in kvalue.data). */ struct bpf_link **links; - u32 links_cnt; + u32 funcs_cnt; u32 image_pages_cnt; /* image_pages is an array of pages that has all the trampolines * that stores the func args before calling the bpf_prog. @@ -480,11 +480,11 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) { u32 i; - for (i = 0; i < st_map->links_cnt; i++) { - if (st_map->links[i]) { - bpf_link_put(st_map->links[i]); - st_map->links[i] = NULL; - } + for (i = 0; i < st_map->funcs_cnt; i++) { + if (!st_map->links[i]) + break; + bpf_link_put(st_map->links[i]); + st_map->links[i] = NULL; } } @@ -600,6 +600,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, int prog_fd, err; u32 i, trampoline_start, image_off = 0; void *cur_image = NULL, *image = NULL; + struct bpf_link **plink; if (flags) return -EINVAL; @@ -638,6 +639,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, udata = &uvalue->data; kdata = &kvalue->data; + plink = st_map->links; module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]); for_each_member(i, t, member) { const struct btf_type *mtype, *ptype; @@ -713,7 +715,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, } bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops, prog); - st_map->links[i] = &link->link; + *plink++ = &link->link; trampoline_start = image_off; err = bpf_struct_ops_prepare_trampoline(tlinks, link, @@ -894,6 +896,19 @@ static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr) return 0; } +static u32 count_func_ptrs(const struct btf *btf, const struct btf_type *t) +{ + int i; + u32 count; + const struct btf_member *member; + + count = 0; + for_each_member(i, t, member) + if (btf_type_resolve_func_ptr(btf, member->type, NULL)) + count++; + return count; +} + static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) { const struct bpf_struct_ops_desc *st_ops_desc; @@ -960,9 +975,9 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) map = &st_map->map; st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); - st_map->links_cnt = btf_type_vlen(t); + st_map->funcs_cnt = count_func_ptrs(btf, t); st_map->links = - bpf_map_area_alloc(st_map->links_cnt * sizeof(struct bpf_links *), + bpf_map_area_alloc(st_map->funcs_cnt * sizeof(struct bpf_link *), NUMA_NO_NODE); if (!st_map->uvalue || !st_map->links) { ret = -ENOMEM; @@ -993,7 +1008,7 @@ static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map) usage = sizeof(*st_map) + vt->size - sizeof(struct bpf_struct_ops_value); usage += vt->size; - usage += btf_type_vlen(vt) * sizeof(struct bpf_links *); + usage += st_map->funcs_cnt * sizeof(struct bpf_link *); usage += PAGE_SIZE; return usage; } -- cgit From 7c8ce4ffb684676039b1ff9ff81c126794e8d88e Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Tue, 12 Nov 2024 22:58:49 +0800 Subject: bpf: Add kernel symbol for struct_ops trampoline Without kernel symbols 
for struct_ops trampoline, the unwinder may produce unexpected stacktraces. For example, the x86 ORC and FP unwinders check if an IP is in kernel text by verifying the presence of the IP's kernel symbol. When a struct_ops trampoline address is encountered, the unwinder stops due to the absence of symbol, resulting in an incomplete stacktrace that consists only of direct and indirect child functions called from the trampoline. The arm64 unwinder is another example. While the arm64 unwinder can proceed across a struct_ops trampoline address, the corresponding symbol name is displayed as "unknown", which is confusing. Thus, add kernel symbol for struct_ops trampoline. The name is bpf___, where is the type name of the struct_ops, and is the name of the member that the trampoline is linked to. Below is a comparison of stacktraces captured on x86 by perf record, before and after this patch. Before: ffffffff8116545d __lock_acquire+0xad ([kernel.kallsyms]) ffffffff81167fcc lock_acquire+0xcc ([kernel.kallsyms]) ffffffff813088f4 __bpf_prog_enter+0x34 ([kernel.kallsyms]) After: ffffffff811656bd __lock_acquire+0x30d ([kernel.kallsyms]) ffffffff81167fcc lock_acquire+0xcc ([kernel.kallsyms]) ffffffff81309024 __bpf_prog_enter+0x34 ([kernel.kallsyms]) ffffffffc000d7e9 bpf__tcp_congestion_ops_cong_avoid+0x3e ([kernel.kallsyms]) ffffffff81f250a5 tcp_ack+0x10d5 ([kernel.kallsyms]) ffffffff81f27c66 tcp_rcv_established+0x3b6 ([kernel.kallsyms]) ffffffff81f3ad03 tcp_v4_do_rcv+0x193 ([kernel.kallsyms]) ffffffff81d65a18 __release_sock+0xd8 ([kernel.kallsyms]) ffffffff81d65af4 release_sock+0x34 ([kernel.kallsyms]) ffffffff81f15c4b tcp_sendmsg+0x3b ([kernel.kallsyms]) ffffffff81f663d7 inet_sendmsg+0x47 ([kernel.kallsyms]) ffffffff81d5ab40 sock_write_iter+0x160 ([kernel.kallsyms]) ffffffff8149c67b vfs_write+0x3fb ([kernel.kallsyms]) ffffffff8149caf6 ksys_write+0xc6 ([kernel.kallsyms]) ffffffff8149cb5d __x64_sys_write+0x1d ([kernel.kallsyms]) ffffffff81009200 x64_sys_call+0x1d30 ([kernel.kallsyms]) ffffffff82232d28 do_syscall_64+0x68 ([kernel.kallsyms]) ffffffff8240012f entry_SYSCALL_64_after_hwframe+0x76 ([kernel.kallsyms]) Fixes: 85d33df357b6 ("bpf: Introduce BPF_MAP_TYPE_STRUCT_OPS") Signed-off-by: Xu Kuohai Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20241112145849.3436772-4-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_struct_ops.c | 79 ++++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/dispatcher.c | 3 +- kernel/bpf/trampoline.c | 9 ++++-- 3 files changed, 87 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index ff94c8120ebb..606efe32485a 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -31,6 +31,8 @@ struct bpf_struct_ops_map { * (in kvalue.data). 
*/ struct bpf_link **links; + /* ksyms for bpf trampolines */ + struct bpf_ksym **ksyms; u32 funcs_cnt; u32 image_pages_cnt; /* image_pages is an array of pages that has all the trampolines @@ -585,6 +587,49 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, return 0; } +static void bpf_struct_ops_ksym_init(const char *tname, const char *mname, + void *image, unsigned int size, + struct bpf_ksym *ksym) +{ + snprintf(ksym->name, KSYM_NAME_LEN, "bpf__%s_%s", tname, mname); + INIT_LIST_HEAD_RCU(&ksym->lnode); + bpf_image_ksym_init(image, size, ksym); +} + +static void bpf_struct_ops_map_add_ksyms(struct bpf_struct_ops_map *st_map) +{ + u32 i; + + for (i = 0; i < st_map->funcs_cnt; i++) { + if (!st_map->ksyms[i]) + break; + bpf_image_ksym_add(st_map->ksyms[i]); + } +} + +static void bpf_struct_ops_map_del_ksyms(struct bpf_struct_ops_map *st_map) +{ + u32 i; + + for (i = 0; i < st_map->funcs_cnt; i++) { + if (!st_map->ksyms[i]) + break; + bpf_image_ksym_del(st_map->ksyms[i]); + } +} + +static void bpf_struct_ops_map_free_ksyms(struct bpf_struct_ops_map *st_map) +{ + u32 i; + + for (i = 0; i < st_map->funcs_cnt; i++) { + if (!st_map->ksyms[i]) + break; + kfree(st_map->ksyms[i]); + st_map->ksyms[i] = NULL; + } +} + static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { @@ -601,6 +646,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, u32 i, trampoline_start, image_off = 0; void *cur_image = NULL, *image = NULL; struct bpf_link **plink; + struct bpf_ksym **pksym; + const char *tname, *mname; if (flags) return -EINVAL; @@ -640,14 +687,18 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, kdata = &kvalue->data; plink = st_map->links; + pksym = st_map->ksyms; + tname = btf_name_by_offset(st_map->btf, t->name_off); module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]); for_each_member(i, t, member) { const struct btf_type *mtype, *ptype; struct bpf_prog *prog; struct bpf_tramp_link *link; + struct bpf_ksym *ksym; u32 moff; moff = __btf_member_bit_offset(t, member) / 8; + mname = btf_name_by_offset(st_map->btf, member->name_off); ptype = btf_type_resolve_ptr(st_map->btf, member->type, NULL); if (ptype == module_type) { if (*(void **)(udata + moff)) @@ -717,6 +768,13 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, &bpf_struct_ops_link_lops, prog); *plink++ = &link->link; + ksym = kzalloc(sizeof(*ksym), GFP_USER); + if (!ksym) { + err = -ENOMEM; + goto reset_unlock; + } + *pksym++ = ksym; + trampoline_start = image_off; err = bpf_struct_ops_prepare_trampoline(tlinks, link, &st_ops->func_models[i], @@ -736,6 +794,12 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, /* put prog_id to udata */ *(unsigned long *)(udata + moff) = prog->aux->id; + + /* init ksym for this trampoline */ + bpf_struct_ops_ksym_init(tname, mname, + image + trampoline_start, + image_off - trampoline_start, + ksym); } if (st_ops->validate) { @@ -784,6 +848,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, */ reset_unlock: + bpf_struct_ops_map_free_ksyms(st_map); bpf_struct_ops_map_free_image(st_map); bpf_struct_ops_map_put_progs(st_map); memset(uvalue, 0, map->value_size); @@ -791,6 +856,8 @@ reset_unlock: unlock: kfree(tlinks); mutex_unlock(&st_map->lock); + if (!err) + bpf_struct_ops_map_add_ksyms(st_map); return err; } @@ -850,7 +917,10 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map) if 
(st_map->links) bpf_struct_ops_map_put_progs(st_map); + if (st_map->ksyms) + bpf_struct_ops_map_free_ksyms(st_map); bpf_map_area_free(st_map->links); + bpf_map_area_free(st_map->ksyms); bpf_struct_ops_map_free_image(st_map); bpf_map_area_free(st_map->uvalue); bpf_map_area_free(st_map); @@ -867,6 +937,8 @@ static void bpf_struct_ops_map_free(struct bpf_map *map) if (btf_is_module(st_map->btf)) module_put(st_map->st_ops_desc->st_ops->owner); + bpf_struct_ops_map_del_ksyms(st_map); + /* The struct_ops's function may switch to another struct_ops. * * For example, bpf_tcp_cc_x->init() may switch to @@ -979,7 +1051,11 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) st_map->links = bpf_map_area_alloc(st_map->funcs_cnt * sizeof(struct bpf_link *), NUMA_NO_NODE); - if (!st_map->uvalue || !st_map->links) { + + st_map->ksyms = + bpf_map_area_alloc(st_map->funcs_cnt * sizeof(struct bpf_ksym *), + NUMA_NO_NODE); + if (!st_map->uvalue || !st_map->links || !st_map->ksyms) { ret = -ENOMEM; goto errout_free; } @@ -1009,6 +1085,7 @@ static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map) vt->size - sizeof(struct bpf_struct_ops_value); usage += vt->size; usage += st_map->funcs_cnt * sizeof(struct bpf_link *); + usage += st_map->funcs_cnt * sizeof(struct bpf_ksym *); usage += PAGE_SIZE; return usage; } diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index 70fb82bf1637..b77db7413f8c 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -154,7 +154,8 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, d->image = NULL; goto out; } - bpf_image_ksym_add(d->image, PAGE_SIZE, &d->ksym); + bpf_image_ksym_init(d->image, PAGE_SIZE, &d->ksym); + bpf_image_ksym_add(&d->ksym); } prev_num_progs = d->num_progs; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index a8d188b31da5..c4b1a98ff726 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -115,10 +115,14 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog) (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); } -void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym) +void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym) { ksym->start = (unsigned long) data; ksym->end = ksym->start + size; +} + +void bpf_image_ksym_add(struct bpf_ksym *ksym) +{ bpf_ksym_add(ksym); perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, PAGE_SIZE, false, ksym->name); @@ -377,7 +381,8 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, int size) ksym = &im->ksym; INIT_LIST_HEAD_RCU(&ksym->lnode); snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key); - bpf_image_ksym_add(image, size, ksym); + bpf_image_ksym_init(image, size, ksym); + bpf_image_ksym_add(ksym); return im; out_free_image: -- cgit From b795379757eb054925fbb6783559c86f01c1a614 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Nov 2024 18:56:15 -0800 Subject: bpf: Introduce range_tree data structure and use it in bpf arena Introduce range_tree data structure and use it in bpf arena to track ranges of allocated pages. range_tree is a large bitmap that is implemented as interval tree plus rbtree. The contiguous sequence of bits represents unallocated pages. 
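A sketch of how the allocate/free cycle described above maps onto the range_tree API from this patch. It is kernel-internal code shown only to illustrate the calling pattern; the demo_* helpers are hypothetical and error handling is trimmed.

#include <linux/types.h>	/* u32, s64: kernel-internal sketch */
#include "range_tree.h"

/* Everything starts out free: one big set range covering the whole arena. */
static void demo_init(struct range_tree *rt, u32 max_entries)
{
	range_tree_init(rt);
	range_tree_set(rt, 0, max_entries);
}

/* Allocate page_cnt contiguous slots: find a best-fit free run, then clear it. */
static s64 demo_alloc_pages(struct range_tree *rt, u32 page_cnt)
{
	s64 pgoff;
	int err;

	pgoff = range_tree_find(rt, page_cnt);
	if (pgoff < 0)
		return pgoff;			/* -ENOENT: no run large enough */
	err = range_tree_clear(rt, pgoff, page_cnt);
	if (err)
		return err;
	return pgoff;
}

/* Free the slots again; adjacent free ranges are merged by range_tree_set(). */
static void demo_free_pages(struct range_tree *rt, u32 pgoff, u32 page_cnt)
{
	range_tree_set(rt, pgoff, page_cnt);
}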
Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20241108025616.17625-2-alexei.starovoitov@gmail.com --- kernel/bpf/Makefile | 2 +- kernel/bpf/arena.c | 34 ++++--- kernel/bpf/range_tree.c | 262 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/range_tree.h | 21 ++++ 4 files changed, 304 insertions(+), 15 deletions(-) create mode 100644 kernel/bpf/range_tree.c create mode 100644 kernel/bpf/range_tree.h (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 105328f0b9c0..9762bdddf1de 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -16,7 +16,7 @@ obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy) -obj-$(CONFIG_BPF_SYSCALL) += arena.o +obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o endif obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index e52b3ad231b9..3e1dfe349ced 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -6,6 +6,7 @@ #include #include #include +#include "range_tree.h" /* * bpf_arena is a sparsely populated shared memory region between bpf program and @@ -45,7 +46,7 @@ struct bpf_arena { u64 user_vm_start; u64 user_vm_end; struct vm_struct *kern_vm; - struct maple_tree mt; + struct range_tree rt; struct list_head vma_list; struct mutex lock; }; @@ -132,7 +133,8 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) INIT_LIST_HEAD(&arena->vma_list); bpf_map_init_from_attr(&arena->map, attr); - mt_init_flags(&arena->mt, MT_FLAGS_ALLOC_RANGE); + range_tree_init(&arena->rt); + range_tree_set(&arena->rt, 0, attr->max_entries); mutex_init(&arena->lock); return &arena->map; @@ -183,7 +185,7 @@ static void arena_map_free(struct bpf_map *map) apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena), KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL); free_vm_area(arena->kern_vm); - mtree_destroy(&arena->mt); + range_tree_destroy(&arena->rt); bpf_map_area_free(arena); } @@ -274,20 +276,20 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) /* User space requested to segfault when page is not allocated by bpf prog */ return VM_FAULT_SIGSEGV; - ret = mtree_insert(&arena->mt, vmf->pgoff, MT_ENTRY, GFP_KERNEL); + ret = range_tree_clear(&arena->rt, vmf->pgoff, 1); if (ret) return VM_FAULT_SIGSEGV; /* Account into memcg of the process that created bpf_arena */ ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page); if (ret) { - mtree_erase(&arena->mt, vmf->pgoff); + range_tree_set(&arena->rt, vmf->pgoff, 1); return VM_FAULT_SIGSEGV; } ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page); if (ret) { - mtree_erase(&arena->mt, vmf->pgoff); + range_tree_set(&arena->rt, vmf->pgoff, 1); __free_page(page); return VM_FAULT_SIGSEGV; } @@ -444,12 +446,16 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt guard(mutex)(&arena->lock); - if (uaddr) - ret = mtree_insert_range(&arena->mt, pgoff, pgoff + page_cnt - 1, - MT_ENTRY, GFP_KERNEL); - else - ret = mtree_alloc_range(&arena->mt, &pgoff, MT_ENTRY, - page_cnt, 0, page_cnt_max - 1, GFP_KERNEL); + if (uaddr) { + ret = is_range_tree_set(&arena->rt, pgoff, page_cnt); + if (ret) + goto out_free_pages; + ret = range_tree_clear(&arena->rt, pgoff, page_cnt); + } else { + ret = pgoff = range_tree_find(&arena->rt, page_cnt); + if (pgoff 
>= 0) + ret = range_tree_clear(&arena->rt, pgoff, page_cnt); + } if (ret) goto out_free_pages; @@ -476,7 +482,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt kvfree(pages); return clear_lo32(arena->user_vm_start) + uaddr32; out: - mtree_erase(&arena->mt, pgoff); + range_tree_set(&arena->rt, pgoff, page_cnt); out_free_pages: kvfree(pages); return 0; @@ -516,7 +522,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) pgoff = compute_pgoff(arena, uaddr); /* clear range */ - mtree_store_range(&arena->mt, pgoff, pgoff + page_cnt - 1, NULL, GFP_KERNEL); + range_tree_set(&arena->rt, pgoff, page_cnt); if (page_cnt > 1) /* bulk zap if multiple pages being freed */ diff --git a/kernel/bpf/range_tree.c b/kernel/bpf/range_tree.c new file mode 100644 index 000000000000..f7915ab0a6d3 --- /dev/null +++ b/kernel/bpf/range_tree.c @@ -0,0 +1,262 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include +#include +#include "range_tree.h" + +/* + * struct range_tree is a data structure used to allocate contiguous memory + * ranges in bpf arena. It's a large bitmap. The contiguous sequence of bits is + * represented by struct range_node or 'rn' for short. + * rn->rn_rbnode links it into an interval tree while + * rn->rb_range_size links it into a second rbtree sorted by size of the range. + * __find_range() performs binary search and best fit algorithm to find the + * range less or equal requested size. + * range_tree_clear/set() clears or sets a range of bits in this bitmap. The + * adjacent ranges are merged or split at the same time. + * + * The split/merge logic is based/borrowed from XFS's xbitmap32 added + * in commit 6772fcc8890a ("xfs: convert xbitmap to interval tree"). + * + * The implementation relies on external lock to protect rbtree-s. + * The alloc/free of range_node-s is done via bpf_mem_alloc. + * + * bpf arena is using range_tree to represent unallocated slots. + * At init time: + * range_tree_set(rt, 0, max); + * Then: + * start = range_tree_find(rt, len); + * if (start >= 0) + * range_tree_clear(rt, start, len); + * to find free range and mark slots as allocated and later: + * range_tree_set(rt, start, len); + * to mark as unallocated after use. 
+ */ +struct range_node { + struct rb_node rn_rbnode; + struct rb_node rb_range_size; + u32 rn_start; + u32 rn_last; /* inclusive */ + u32 __rn_subtree_last; +}; + +static struct range_node *rb_to_range_node(struct rb_node *rb) +{ + return rb_entry(rb, struct range_node, rb_range_size); +} + +static u32 rn_size(struct range_node *rn) +{ + return rn->rn_last - rn->rn_start + 1; +} + +/* Find range that fits best to requested size */ +static inline struct range_node *__find_range(struct range_tree *rt, u32 len) +{ + struct rb_node *rb = rt->range_size_root.rb_root.rb_node; + struct range_node *best = NULL; + + while (rb) { + struct range_node *rn = rb_to_range_node(rb); + + if (len <= rn_size(rn)) { + best = rn; + rb = rb->rb_right; + } else { + rb = rb->rb_left; + } + } + + return best; +} + +s64 range_tree_find(struct range_tree *rt, u32 len) +{ + struct range_node *rn; + + rn = __find_range(rt, len); + if (!rn) + return -ENOENT; + return rn->rn_start; +} + +/* Insert the range into rbtree sorted by the range size */ +static inline void __range_size_insert(struct range_node *rn, + struct rb_root_cached *root) +{ + struct rb_node **link = &root->rb_root.rb_node, *rb = NULL; + u64 size = rn_size(rn); + bool leftmost = true; + + while (*link) { + rb = *link; + if (size > rn_size(rb_to_range_node(rb))) { + link = &rb->rb_left; + } else { + link = &rb->rb_right; + leftmost = false; + } + } + + rb_link_node(&rn->rb_range_size, rb, link); + rb_insert_color_cached(&rn->rb_range_size, root, leftmost); +} + +#define START(node) ((node)->rn_start) +#define LAST(node) ((node)->rn_last) + +INTERVAL_TREE_DEFINE(struct range_node, rn_rbnode, u32, + __rn_subtree_last, START, LAST, + static inline __maybe_unused, + __range_it) + +static inline __maybe_unused void +range_it_insert(struct range_node *rn, struct range_tree *rt) +{ + __range_size_insert(rn, &rt->range_size_root); + __range_it_insert(rn, &rt->it_root); +} + +static inline __maybe_unused void +range_it_remove(struct range_node *rn, struct range_tree *rt) +{ + rb_erase_cached(&rn->rb_range_size, &rt->range_size_root); + RB_CLEAR_NODE(&rn->rb_range_size); + __range_it_remove(rn, &rt->it_root); +} + +static inline __maybe_unused struct range_node * +range_it_iter_first(struct range_tree *rt, u32 start, u32 last) +{ + return __range_it_iter_first(&rt->it_root, start, last); +} + +/* Clear the range in this range tree */ +int range_tree_clear(struct range_tree *rt, u32 start, u32 len) +{ + u32 last = start + len - 1; + struct range_node *new_rn; + struct range_node *rn; + + while ((rn = range_it_iter_first(rt, start, last))) { + if (rn->rn_start < start && rn->rn_last > last) { + u32 old_last = rn->rn_last; + + /* Overlaps with the entire clearing range */ + range_it_remove(rn, rt); + rn->rn_last = start - 1; + range_it_insert(rn, rt); + + /* Add a range */ + new_rn = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node)); + if (!new_rn) + return -ENOMEM; + new_rn->rn_start = last + 1; + new_rn->rn_last = old_last; + range_it_insert(new_rn, rt); + } else if (rn->rn_start < start) { + /* Overlaps with the left side of the clearing range */ + range_it_remove(rn, rt); + rn->rn_last = start - 1; + range_it_insert(rn, rt); + } else if (rn->rn_last > last) { + /* Overlaps with the right side of the clearing range */ + range_it_remove(rn, rt); + rn->rn_start = last + 1; + range_it_insert(rn, rt); + break; + } else { + /* in the middle of the clearing range */ + range_it_remove(rn, rt); + bpf_mem_free(&bpf_global_ma, rn); + } + } + return 0; +} + +/* Is 
the whole range set ? */ +int is_range_tree_set(struct range_tree *rt, u32 start, u32 len) +{ + u32 last = start + len - 1; + struct range_node *left; + + /* Is this whole range set ? */ + left = range_it_iter_first(rt, start, last); + if (left && left->rn_start <= start && left->rn_last >= last) + return 0; + return -ESRCH; +} + +/* Set the range in this range tree */ +int range_tree_set(struct range_tree *rt, u32 start, u32 len) +{ + u32 last = start + len - 1; + struct range_node *right; + struct range_node *left; + int err; + + /* Is this whole range already set ? */ + left = range_it_iter_first(rt, start, last); + if (left && left->rn_start <= start && left->rn_last >= last) + return 0; + + /* Clear out everything in the range we want to set. */ + err = range_tree_clear(rt, start, len); + if (err) + return err; + + /* Do we have a left-adjacent range ? */ + left = range_it_iter_first(rt, start - 1, start - 1); + if (left && left->rn_last + 1 != start) + return -EFAULT; + + /* Do we have a right-adjacent range ? */ + right = range_it_iter_first(rt, last + 1, last + 1); + if (right && right->rn_start != last + 1) + return -EFAULT; + + if (left && right) { + /* Combine left and right adjacent ranges */ + range_it_remove(left, rt); + range_it_remove(right, rt); + left->rn_last = right->rn_last; + range_it_insert(left, rt); + bpf_mem_free(&bpf_global_ma, right); + } else if (left) { + /* Combine with the left range */ + range_it_remove(left, rt); + left->rn_last = last; + range_it_insert(left, rt); + } else if (right) { + /* Combine with the right range */ + range_it_remove(right, rt); + right->rn_start = start; + range_it_insert(right, rt); + } else { + left = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node)); + if (!left) + return -ENOMEM; + left->rn_start = start; + left->rn_last = last; + range_it_insert(left, rt); + } + return 0; +} + +void range_tree_destroy(struct range_tree *rt) +{ + struct range_node *rn; + + while ((rn = range_it_iter_first(rt, 0, -1U))) { + range_it_remove(rn, rt); + bpf_mem_free(&bpf_global_ma, rn); + } +} + +void range_tree_init(struct range_tree *rt) +{ + rt->it_root = RB_ROOT_CACHED; + rt->range_size_root = RB_ROOT_CACHED; +} diff --git a/kernel/bpf/range_tree.h b/kernel/bpf/range_tree.h new file mode 100644 index 000000000000..ff0b9110eb71 --- /dev/null +++ b/kernel/bpf/range_tree.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#ifndef _RANGE_TREE_H +#define _RANGE_TREE_H 1 + +struct range_tree { + /* root of interval tree */ + struct rb_root_cached it_root; + /* root of rbtree of interval sizes */ + struct rb_root_cached range_size_root; +}; + +void range_tree_init(struct range_tree *rt); +void range_tree_destroy(struct range_tree *rt); + +int range_tree_clear(struct range_tree *rt, u32 start, u32 len); +int range_tree_set(struct range_tree *rt, u32 start, u32 len); +int is_range_tree_set(struct range_tree *rt, u32 start, u32 len); +s64 range_tree_find(struct range_tree *rt, u32 len); + +#endif -- cgit From ab4dc30c5322fc46d0db938d1c0bdd56d7adcea1 Mon Sep 17 00:00:00 2001 From: Viktor Malik Date: Fri, 15 Nov 2024 09:25:48 +0100 Subject: bpf: Do not alloc arena on unsupported arches Do not allocate BPF arena on arches that do not support it, instead return EOPNOTSUPP. This is useful to prevent bugs such as soft lockups while trying to free the arena which we have witnessed on ppc64le [1]. 
[1] https://lore.kernel.org/bpf/4afdcb50-13f2-4772-8db1-3fd02bd985b3@redhat.com/ Signed-off-by: Viktor Malik Link: https://lore.kernel.org/r/20241115082548.74972-1-vmalik@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/arena.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 3e1dfe349ced..945a5680f6a5 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -3,6 +3,7 @@ #include #include #include +#include "linux/filter.h" #include #include #include @@ -99,6 +100,9 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) u64 vm_range; int err = -ENOMEM; + if (!bpf_jit_supports_arena()) + return ERR_PTR(-EOPNOTSUPP); + if (attr->key_size || attr->value_size || attr->max_entries == 0 || /* BPF_F_MMAPABLE must be set */ !(attr->map_flags & BPF_F_MMAPABLE) || -- cgit From 4ff04abf9d5bc33d33c7a799887517619188b068 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 14 Nov 2024 22:03:54 -0800 Subject: bpf: Add necessary migrate_disable to range_tree. When running bpf selftest (./test_progs -j), the following warnings showed up: $ ./test_progs -t arena_atomics ... BUG: using smp_processor_id() in preemptible [00000000] code: kworker/u19:0/12501 caller is bpf_mem_free+0x128/0x330 ... Call Trace: dump_stack_lvl check_preemption_disabled bpf_mem_free range_tree_destroy arena_map_free bpf_map_free_deferred process_scheduled_works ... For selftests arena_htab and arena_list, similar smp_process_id() BUGs are dumped, and the following are two stack trace: dump_stack_lvl check_preemption_disabled bpf_mem_alloc range_tree_set arena_map_alloc map_create ... dump_stack_lvl check_preemption_disabled bpf_mem_alloc range_tree_clear arena_vm_fault do_pte_missing handle_mm_fault do_user_addr_fault ... Add migrate_{disable,enable}() around related bpf_mem_{alloc,free}() calls to fix the issue. 
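The shape of the fix, as a pair of hypothetical helpers that would sit in kernel/bpf/range_tree.c next to the existing code (so that file's includes and the struct range_node definition apply); the patch applies the same pattern inline at each call site because bpf_mem_alloc()/bpf_mem_free() operate on per-cpu state.

/* Hypothetical wrappers, shown only to illustrate the pattern. */
static struct range_node *range_node_alloc(void)
{
	struct range_node *rn;

	migrate_disable();	/* pin to one cpu while the per-cpu allocator runs */
	rn = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node));
	migrate_enable();
	return rn;
}

static void range_node_free(struct range_node *rn)
{
	migrate_disable();
	bpf_mem_free(&bpf_global_ma, rn);
	migrate_enable();
}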
Fixes: b795379757eb ("bpf: Introduce range_tree data structure and use it in bpf arena") Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20241115060354.2832495-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/range_tree.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/range_tree.c b/kernel/bpf/range_tree.c index f7915ab0a6d3..5bdf9aadca3a 100644 --- a/kernel/bpf/range_tree.c +++ b/kernel/bpf/range_tree.c @@ -150,7 +150,9 @@ int range_tree_clear(struct range_tree *rt, u32 start, u32 len) range_it_insert(rn, rt); /* Add a range */ + migrate_disable(); new_rn = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node)); + migrate_enable(); if (!new_rn) return -ENOMEM; new_rn->rn_start = last + 1; @@ -170,7 +172,9 @@ int range_tree_clear(struct range_tree *rt, u32 start, u32 len) } else { /* in the middle of the clearing range */ range_it_remove(rn, rt); + migrate_disable(); bpf_mem_free(&bpf_global_ma, rn); + migrate_enable(); } } return 0; @@ -223,7 +227,9 @@ int range_tree_set(struct range_tree *rt, u32 start, u32 len) range_it_remove(right, rt); left->rn_last = right->rn_last; range_it_insert(left, rt); + migrate_disable(); bpf_mem_free(&bpf_global_ma, right); + migrate_enable(); } else if (left) { /* Combine with the left range */ range_it_remove(left, rt); @@ -235,7 +241,9 @@ int range_tree_set(struct range_tree *rt, u32 start, u32 len) right->rn_start = start; range_it_insert(right, rt); } else { + migrate_disable(); left = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node)); + migrate_enable(); if (!left) return -ENOMEM; left->rn_start = start; @@ -251,7 +259,9 @@ void range_tree_destroy(struct range_tree *rt) while ((rn = range_it_iter_first(rt, 0, -1U))) { range_it_remove(rn, rt); + migrate_disable(); bpf_mem_free(&bpf_global_ma, rn); + migrate_enable(); } } -- cgit From 96a30e469ca1d2b8cc7811b40911f8614b558241 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Nov 2024 16:13:03 -0800 Subject: bpf: use common instruction history across all states Instead of allocating and copying instruction history each time we enqueue child verifier state, switch to a model where we use one common dynamically sized array of instruction history entries across all states. The key observation for proving this is correct is that instruction history is only relevant while state is active, which means it either is a current state (and thus we are actively modifying instruction history and no other state can interfere with us) or we are checkpointed state with some children still active (either enqueued or being current). In the latter case our portion of instruction history is finalized and won't change or grow, so as long as we keep it immutable until the state is finalized, we are good. Now, when state is finalized and is put into state hash for potentially future pruning lookups, instruction history is not used anymore. This is because instruction history is only used by precision marking logic, and we never modify precision markings for finalized states. So, instead of each state having its own small instruction history, we keep a global dynamically-sized instruction history, where each state in current DFS path from root to active state remembers its portion of instruction history. Current state can append to this history, but cannot modify any of its parent histories. 
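For illustration, the indexing scheme just described can be modeled in a few lines of user-space C; all names below are illustrative and only mirror the idea, not the actual verifier structures.

#include <stdlib.h>
#include <stdio.h>

struct hist_entry { int insn_idx; int prev_idx; };

struct env_model {			/* one shared, growable history array */
	struct hist_entry *hist;
	size_t cap;
};

struct state_model {
	size_t hist_start;		/* first entry owned by this state (inclusive) */
	size_t hist_end;		/* one past the last entry (exclusive) */
};

/* Child states start with an empty slice that begins where the parent ended. */
static struct state_model fork_state(const struct state_model *parent)
{
	struct state_model child = { parent->hist_end, parent->hist_end };
	return child;
}

static int push_hist(struct env_model *env, struct state_model *cur, int insn_idx, int prev_idx)
{
	if (cur->hist_end + 1 > env->cap) {
		size_t new_cap = env->cap ? env->cap * 2 : 16;
		struct hist_entry *p = realloc(env->hist, new_cap * sizeof(*p));

		if (!p)
			return -1;
		env->hist = p;
		env->cap = new_cap;
	}
	env->hist[cur->hist_end].insn_idx = insn_idx;
	env->hist[cur->hist_end].prev_idx = prev_idx;
	cur->hist_end++;		/* only the active state ever appends */
	return 0;
}

int main(void)
{
	struct env_model env = { 0 };
	struct state_model parent = { 0, 0 };

	push_hist(&env, &parent, 5, 3);
	struct state_model child = fork_state(&parent);	/* child slice starts at parent's end */

	push_hist(&env, &child, 8, 5);
	printf("parent [%zu,%zu) child [%zu,%zu)\n",
	       parent.hist_start, parent.hist_end, child.hist_start, child.hist_end);
	free(env.hist);
	return 0;
}

A checkpointed parent keeps its [start, end) slice immutable, while the active child appends past it into the shared array.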
Async callback state enqueueing, while logically detached from the parent state, is still part of the verification backtracking tree, so it has to follow the same scheme as normal state checkpoints. Because the insn_hist array can be grown through realloc, states don't keep pointers into it; instead, each state maintains two indices, [start, end), into the global instruction history array. The end index is exclusive, so `start == end` means there is no relevant instruction history. This eliminates a lot of allocations and minimizes overall memory usage. For instance, when running a worst-case test from [0] (but without the heuristics-based fix [1]), it took 12.5 minutes until we got -ENOMEM. With the changes in this patch the whole test succeeds in 10 minutes (still very slow, so the heuristics from [1] remain important, of course). To further validate correctness, a veristat-based comparison was performed for Meta production BPF objects and BPF selftest objects. In both cases there were no differences *at all* in terms of verdict or instruction and state counts, providing good confidence in the change. Having this low-memory-overhead way of keeping dynamic per-instruction history cheaply opens up some new possibilities, like keeping extra information for literally every single validated instruction. This will be used for simplifying precision backpropagation logic in follow-up patches. [0] https://lore.kernel.org/bpf/20241029172641.1042523-2-eddyz87@gmail.com/ [1] https://lore.kernel.org/bpf/20241029172641.1042523-1-eddyz87@gmail.com/ Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20241115001303.277272-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 107 ++++++++++++++++++++++++-------------------------- 1 file changed, 52 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 09f7fa635f67..1c4ebb326785 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1452,13 +1452,6 @@ static void free_func_state(struct bpf_func_state *state) kfree(state); } -static void clear_jmp_history(struct bpf_verifier_state *state) -{ - kfree(state->jmp_history); - state->jmp_history = NULL; - state->jmp_history_cnt = 0; -} - static void free_verifier_state(struct bpf_verifier_state *state, bool free_self) { @@ -1468,7 +1461,6 @@ static void free_verifier_state(struct bpf_verifier_state *state, free_func_state(state->frame[i]); state->frame[i] = NULL; } - clear_jmp_history(state); if (free_self) kfree(state); } @@ -1494,13 +1486,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, struct bpf_func_state *dst; int i, err; - dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history, - src->jmp_history_cnt, sizeof(*dst_state->jmp_history), - GFP_USER); - if (!dst_state->jmp_history) - return -ENOMEM; - dst_state->jmp_history_cnt = src->jmp_history_cnt; - /* if dst has more stack frames then src frame, free them, this is also * necessary in case of exceptional exits using bpf_throw.
*/ @@ -1517,6 +1502,8 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->parent = src->parent; dst_state->first_insn_idx = src->first_insn_idx; dst_state->last_insn_idx = src->last_insn_idx; + dst_state->insn_hist_start = src->insn_hist_start; + dst_state->insn_hist_end = src->insn_hist_end; dst_state->dfs_depth = src->dfs_depth; dst_state->callback_unroll_depth = src->callback_unroll_depth; dst_state->used_as_loop_entry = src->used_as_loop_entry; @@ -2569,9 +2556,14 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, * The caller state doesn't matter. * This is async callback. It starts in a fresh stack. * Initialize it similar to do_check_common(). + * But we do need to make sure to not clobber insn_hist, so we keep + * chaining insn_hist_start/insn_hist_end indices as for a normal + * child state. */ elem->st.branches = 1; elem->st.in_sleepable = is_sleepable; + elem->st.insn_hist_start = env->cur_state->insn_hist_end; + elem->st.insn_hist_end = elem->st.insn_hist_start; frame = kzalloc(sizeof(*frame), GFP_KERNEL); if (!frame) goto err; @@ -3551,11 +3543,10 @@ static void linked_regs_unpack(u64 val, struct linked_regs *s) } /* for any branch, call, exit record the history of jmps in the given state */ -static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, - int insn_flags, u64 linked_regs) +static int push_insn_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, + int insn_flags, u64 linked_regs) { - u32 cnt = cur->jmp_history_cnt; - struct bpf_jmp_history_entry *p; + struct bpf_insn_hist_entry *p; size_t alloc_size; /* combine instruction flags if we already recorded this instruction */ @@ -3575,29 +3566,32 @@ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_st return 0; } - cnt++; - alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p))); - p = krealloc(cur->jmp_history, alloc_size, GFP_USER); - if (!p) - return -ENOMEM; - cur->jmp_history = p; + if (cur->insn_hist_end + 1 > env->insn_hist_cap) { + alloc_size = size_mul(cur->insn_hist_end + 1, sizeof(*p)); + p = kvrealloc(env->insn_hist, alloc_size, GFP_USER); + if (!p) + return -ENOMEM; + env->insn_hist = p; + env->insn_hist_cap = alloc_size / sizeof(*p); + } - p = &cur->jmp_history[cnt - 1]; + p = &env->insn_hist[cur->insn_hist_end]; p->idx = env->insn_idx; p->prev_idx = env->prev_insn_idx; p->flags = insn_flags; p->linked_regs = linked_regs; - cur->jmp_history_cnt = cnt; + + cur->insn_hist_end++; env->cur_hist_ent = p; return 0; } -static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st, - u32 hist_end, int insn_idx) +static struct bpf_insn_hist_entry *get_insn_hist_entry(struct bpf_verifier_env *env, + u32 hist_start, u32 hist_end, int insn_idx) { - if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx) - return &st->jmp_history[hist_end - 1]; + if (hist_end > hist_start && env->insn_hist[hist_end - 1].idx == insn_idx) + return &env->insn_hist[hist_end - 1]; return NULL; } @@ -3614,25 +3608,26 @@ static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_stat * history entry recording a jump from last instruction of parent state and * first instruction of given state. 
*/ -static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, - u32 *history) +static int get_prev_insn_idx(const struct bpf_verifier_env *env, + struct bpf_verifier_state *st, + int insn_idx, u32 hist_start, u32 *hist_endp) { - u32 cnt = *history; + u32 hist_end = *hist_endp; + u32 cnt = hist_end - hist_start; - if (i == st->first_insn_idx) { + if (insn_idx == st->first_insn_idx) { if (cnt == 0) return -ENOENT; - if (cnt == 1 && st->jmp_history[0].idx == i) + if (cnt == 1 && env->insn_hist[hist_start].idx == insn_idx) return -ENOENT; } - if (cnt && st->jmp_history[cnt - 1].idx == i) { - i = st->jmp_history[cnt - 1].prev_idx; - (*history)--; + if (cnt && env->insn_hist[hist_end - 1].idx == insn_idx) { + (*hist_endp)--; + return env->insn_hist[hist_end - 1].prev_idx; } else { - i--; + return insn_idx - 1; } - return i; } static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) @@ -3804,7 +3799,7 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) /* If any register R in hist->linked_regs is marked as precise in bt, * do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs. */ -static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist) +static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_insn_hist_entry *hist) { struct linked_regs linked_regs; bool some_precise = false; @@ -3849,7 +3844,7 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); * - *was* processed previously during backtracking. */ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, - struct bpf_jmp_history_entry *hist, struct backtrack_state *bt) + struct bpf_insn_hist_entry *hist, struct backtrack_state *bt) { const struct bpf_insn_cbs cbs = { .cb_call = disasm_kfunc_name, @@ -4268,7 +4263,7 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_ * SCALARS, as well as any other registers and slots that contribute to * a tracked state of given registers/stack slots, depending on specific BPF * assembly instructions (see backtrack_insns() for exact instruction handling - * logic). This backtracking relies on recorded jmp_history and is able to + * logic). This backtracking relies on recorded insn_hist and is able to * traverse entire chain of parent states. This process ends only when all the * necessary registers/slots and their transitive dependencies are marked as * precise. 
@@ -4385,8 +4380,9 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) for (;;) { DECLARE_BITMAP(mask, 64); - u32 history = st->jmp_history_cnt; - struct bpf_jmp_history_entry *hist; + u32 hist_start = st->insn_hist_start; + u32 hist_end = st->insn_hist_end; + struct bpf_insn_hist_entry *hist; if (env->log.level & BPF_LOG_LEVEL2) { verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n", @@ -4425,7 +4421,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) err = 0; skip_first = false; } else { - hist = get_jmp_hist_entry(st, history, i); + hist = get_insn_hist_entry(env, hist_start, hist_end, i); err = backtrack_insn(env, i, subseq_idx, hist, bt); } if (err == -ENOTSUPP) { @@ -4442,7 +4438,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) */ return 0; subseq_idx = i; - i = get_prev_insn_idx(st, i, &history); + i = get_prev_insn_idx(env, st, i, hist_start, &hist_end); if (i == -ENOENT) break; if (i >= env->prog->len) { @@ -4808,7 +4804,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, } if (insn_flags) - return push_jmp_history(env, env->cur_state, insn_flags, 0); + return push_insn_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -5115,7 +5111,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, insn_flags = 0; /* we are not restoring spilled register */ } if (insn_flags) - return push_jmp_history(env, env->cur_state, insn_flags, 0); + return push_insn_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -15740,7 +15736,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (dst_reg->type == SCALAR_VALUE && dst_reg->id) collect_linked_regs(this_branch, dst_reg->id, &linked_regs); if (linked_regs.cnt > 1) { - err = push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); + err = push_insn_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); if (err) return err; } @@ -18129,7 +18125,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) || /* Avoid accumulating infinitely long jmp history */ - cur->jmp_history_cnt > 40; + cur->insn_hist_end - cur->insn_hist_start > 40; /* bpf progs typically have pruning point every 4 instructions * http://vger.kernel.org/bpfconf2019.html#session-1 @@ -18327,7 +18323,7 @@ hit: * the current state. */ if (is_jmp_point(env, env->insn_idx)) - err = err ? : push_jmp_history(env, cur, 0, 0); + err = err ? : push_insn_history(env, cur, 0, 0); err = err ? : propagate_precision(env, &sl->state); if (err) return err; @@ -18426,8 +18422,8 @@ next: cur->parent = new; cur->first_insn_idx = insn_idx; + cur->insn_hist_start = cur->insn_hist_end; cur->dfs_depth = new->dfs_depth + 1; - clear_jmp_history(cur); new_sl->next = *explored_state(env, insn_idx); *explored_state(env, insn_idx) = new_sl; /* connect new state to parentage chain. Current frame needs all @@ -18595,7 +18591,7 @@ static int do_check(struct bpf_verifier_env *env) } if (is_jmp_point(env, env->insn_idx)) { - err = push_jmp_history(env, state, 0, 0); + err = push_insn_history(env, state, 0, 0); if (err) return err; } @@ -22789,6 +22785,7 @@ err_unlock: if (!is_priv) mutex_unlock(&bpf_verifier_lock); vfree(env->insn_aux_data); + kvfree(env->insn_hist); err_free_env: kvfree(env); return ret; -- cgit
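To round out the patch above, the sketch below re-renders the backwards-walk logic of get_prev_insn_idx() as a standalone program; the types and the -1 sentinel (instead of -ENOENT) are simplifications made for this illustration, not the verifier's own definitions.

/* Standalone sketch of walking a state's history slice backwards, in the
 * spirit of get_prev_insn_idx() from the diff above. */
#include <stdio.h>

struct hist_entry { int idx; int prev_idx; };

static int prev_insn_idx(const struct hist_entry *hist, int first_insn_idx,
			 int insn_idx, unsigned int hist_start,
			 unsigned int *hist_endp)
{
	unsigned int hist_end = *hist_endp;
	unsigned int cnt = hist_end - hist_start;

	if (insn_idx == first_insn_idx) {
		if (cnt == 0)
			return -1;	/* reached the start of this state's slice */
		if (cnt == 1 && hist[hist_start].idx == insn_idx)
			return -1;
	}

	if (cnt && hist[hist_end - 1].idx == insn_idx) {
		(*hist_endp)--;				/* consume the entry */
		return hist[hist_end - 1].prev_idx;	/* follow the recorded jump */
	}
	return insn_idx - 1;				/* no jump recorded: linear step back */
}

int main(void)
{
	/* history slice for a state covering insns 10..14, with one jump 12 -> 14 */
	struct hist_entry hist[] = { { 14, 12 } };
	unsigned int start = 0, end = 1;
	int i = 14;

	while (i >= 10) {
		printf("visit insn %d\n", i);
		i = prev_insn_idx(hist, 10, i, start, &end);
		if (i < 0)
			break;
	}
	return 0;
}

Walking back from insn 14 with a single recorded entry {idx = 14, prev_idx = 12} visits 14, 12, 11, 10: the jump entry is consumed once, after which the walk falls back to linear idx - 1 steps until the state's first instruction is reached.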