From 65334e64a493c6a0976de7ad56bf8b7a9ff04b4a Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 25 Feb 2023 16:40:08 +0100 Subject: bpf: Support kptrs in percpu hashmap and percpu LRU hashmap Enable support for kptrs in percpu BPF hashmap and percpu BPF LRU hashmap by wiring up the freeing of these kptrs from percpu map elements. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230225154010.391965-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/bpf/syscall.c') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e3fcdc9836a6..da117a2a83b2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1059,7 +1059,9 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, case BPF_KPTR_UNREF: case BPF_KPTR_REF: if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_PERCPU_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) { ret = -EOPNOTSUPP; -- cgit From 9db44fdd8105da00669d425acab887c668df75f6 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 25 Feb 2023 16:40:09 +0100 Subject: bpf: Support kptrs in local storage maps Enable support for kptrs in local storage maps by wiring up the freeing of these kptrs from map value. Freeing of bpf_local_storage_map is only delayed in case there are special fields, therefore bpf_selem_free_* path can also only dereference smap safely in that case. This is recorded using a bool utilizing a hole in bpF_local_storage_elem. It could have been tagged in the pointer value smap using the lowest bit (since alignment > 1), but since there was already a hole I went with the simpler option. Only the map structure freeing is delayed using RCU barriers, as the buckets aren't used when selem is being freed, so they can be freed once all readers of the bucket lists can no longer access it. Cc: Martin KaFai Lau Cc: KP Singh Cc: Paul E. McKenney Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230225154010.391965-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel/bpf/syscall.c') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index da117a2a83b2..eb50025b03c1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1063,7 +1063,11 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && - map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) { + map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY && + map->map_type != BPF_MAP_TYPE_SK_STORAGE && + map->map_type != BPF_MAP_TYPE_INODE_STORAGE && + map->map_type != BPF_MAP_TYPE_TASK_STORAGE && + map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) { ret = -EOPNOTSUPP; goto free_map_tab; } -- cgit From 90a5527d7686d3ebe0dd2a831356a6c7d7dc31bc Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Mar 2023 12:45:58 +0000 Subject: bpf: add new map ops ->map_mem_usage Add a new map ops ->map_mem_usage to print the memory usage of a bpf map. This is a preparation for the followup change. Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20230305124615.12358-2-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel/bpf/syscall.c') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ede5f987484f..7c96e68859ec 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -771,16 +771,15 @@ static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) } #ifdef CONFIG_PROC_FS -/* Provides an approximation of the map's memory footprint. - * Used only to provide a backward compatibility and display - * a reasonable "memlock" info. - */ -static unsigned long bpf_map_memory_footprint(const struct bpf_map *map) +/* Show the memory usage of a bpf map */ +static u64 bpf_map_memory_usage(const struct bpf_map *map) { unsigned long size; - size = round_up(map->key_size + bpf_map_value_size(map), 8); + if (map->ops->map_mem_usage) + return map->ops->map_mem_usage(map); + size = round_up(map->key_size + bpf_map_value_size(map), 8); return round_up(map->max_entries * size, PAGE_SIZE); } @@ -803,7 +802,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) "max_entries:\t%u\n" "map_flags:\t%#x\n" "map_extra:\t%#llx\n" - "memlock:\t%lu\n" + "memlock:\t%llu\n" "map_id:\t%u\n" "frozen:\t%u\n", map->map_type, @@ -812,7 +811,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) map->max_entries, map->map_flags, (unsigned long long)map->map_extra, - bpf_map_memory_footprint(map), + bpf_map_memory_usage(map), map->id, READ_ONCE(map->frozen)); if (type) { -- cgit From 9629363cd05642fe43aded44938adec067ad1da3 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Mar 2023 12:46:14 +0000 Subject: bpf: offload map memory usage A new helper is introduced to calculate offload map memory usage. But currently the memory dynamically allocated in netdev dev_ops, like nsim_map_update_elem, is not counted. Let's just put it aside now. Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20230305124615.12358-18-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/bpf/syscall.c') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7c96e68859ec..053409d951d2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -105,6 +105,7 @@ const struct bpf_map_ops bpf_map_offload_ops = { .map_alloc = bpf_map_offload_map_alloc, .map_free = bpf_map_offload_map_free, .map_check_btf = map_check_no_btf, + .map_mem_usage = bpf_map_offload_map_mem_usage, }; static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) -- cgit From 6b4a6ea2c62d34272d64161d43a19c02355576e2 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Mar 2023 12:46:15 +0000 Subject: bpf: enforce all maps having memory usage callback We have implemented memory usage callback for all maps, and we enforce any newly added map having a callback as well. We check this callback at map creation time. If it doesn't have the callback, we will return EINVAL. Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20230305124615.12358-19-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel/bpf/syscall.c') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 053409d951d2..f406dfa13792 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -129,6 +129,8 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) } if (attr->map_ifindex) ops = &bpf_map_offload_ops; + if (!ops->map_mem_usage) + return ERR_PTR(-EINVAL); map = ops->map_alloc(attr); if (IS_ERR(map)) return map; @@ -775,13 +777,7 @@ static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) /* Show the memory usage of a bpf map */ static u64 bpf_map_memory_usage(const struct bpf_map *map) { - unsigned long size; - - if (map->ops->map_mem_usage) - return map->ops->map_mem_usage(map); - - size = round_up(map->key_size + bpf_map_value_size(map), 8); - return round_up(map->max_entries * size, PAGE_SIZE); + return map->ops->map_mem_usage(map); } static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) -- cgit From 74843b57ec70af7b67b7e6153374834ee18d139f Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Thu, 9 Mar 2023 10:01:08 -0800 Subject: bpf: Change btf_record_find enum parameter to field_mask btf_record_find's 3rd parameter can be multiple enum btf_field_type's masked together. The function is called with BPF_KPTR in two places in verifier.c, so it works with masked values already. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230309180111.1618459-4-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/bpf/syscall.c') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f406dfa13792..cc4b7684910c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -520,14 +520,14 @@ static int btf_field_cmp(const void *a, const void *b) } struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset, - enum btf_field_type type) + u32 field_mask) { struct btf_field *field; - if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & type)) + if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & field_mask)) return NULL; field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp); - if (!field || !(field->type & type)) + if (!field || !(field->type & field_mask)) return NULL; return field; } -- cgit From c8e18754091479fac3f5b6c053c6bc4be0b7fb11 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Fri, 10 Mar 2023 15:07:41 -0800 Subject: bpf: Support __kptr to local kptrs If a PTR_TO_BTF_ID type comes from program BTF - not vmlinux or module BTF - it must have been allocated by bpf_obj_new and therefore must be free'd with bpf_obj_drop. Such a PTR_TO_BTF_ID is considered a "local kptr" and is tagged with MEM_ALLOC type tag by bpf_obj_new. This patch adds support for treating __kptr-tagged pointers to "local kptrs" as having an implicit bpf_obj_drop destructor for referenced kptr acquire / release semantics. Consider the following example: struct node_data { long key; long data; struct bpf_rb_node node; }; struct map_value { struct node_data __kptr *node; }; struct { __uint(type, BPF_MAP_TYPE_ARRAY); __type(key, int); __type(value, struct map_value); __uint(max_entries, 1); } some_nodes SEC(".maps"); If struct node_data had a matching definition in kernel BTF, the verifier would expect a destructor for the type to be registered. Since struct node_data does not match any type in kernel BTF, the verifier knows that there is no kfunc that provides a PTR_TO_BTF_ID to this type, and that such a PTR_TO_BTF_ID can only come from bpf_obj_new. So instead of searching for a registered dtor, a bpf_obj_drop dtor can be assumed. This allows the runtime to properly destruct such kptrs in bpf_obj_free_fields, which enables maps to clean up map_vals w/ such kptrs when going away. Implementation notes: * "kernel_btf" variable is renamed to "kptr_btf" in btf_parse_kptr. Before this patch, the variable would only ever point to vmlinux or module BTFs, but now it can point to some program BTF for local kptr type. It's later used to populate the (btf, btf_id) pair in kptr btf field. * It's necessary to btf_get the program BTF when populating btf_field for local kptr. btf_record_free later does a btf_put. * Behavior for non-local referenced kptrs is not modified, as bpf_find_btf_id helper only searches vmlinux and module BTFs for matching BTF type. If such a type is found, btf_field_kptr's btf will pass btf_is_kernel check, and the associated release function is some one-argument dtor. If btf_is_kernel check fails, associated release function is two-arg bpf_obj_drop_impl. Before this patch only btf_field_kptr's w/ kernel or module BTFs were created. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230310230743.2320707-2-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel/bpf/syscall.c') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cc4b7684910c..0684febc447a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -659,8 +659,10 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) return; fields = rec->fields; for (i = 0; i < rec->cnt; i++) { + struct btf_struct_meta *pointee_struct_meta; const struct btf_field *field = &fields[i]; void *field_ptr = obj + field->offset; + void *xchgd_field; switch (fields[i].type) { case BPF_SPIN_LOCK: @@ -672,7 +674,17 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) WRITE_ONCE(*(u64 *)field_ptr, 0); break; case BPF_KPTR_REF: - field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0)); + xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0); + if (!btf_is_kernel(field->kptr.btf)) { + pointee_struct_meta = btf_find_struct_meta(field->kptr.btf, + field->kptr.btf_id); + WARN_ON_ONCE(!pointee_struct_meta); + field->kptr.obj_drop(xchgd_field, pointee_struct_meta ? + pointee_struct_meta->record : + NULL); + } else { + field->kptr.dtor(xchgd_field); + } break; case BPF_LIST_HEAD: if (WARN_ON_ONCE(rec->spin_lock_off < 0)) -- cgit From 9e36a204bd43553a9cd4bd574612cd9a5df791ea Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Mon, 13 Mar 2023 14:46:41 -0700 Subject: bpf: Disable migration when freeing stashed local kptr using obj drop When a local kptr is stashed in a map and freed when the map goes away, currently an error like the below appears: [ 39.195695] BUG: using smp_processor_id() in preemptible [00000000] code: kworker/u32:15/2875 [ 39.196549] caller is bpf_mem_free+0x56/0xc0 [ 39.196958] CPU: 15 PID: 2875 Comm: kworker/u32:15 Tainted: G O 6.2.0-13016-g22df776a9a86 #4477 [ 39.197897] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 [ 39.198949] Workqueue: events_unbound bpf_map_free_deferred [ 39.199470] Call Trace: [ 39.199703] [ 39.199911] dump_stack_lvl+0x60/0x70 [ 39.200267] check_preemption_disabled+0xbf/0xe0 [ 39.200704] bpf_mem_free+0x56/0xc0 [ 39.201032] ? bpf_obj_new_impl+0xa0/0xa0 [ 39.201430] bpf_obj_free_fields+0x1cd/0x200 [ 39.201838] array_map_free+0xad/0x220 [ 39.202193] ? finish_task_switch+0xe5/0x3c0 [ 39.202614] bpf_map_free_deferred+0xea/0x210 [ 39.203006] ? lockdep_hardirqs_on_prepare+0xe/0x220 [ 39.203460] process_one_work+0x64f/0xbe0 [ 39.203822] ? pwq_dec_nr_in_flight+0x110/0x110 [ 39.204264] ? do_raw_spin_lock+0x107/0x1c0 [ 39.204662] ? lockdep_hardirqs_on_prepare+0xe/0x220 [ 39.205107] worker_thread+0x74/0x7a0 [ 39.205451] ? process_one_work+0xbe0/0xbe0 [ 39.205818] kthread+0x171/0x1a0 [ 39.206111] ? kthread_complete_and_exit+0x20/0x20 [ 39.206552] ret_from_fork+0x1f/0x30 [ 39.206886] This happens because the call to __bpf_obj_drop_impl I added in the patch adding support for stashing local kptrs doesn't disable migration. Prior to that patch, __bpf_obj_drop_impl logic only ran when called by a BPF progarm, whereas now it can be called from map free path, so it's necessary to explicitly disable migration. Also, refactor a bit to just call __bpf_obj_drop_impl directly instead of bothering w/ dtor union and setting pointer-to-obj_drop. Fixes: c8e187540914 ("bpf: Support __kptr to local kptrs") Reported-by: Alexei Starovoitov Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230313214641.3731908-1-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel/bpf/syscall.c') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0684febc447a..5b88301a2ae0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -650,6 +650,8 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj) bpf_timer_cancel_and_free(obj + rec->timer_off); } +extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec); + void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; @@ -679,9 +681,11 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) pointee_struct_meta = btf_find_struct_meta(field->kptr.btf, field->kptr.btf_id); WARN_ON_ONCE(!pointee_struct_meta); - field->kptr.obj_drop(xchgd_field, pointee_struct_meta ? - pointee_struct_meta->record : - NULL); + migrate_disable(); + __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ? + pointee_struct_meta->record : + NULL); + migrate_enable(); } else { field->kptr.dtor(xchgd_field); } -- cgit From 31bf1dbccfb0a9861d4846755096b3fff5687f8a Mon Sep 17 00:00:00 2001 From: Viktor Malik Date: Fri, 10 Mar 2023 08:40:59 +0100 Subject: bpf: Fix attaching fentry/fexit/fmod_ret/lsm to modules This resolves two problems with attachment of fentry/fexit/fmod_ret/lsm to functions located in modules: 1. The verifier tries to find the address to attach to in kallsyms. This is always done by searching the entire kallsyms, not respecting the module in which the function is located. Such approach causes an incorrect attachment address to be computed if the function to attach to is shadowed by a function of the same name located earlier in kallsyms. 2. If the address to attach to is located in a module, the module reference is only acquired in register_fentry. If the module is unloaded between the place where the address is found (bpf_check_attach_target in the verifier) and register_fentry, it is possible that another module is loaded to the same address which may lead to potential errors. Since the attachment must contain the BTF of the program to attach to, we extract the module from it and search for the function address in the correct module (resolving problem no. 1). Then, the module reference is taken directly in bpf_check_attach_target and stored in the bpf program (in bpf_prog_aux). The reference is only released when the program is unloaded (resolving problem no. 2). Signed-off-by: Viktor Malik Acked-by: Jiri Olsa Reviewed-by: Luis Chamberlain Link: https://lore.kernel.org/r/3f6a9d8ae850532b5ef864ef16327b0f7a669063.1678432753.git.vmalik@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel/bpf/syscall.c') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5b88301a2ae0..099e9068bcdd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2067,6 +2067,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) { bpf_prog_kallsyms_del_all(prog); btf_put(prog->aux->btf); + module_put(prog->aux->mod); kvfree(prog->aux->jited_linfo); kvfree(prog->aux->linfo); kfree(prog->aux->kfunc_tab); @@ -3113,6 +3114,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, if (err) goto out_unlock; + if (tgt_info.tgt_mod) { + module_put(prog->aux->mod); + prog->aux->mod = tgt_info.tgt_mod; + } + tr = bpf_trampoline_get(key, &tgt_info); if (!tr) { err = -ENOMEM; -- cgit From b671c2067a04c0668df174ff5dfdb573d1f9b074 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 22 Mar 2023 20:23:58 -0700 Subject: bpf: Retire the struct_ops map kvalue->refcnt. We have replaced kvalue-refcnt with synchronize_rcu() to wait for an RCU grace period. Maintenance of kvalue->refcnt was a complicated task, as we had to simultaneously keep track of two reference counts: one for the reference count of bpf_map. When the kvalue->refcnt reaches zero, we also have to reduce the reference count on bpf_map - yet these steps are not performed in an atomic manner and require us to be vigilant when managing them. By eliminating kvalue->refcnt, we can make our maintenance more straightforward as the refcount of bpf_map is now solely managed! To prevent the trampoline image of a struct_ops from being released while it is still in use, we wait for an RCU grace period. The setsockopt(TCP_CONGESTION, "...") command allows you to change your socket's congestion control algorithm and can result in releasing the old struct_ops implementation. It is fine. However, this function is exposed through bpf_setsockopt(), it may be accessed by BPF programs as well. To ensure that the trampoline image belonging to struct_op can be safely called while its method is in use, the trampoline safeguarde the BPF program with rcu_read_lock(). Doing so prevents any destruction of the associated images before returning from a trampoline and requires us to wait for an RCU grace period. Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20230323032405.3735486-2-kuifeng@meta.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/syscall.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/bpf/syscall.c') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 099e9068bcdd..cff0348a2871 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1303,8 +1303,10 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) return map; } -/* map_idr_lock should have been held */ -static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) +/* map_idr_lock should have been held or the map should have been + * protected by rcu read lock. + */ +struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) { int refold; -- cgit From 68b04864ca425d1894c96b8141d4fba1181f11cb Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 22 Mar 2023 20:24:00 -0700 Subject: bpf: Create links for BPF struct_ops maps. Make bpf_link support struct_ops. Previously, struct_ops were always used alone without any associated links. Upon updating its value, a struct_ops would be activated automatically. Yet other BPF program types required to make a bpf_link with their instances before they could become active. Now, however, you can create an inactive struct_ops, and create a link to activate it later. With bpf_links, struct_ops has a behavior similar to other BPF program types. You can pin/unpin them from their links and the struct_ops will be deactivated when its link is removed while previously need someone to delete the value for it to be deactivated. bpf_links are responsible for registering their associated struct_ops. You can only use a struct_ops that has the BPF_F_LINK flag set to create a bpf_link, while a structs without this flag behaves in the same manner as before and is registered upon updating its value. The BPF_LINK_TYPE_STRUCT_OPS serves a dual purpose. Not only is it used to craft the links for BPF struct_ops programs, but also to create links for BPF struct_ops them-self. Since the links of BPF struct_ops programs are only used to create trampolines internally, they are never seen in other contexts. Thus, they can be reused for struct_ops themself. To maintain a reference to the map supporting this link, we add bpf_struct_ops_link as an additional type. The pointer of the map is RCU and won't be necessary until later in the patchset. Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20230323032405.3735486-4-kuifeng@meta.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/syscall.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel/bpf/syscall.c') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cff0348a2871..21f76698875c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2825,16 +2825,19 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) const struct bpf_prog *prog = link->prog; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; - bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, "link_type:\t%s\n" - "link_id:\t%u\n" - "prog_tag:\t%s\n" - "prog_id:\t%u\n", + "link_id:\t%u\n", bpf_link_type_strs[link->type], - link->id, - prog_tag, - prog->aux->id); + link->id); + if (prog) { + bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); + seq_printf(m, + "prog_tag:\t%s\n" + "prog_id:\t%u\n", + prog_tag, + prog->aux->id); + } if (link->ops->show_fdinfo) link->ops->show_fdinfo(link, m); } @@ -4314,7 +4317,8 @@ static int bpf_link_get_info_by_fd(struct file *file, info.type = link->type; info.id = link->id; - info.prog_id = link->prog->aux->id; + if (link->prog) + info.prog_id = link->prog->aux->id; if (link->ops->fill_link_info) { err = link->ops->fill_link_info(link, &info); @@ -4577,6 +4581,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) if (CHECK_ATTR(BPF_LINK_CREATE)) return -EINVAL; + if (attr->link_create.attach_type == BPF_STRUCT_OPS) + return bpf_struct_ops_link_create(attr); + prog = bpf_prog_get(attr->link_create.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); -- cgit From aef56f2e918bf8fc8de25f0b36e8c2aba44116ec Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 22 Mar 2023 20:24:02 -0700 Subject: bpf: Update the struct_ops of a bpf_link. By improving the BPF_LINK_UPDATE command of bpf(), it should allow you to conveniently switch between different struct_ops on a single bpf_link. This would enable smoother transitions from one struct_ops to another. The struct_ops maps passing along with BPF_LINK_UPDATE should have the BPF_F_LINK flag. Signed-off-by: Kui-Feng Lee Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230323032405.3735486-6-kuifeng@meta.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/syscall.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'kernel/bpf/syscall.c') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 21f76698875c..b4d758fa5981 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4682,6 +4682,35 @@ out: return ret; } +static int link_update_map(struct bpf_link *link, union bpf_attr *attr) +{ + struct bpf_map *new_map, *old_map = NULL; + int ret; + + new_map = bpf_map_get(attr->link_update.new_map_fd); + if (IS_ERR(new_map)) + return -EINVAL; + + if (attr->link_update.flags & BPF_F_REPLACE) { + old_map = bpf_map_get(attr->link_update.old_map_fd); + if (IS_ERR(old_map)) { + ret = -EINVAL; + goto out_put; + } + } else if (attr->link_update.old_map_fd) { + ret = -EINVAL; + goto out_put; + } + + ret = link->ops->update_map(link, new_map, old_map); + + if (old_map) + bpf_map_put(old_map); +out_put: + bpf_map_put(new_map); + return ret; +} + #define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd static int link_update(union bpf_attr *attr) @@ -4702,6 +4731,11 @@ static int link_update(union bpf_attr *attr) if (IS_ERR(link)) return PTR_ERR(link); + if (link->ops->update_map) { + ret = link_update_map(link, attr); + goto out_put_link; + } + new_prog = bpf_prog_get(attr->link_update.new_prog_fd); if (IS_ERR(new_prog)) { ret = PTR_ERR(new_prog); -- cgit From 55fbae05476df65e5eee8be54f61d0257af0240b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 24 Mar 2023 11:42:41 -0700 Subject: bpf: Check IS_ERR for the bpf_map_get() return value This patch fixes a mistake in checking NULL instead of checking IS_ERR for the bpf_map_get() return value. It also fixes the return value in link_update_map() from -EINVAL to PTR_ERR(*_map). Reported-by: syzbot+71ccc0fe37abb458406b@syzkaller.appspotmail.com Fixes: 68b04864ca42 ("bpf: Create links for BPF struct_ops maps.") Fixes: aef56f2e918b ("bpf: Update the struct_ops of a bpf_link.") Signed-off-by: Martin KaFai Lau Acked-by: Kui-Feng Lee Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20230324184241.1387437-1-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/bpf/syscall.c') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b4d758fa5981..a09597c95029 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4689,12 +4689,12 @@ static int link_update_map(struct bpf_link *link, union bpf_attr *attr) new_map = bpf_map_get(attr->link_update.new_map_fd); if (IS_ERR(new_map)) - return -EINVAL; + return PTR_ERR(new_map); if (attr->link_update.flags & BPF_F_REPLACE) { old_map = bpf_map_get(attr->link_update.old_map_fd); if (IS_ERR(old_map)) { - ret = -EINVAL; + ret = PTR_ERR(old_map); goto out_put; } } else if (attr->link_update.old_map_fd) { -- cgit From 1431d0b584a673ea690c88a5f7e1aedd9caf0e84 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Sat, 25 Mar 2023 16:31:42 -0500 Subject: bpf: Only invoke kptr dtor following non-NULL xchg When a map value is being freed, we loop over all of the fields of the corresponding BPF object and issue the appropriate cleanup calls corresponding to the field's type. If the field is a referenced kptr, we atomically xchg the value out of the map, and invoke the kptr's destructor on whatever was there before (or bpf_obj_drop() it if it was a local kptr). Currently, we always invoke the destructor (either bpf_obj_drop() or the kptr's registered destructor) on any KPTR_REF-type field in a map, even if there wasn't a value in the map. This means that any function serving as the kptr's KF_RELEASE destructor must always treat the argument as possibly NULL, as the following can and regularly does happen: void *xchgd_field; /* No value was in the map, so xchgd_field is NULL */ xchgd_field = (void *)xchg(unsigned long *field_ptr, 0); field->kptr.dtor(xchgd_field); These are odd semantics to impose on KF_RELEASE kfuncs -- BPF programs are prohibited by the verifier from passing NULL pointers to KF_RELEASE kfuncs, so it doesn't make sense to require this of BPF programs, but not the main kernel destructor path. It's also unnecessary to invoke any cleanup logic for local kptrs. If there is no object there, there's nothing to drop. So as to allow KF_RELEASE kfuncs to fully assume that an argument is non-NULL, this patch updates a KPTR_REF's destructor to only be invoked when a non-NULL value is xchg'd out of the kptr map field. Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20230325213144.486885-2-void@manifault.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/bpf/syscall.c') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a09597c95029..e18ac7fdc210 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -677,6 +677,9 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) break; case BPF_KPTR_REF: xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0); + if (!xchgd_field) + break; + if (!btf_is_kernel(field->kptr.btf)) { pointee_struct_meta = btf_find_struct_meta(field->kptr.btf, field->kptr.btf_id); -- cgit From 47a71c1f9af0a334c9dfa97633c41de4feda4287 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:58 -0700 Subject: bpf: Add log_true_size output field to return necessary log buffer size Add output-only log_true_size and btf_log_true_size field to BPF_PROG_LOAD and BPF_BTF_LOAD commands, respectively. It will return the size of log buffer necessary to fit in all the log contents at specified log_level. This is very useful for BPF loader libraries like libbpf to be able to size log buffer correctly, but could be used by users directly, if necessary, as well. This patch plumbs all this through the code, taking into account actual bpf_attr size provided by user to determine if these new fields are expected by users. And if they are, set them from kernel on return. We refactory btf_parse() function to accommodate this, moving attr and uattr handling inside it. The rest is very straightforward code, which is split from the logging accounting changes in the previous patch to make it simpler to review logic vs UAPI changes. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-13-andrii@kernel.org --- kernel/bpf/syscall.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel/bpf/syscall.c') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e18ac7fdc210..6d575505f89c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2501,9 +2501,9 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size +#define BPF_PROG_LOAD_LAST_FIELD log_true_size -static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) +static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { enum bpf_prog_type type = attr->prog_type; struct bpf_prog *prog, *dst_prog = NULL; @@ -2653,7 +2653,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) goto free_prog_sec; /* run eBPF verifier */ - err = bpf_check(&prog, attr, uattr); + err = bpf_check(&prog, attr, uattr, uattr_size); if (err < 0) goto free_used_maps; @@ -4371,9 +4371,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, return err; } -#define BPF_BTF_LOAD_LAST_FIELD btf_log_level +#define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size -static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr) +static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { if (CHECK_ATTR(BPF_BTF_LOAD)) return -EINVAL; @@ -4381,7 +4381,7 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr) if (!bpf_capable()) return -EPERM; - return btf_new_fd(attr, uattr); + return btf_new_fd(attr, uattr, uattr_size); } #define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id @@ -5059,7 +5059,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) err = map_freeze(&attr); break; case BPF_PROG_LOAD: - err = bpf_prog_load(&attr, uattr); + err = bpf_prog_load(&attr, uattr, size); break; case BPF_OBJ_PIN: err = bpf_obj_pin(&attr); @@ -5104,7 +5104,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) err = bpf_raw_tracepoint_open(&attr); break; case BPF_BTF_LOAD: - err = bpf_btf_load(&attr, uattr); + err = bpf_btf_load(&attr, uattr, size); break; case BPF_BTF_GET_FD_BY_ID: err = bpf_btf_get_fd_by_id(&attr); -- cgit From cd2a8079014aced27da9b2e669784f31680f1351 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Sat, 15 Apr 2023 13:18:03 -0700 Subject: bpf: Remove btf_field_offs, use btf_record's fields instead The btf_field_offs struct contains (offset, size) for btf_record fields, sorted by offset. btf_field_offs is always used in conjunction with btf_record, which has btf_field 'fields' array with (offset, type), the latter of which btf_field_offs' size is derived from via btf_field_type_size. This patch adds a size field to struct btf_field and sorts btf_record's fields by offset, making it possible to get rid of btf_field_offs. Less data duplication and less code complexity results. Since btf_field_offs' lifetime closely followed the btf_record used to populate it, most complexity wins are from removal of initialization code like: if (btf_record_successfully_initialized) { foffs = btf_parse_field_offs(rec); if (IS_ERR_OR_NULL(foffs)) // free the btf_record and return err } Other changes in this patch are pretty mechanical: * foffs->field_off[i] -> rec->fields[i].offset * foffs->field_sz[i] -> rec->fields[i].size * Sort rec->fields in btf_parse_fields before returning * It's possible that this is necessary independently of other changes in this patch. btf_record_find in syscall.c expects btf_record's fields to be sorted by offset, yet there's no explicit sorting of them before this patch, record's fields are populated in the order they're read from BTF struct definition. BTF docs don't say anything about the sortedness of struct fields. * All functions taking struct btf_field_offs * input now instead take struct btf_record *. All callsites of these functions already have access to the correct btf_record. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230415201811.343116-2-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'kernel/bpf/syscall.c') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6d575505f89c..c08b7933bf8f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -717,14 +717,13 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); - struct btf_field_offs *foffs = map->field_offs; struct btf_record *rec = map->record; security_bpf_map_free(map); bpf_map_release_memcg(map); /* implementation dependent freeing */ map->ops->map_free(map); - /* Delay freeing of field_offs and btf_record for maps, as map_free + /* Delay freeing of btf_record for maps, as map_free * callback usually needs access to them. It is better to do it here * than require each callback to do the free itself manually. * @@ -733,7 +732,6 @@ static void bpf_map_free_deferred(struct work_struct *work) * eventually calls bpf_map_free_meta, since inner_map_meta is only a * template bpf_map struct used during verification. */ - kfree(foffs); btf_record_free(rec); } @@ -1125,7 +1123,6 @@ free_map_tab: static int map_create(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); - struct btf_field_offs *foffs; struct bpf_map *map; int f_flags; int err; @@ -1205,17 +1202,9 @@ static int map_create(union bpf_attr *attr) attr->btf_vmlinux_value_type_id; } - - foffs = btf_parse_field_offs(map->record); - if (IS_ERR(foffs)) { - err = PTR_ERR(foffs); - goto free_map; - } - map->field_offs = foffs; - err = security_bpf_map_alloc(map); if (err) - goto free_map_field_offs; + goto free_map; err = bpf_map_alloc_id(map); if (err) @@ -1239,8 +1228,6 @@ static int map_create(union bpf_attr *attr) free_map_sec: security_bpf_map_free(map); -free_map_field_offs: - kfree(map->field_offs); free_map: btf_put(map->btf); map->ops->map_free(map); -- cgit From d54730b50bae1f3119bd686d551d66f0fcc387ca Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Sat, 15 Apr 2023 13:18:04 -0700 Subject: bpf: Introduce opaque bpf_refcount struct and add btf_record plumbing A 'struct bpf_refcount' is added to the set of opaque uapi/bpf.h types meant for use in BPF programs. Similarly to other opaque types like bpf_spin_lock and bpf_rbtree_node, the verifier needs to know where in user-defined struct types a bpf_refcount can be located, so necessary btf_record plumbing is added to enable this. bpf_refcount is sized to hold a refcount_t. Similarly to bpf_spin_lock, the offset of a bpf_refcount is cached in btf_record as refcount_off in addition to being in the field array. Caching refcount_off makes sense for this field because further patches in the series will modify functions that take local kptrs (e.g. bpf_obj_drop) to change their behavior if the type they're operating on is refcounted. So enabling fast "is this type refcounted?" checks is desirable. No such verifier behavior changes are introduced in this patch, just logic to recognize 'struct bpf_refcount' in btf_record. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230415201811.343116-3-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel/bpf/syscall.c') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c08b7933bf8f..28eac7434d32 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -552,6 +552,7 @@ void btf_record_free(struct btf_record *rec) case BPF_RB_NODE: case BPF_SPIN_LOCK: case BPF_TIMER: + case BPF_REFCOUNT: /* Nothing to release */ break; default: @@ -599,6 +600,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_RB_NODE: case BPF_SPIN_LOCK: case BPF_TIMER: + case BPF_REFCOUNT: /* Nothing to acquire */ break; default: @@ -705,6 +707,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) break; case BPF_LIST_NODE: case BPF_RB_NODE: + case BPF_REFCOUNT: break; default: WARN_ON_ONCE(1); @@ -1032,7 +1035,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | - BPF_RB_ROOT, + BPF_RB_ROOT | BPF_REFCOUNT, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; @@ -1071,6 +1074,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_REFCOUNT: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_PERCPU_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && -- cgit From 69a8c792cd9518071dc801bb110e0f2210d9f958 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Mon, 17 Apr 2023 09:17:48 +0100 Subject: bpf: lirc program type should not require SYS_CAP_ADMIN Make it possible to load lirc program type with just CAP_BPF. There is nothing exceptional about lirc programs that means they require SYS_CAP_ADMIN. In order to attach or detach a lirc program type you need permission to open /dev/lirc0; if you have permission to do that, you can alter all sorts of lirc receiving options. Changing the IR protocol decoder is no different. Right now on a typical distribution /dev/lirc devices are only read/write by root. Ideally we would make them group read/write like other devices so that local users can use them without becoming root. Signed-off-by: Sean Young Link: https://lore.kernel.org/r/ZD0ArKpwnDBJZsrE@gofer.mess.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/bpf/syscall.c') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 28eac7434d32..bcf1a1920ddd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2454,7 +2454,6 @@ static bool is_net_admin_prog_type(enum bpf_prog_type prog_type) case BPF_PROG_TYPE_LWT_SEG6LOCAL: case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: - case BPF_PROG_TYPE_LIRC_MODE2: case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SOCK: -- cgit From 84601d6ee68ae820dec97450934797046d62db4b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 21 Apr 2023 19:02:54 +0200 Subject: bpf: add bpf_link support for BPF_NETFILTER programs Add bpf_link support skeleton. To keep this reviewable, no bpf program can be invoked yet, if a program is attached only a c-stub is called and not the actual bpf program. Defaults to 'y' if both netfilter and bpf syscall are enabled in kconfig. Uapi example usage: union bpf_attr attr = { }; attr.link_create.prog_fd = progfd; attr.link_create.attach_type = 0; /* unused */ attr.link_create.netfilter.pf = PF_INET; attr.link_create.netfilter.hooknum = NF_INET_LOCAL_IN; attr.link_create.netfilter.priority = -128; err = bpf(BPF_LINK_CREATE, &attr, sizeof(attr)); ... this would attach progfd to ipv4:input hook. Such hook gets removed automatically if the calling program exits. BPF_NETFILTER program invocation is added in followup change. NF_HOOK_OP_BPF enum will eventually be read from nfnetlink_hook, it allows to tell userspace which program is attached at the given hook when user runs 'nft hook list' command rather than just the priority and not-very-helpful 'this hook runs a bpf prog but I can't tell which one'. Will also be used to disallow registration of two bpf programs with same priority in a followup patch. v4: arm32 cmpxchg only supports 32bit operand s/prio/priority/ v3: restrict prog attachment to ip/ip6 for now, lets lift restrictions if more use cases pop up (arptables, ebtables, netdev ingress/egress etc). Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20230421170300.24115-2-fw@strlen.de Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel/bpf/syscall.c') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index bcf1a1920ddd..14f39c1e573e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -35,6 +35,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -2462,6 +2463,7 @@ static bool is_net_admin_prog_type(enum bpf_prog_type prog_type) case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_EXT: /* extends any prog */ + case BPF_PROG_TYPE_NETFILTER: return true; case BPF_PROG_TYPE_CGROUP_SKB: /* always unpriv */ @@ -4588,6 +4590,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) switch (prog->type) { case BPF_PROG_TYPE_EXT: + case BPF_PROG_TYPE_NETFILTER: break; case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_TRACEPOINT: @@ -4654,6 +4657,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_XDP: ret = bpf_xdp_link_attach(attr, prog); break; + case BPF_PROG_TYPE_NETFILTER: + ret = bpf_nf_link_attach(attr, prog); + break; #endif case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_TRACEPOINT: -- cgit From 132328e8e85174ea788faf8f627c33258c88fbad Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 5 Jun 2023 15:14:45 +0200 Subject: bpf: netfilter: Add BPF_NETFILTER bpf_attach_type Andrii Nakryiko writes: And we currently don't have an attach type for NETLINK BPF link. Thankfully it's not too late to add it. I see that link_create() in kernel/bpf/syscall.c just bypasses attach_type check. We shouldn't have done that. Instead we need to add BPF_NETLINK attach type to enum bpf_attach_type. And wire all that properly throughout the kernel and libbpf itself. This adds BPF_NETFILTER and uses it. This breaks uabi but this wasn't in any non-rc release yet, so it should be fine. v2: check link_attack prog type in link_create too Fixes: 84601d6ee68a ("bpf: add bpf_link support for BPF_NETFILTER programs") Suggested-by: Andrii Nakryiko Signed-off-by: Florian Westphal Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/CAEf4BzZ69YgrQW7DHCJUT_X+GqMq_ZQQPBwopaJJVGFD5=d5Vg@mail.gmail.com/ Link: https://lore.kernel.org/bpf/20230605131445.32016-1-fw@strlen.de --- kernel/bpf/syscall.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel/bpf/syscall.c') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 14f39c1e573e..0c21d0d8efe4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2433,6 +2433,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_NETFILTER: + if (expected_attach_type == BPF_NETFILTER) + return 0; + return -EINVAL; case BPF_PROG_TYPE_SYSCALL: case BPF_PROG_TYPE_EXT: if (expected_attach_type) @@ -4590,7 +4594,12 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) switch (prog->type) { case BPF_PROG_TYPE_EXT: + break; case BPF_PROG_TYPE_NETFILTER: + if (attr->link_create.attach_type != BPF_NETFILTER) { + ret = -EINVAL; + goto out; + } break; case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_TRACEPOINT: -- cgit From db8eae6bc5c702d8e3ab2d0c6bb5976c131576eb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 18 Jun 2023 15:14:14 +0200 Subject: bpf: Force kprobe multi expected_attach_type for kprobe_multi link We currently allow to create perf link for program with expected_attach_type == BPF_TRACE_KPROBE_MULTI. This will cause crash when we call helpers like get_attach_cookie or get_func_ip in such program, because it will call the kprobe_multi's version (current->bpf_ctx context setup) of those helpers while it expects perf_link's current->bpf_ctx context setup. Making sure that we use BPF_TRACE_KPROBE_MULTI expected_attach_type only for programs attaching through kprobe_multi link. Fixes: ca74823c6e16 ("bpf: Add cookie support to programs attached with kprobe multi link") Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20230618131414.75649-1-jolsa@kernel.org --- kernel/bpf/syscall.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel/bpf/syscall.c') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0c21d0d8efe4..f1c8733f76b8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3440,6 +3440,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, return prog->enforce_expected_attach_type && prog->expected_attach_type != attach_type ? -EINVAL : 0; + case BPF_PROG_TYPE_KPROBE: + if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI && + attach_type != BPF_TRACE_KPROBE_MULTI) + return -EINVAL; + return 0; default: return 0; } -- cgit