From 1cf3bfc60f9836f44da951f58b6ae24680484b35 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 13 Apr 2023 01:06:32 +0200 Subject: bpf: Support 64-bit pointers to kfuncs test_ksyms_module fails to emit a kfunc call targeting a module on s390x, because the verifier stores the difference between kfunc address and __bpf_call_base in bpf_insn.imm, which is s32, and modules are roughly (1 << 42) bytes away from the kernel on s390x. Fix by keeping BTF id in bpf_insn.imm for BPF_PSEUDO_KFUNC_CALLs, and storing the absolute address in bpf_kfunc_desc. Introduce bpf_jit_supports_far_kfunc_call() in order to limit this new behavior to the s390x JIT. Otherwise other JITs need to be modified, which is not desired. Introduce bpf_get_kfunc_addr() instead of exposing both find_kfunc_desc() and struct bpf_kfunc_desc. In addition to sorting kfuncs by imm, also sort them by offset, in order to handle conflicting imms from different modules. Do this on all architectures in order to simplify code. Factor out resolving specialized kfuncs (XPD and dynptr) from fixup_kfunc_call(). This was required in the first place, because fixup_kfunc_call() uses find_kfunc_desc(), which returns a const pointer, so it's not possible to modify kfunc addr without stripping const, which is not nice. It also removes repetition of code like: if (bpf_jit_supports_far_kfunc_call()) desc->addr = func; else insn->imm = BPF_CALL_IMM(func); and separates kfunc_desc_tab fixups from kfunc_call fixups. Suggested-by: Jiri Olsa Signed-off-by: Ilya Leoshkevich Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20230412230632.885985-1-iii@linux.ibm.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 10 ++++++++++ include/linux/filter.h | 1 + 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2c6095bd7d69..88845aadc47d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2295,6 +2295,9 @@ bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); const struct btf_func_model * bpf_jit_find_kfunc_model(const struct bpf_prog *prog, const struct bpf_insn *insn); +int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id, + u16 btf_fd_idx, u8 **func_addr); + struct bpf_core_ctx { struct bpf_verifier_log *log; const struct btf *btf; @@ -2545,6 +2548,13 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog, return NULL; } +static inline int +bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id, + u16 btf_fd_idx, u8 **func_addr) +{ + return -ENOTSUPP; +} + static inline bool unprivileged_ebpf_enabled(void) { return false; diff --git a/include/linux/filter.h b/include/linux/filter.h index 5364b0c52c1d..bbce89937fde 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -920,6 +920,7 @@ void bpf_jit_compile(struct bpf_prog *prog); bool bpf_jit_needs_zext(void); bool bpf_jit_supports_subprog_tailcalls(void); bool bpf_jit_supports_kfunc_call(void); +bool bpf_jit_supports_far_kfunc_call(void); bool bpf_helper_changes_pkt_data(void *func); static inline bool bpf_dump_raw_ok(const struct cred *cred) -- cgit From cd2a8079014aced27da9b2e669784f31680f1351 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Sat, 15 Apr 2023 13:18:03 -0700 Subject: bpf: Remove btf_field_offs, use btf_record's fields instead The btf_field_offs struct contains (offset, size) for btf_record fields, sorted by offset. btf_field_offs is always used in conjunction with btf_record, which has btf_field 'fields' array with (offset, type), the latter of which btf_field_offs' size is derived from via btf_field_type_size. This patch adds a size field to struct btf_field and sorts btf_record's fields by offset, making it possible to get rid of btf_field_offs. Less data duplication and less code complexity results. Since btf_field_offs' lifetime closely followed the btf_record used to populate it, most complexity wins are from removal of initialization code like: if (btf_record_successfully_initialized) { foffs = btf_parse_field_offs(rec); if (IS_ERR_OR_NULL(foffs)) // free the btf_record and return err } Other changes in this patch are pretty mechanical: * foffs->field_off[i] -> rec->fields[i].offset * foffs->field_sz[i] -> rec->fields[i].size * Sort rec->fields in btf_parse_fields before returning * It's possible that this is necessary independently of other changes in this patch. btf_record_find in syscall.c expects btf_record's fields to be sorted by offset, yet there's no explicit sorting of them before this patch, record's fields are populated in the order they're read from BTF struct definition. BTF docs don't say anything about the sortedness of struct fields. * All functions taking struct btf_field_offs * input now instead take struct btf_record *. All callsites of these functions already have access to the correct btf_record. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230415201811.343116-2-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 44 +++++++++++++++++++------------------------- include/linux/btf.h | 2 -- 2 files changed, 19 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 88845aadc47d..7888ed497432 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -210,6 +210,7 @@ struct btf_field_graph_root { struct btf_field { u32 offset; + u32 size; enum btf_field_type type; union { struct btf_field_kptr kptr; @@ -225,12 +226,6 @@ struct btf_record { struct btf_field fields[]; }; -struct btf_field_offs { - u32 cnt; - u32 field_off[BTF_FIELDS_MAX]; - u8 field_sz[BTF_FIELDS_MAX]; -}; - struct bpf_map { /* The first two cachelines with read-mostly members of which some * are also accessed in fast-path (e.g. ops, max_entries). @@ -257,7 +252,6 @@ struct bpf_map { struct obj_cgroup *objcg; #endif char name[BPF_OBJ_NAME_LEN]; - struct btf_field_offs *field_offs; /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. */ @@ -360,14 +354,14 @@ static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_f return rec->field_mask & type; } -static inline void bpf_obj_init(const struct btf_field_offs *foffs, void *obj) +static inline void bpf_obj_init(const struct btf_record *rec, void *obj) { int i; - if (!foffs) + if (IS_ERR_OR_NULL(rec)) return; - for (i = 0; i < foffs->cnt; i++) - memset(obj + foffs->field_off[i], 0, foffs->field_sz[i]); + for (i = 0; i < rec->cnt; i++) + memset(obj + rec->fields[i].offset, 0, rec->fields[i].size); } /* 'dst' must be a temporary buffer and should not point to memory that is being @@ -379,7 +373,7 @@ static inline void bpf_obj_init(const struct btf_field_offs *foffs, void *obj) */ static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { - bpf_obj_init(map->field_offs, dst); + bpf_obj_init(map->record, dst); } /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and @@ -399,14 +393,14 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) } /* copy everything but bpf_spin_lock, bpf_timer, and kptrs. There could be one of each. */ -static inline void bpf_obj_memcpy(struct btf_field_offs *foffs, +static inline void bpf_obj_memcpy(struct btf_record *rec, void *dst, void *src, u32 size, bool long_memcpy) { u32 curr_off = 0; int i; - if (likely(!foffs)) { + if (IS_ERR_OR_NULL(rec)) { if (long_memcpy) bpf_long_memcpy(dst, src, round_up(size, 8)); else @@ -414,49 +408,49 @@ static inline void bpf_obj_memcpy(struct btf_field_offs *foffs, return; } - for (i = 0; i < foffs->cnt; i++) { - u32 next_off = foffs->field_off[i]; + for (i = 0; i < rec->cnt; i++) { + u32 next_off = rec->fields[i].offset; u32 sz = next_off - curr_off; memcpy(dst + curr_off, src + curr_off, sz); - curr_off += foffs->field_sz[i] + sz; + curr_off += rec->fields[i].size + sz; } memcpy(dst + curr_off, src + curr_off, size - curr_off); } static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) { - bpf_obj_memcpy(map->field_offs, dst, src, map->value_size, false); + bpf_obj_memcpy(map->record, dst, src, map->value_size, false); } static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src) { - bpf_obj_memcpy(map->field_offs, dst, src, map->value_size, true); + bpf_obj_memcpy(map->record, dst, src, map->value_size, true); } -static inline void bpf_obj_memzero(struct btf_field_offs *foffs, void *dst, u32 size) +static inline void bpf_obj_memzero(struct btf_record *rec, void *dst, u32 size) { u32 curr_off = 0; int i; - if (likely(!foffs)) { + if (IS_ERR_OR_NULL(rec)) { memset(dst, 0, size); return; } - for (i = 0; i < foffs->cnt; i++) { - u32 next_off = foffs->field_off[i]; + for (i = 0; i < rec->cnt; i++) { + u32 next_off = rec->fields[i].offset; u32 sz = next_off - curr_off; memset(dst + curr_off, 0, sz); - curr_off += foffs->field_sz[i] + sz; + curr_off += rec->fields[i].size + sz; } memset(dst + curr_off, 0, size - curr_off); } static inline void zero_map_value(struct bpf_map *map, void *dst) { - bpf_obj_memzero(map->field_offs, dst, map->value_size); + bpf_obj_memzero(map->record, dst, map->value_size); } void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, diff --git a/include/linux/btf.h b/include/linux/btf.h index 495250162422..813227bff58a 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -113,7 +113,6 @@ struct btf_id_dtor_kfunc { struct btf_struct_meta { u32 btf_id; struct btf_record *record; - struct btf_field_offs *field_offs; }; struct btf_struct_metas { @@ -207,7 +206,6 @@ int btf_find_timer(const struct btf *btf, const struct btf_type *t); struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, u32 field_mask, u32 value_size); int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec); -struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, -- cgit From d54730b50bae1f3119bd686d551d66f0fcc387ca Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Sat, 15 Apr 2023 13:18:04 -0700 Subject: bpf: Introduce opaque bpf_refcount struct and add btf_record plumbing A 'struct bpf_refcount' is added to the set of opaque uapi/bpf.h types meant for use in BPF programs. Similarly to other opaque types like bpf_spin_lock and bpf_rbtree_node, the verifier needs to know where in user-defined struct types a bpf_refcount can be located, so necessary btf_record plumbing is added to enable this. bpf_refcount is sized to hold a refcount_t. Similarly to bpf_spin_lock, the offset of a bpf_refcount is cached in btf_record as refcount_off in addition to being in the field array. Caching refcount_off makes sense for this field because further patches in the series will modify functions that take local kptrs (e.g. bpf_obj_drop) to change their behavior if the type they're operating on is refcounted. So enabling fast "is this type refcounted?" checks is desirable. No such verifier behavior changes are introduced in this patch, just logic to recognize 'struct bpf_refcount' in btf_record. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230415201811.343116-3-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7888ed497432..be44d765b7a4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -187,6 +187,7 @@ enum btf_field_type { BPF_RB_NODE = (1 << 7), BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD | BPF_RB_NODE | BPF_RB_ROOT, + BPF_REFCOUNT = (1 << 8), }; typedef void (*btf_dtor_kfunc_t)(void *); @@ -223,6 +224,7 @@ struct btf_record { u32 field_mask; int spin_lock_off; int timer_off; + int refcount_off; struct btf_field fields[]; }; @@ -293,6 +295,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) return "bpf_rb_root"; case BPF_RB_NODE: return "bpf_rb_node"; + case BPF_REFCOUNT: + return "bpf_refcount"; default: WARN_ON_ONCE(1); return "unknown"; @@ -317,6 +321,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type) return sizeof(struct bpf_rb_root); case BPF_RB_NODE: return sizeof(struct bpf_rb_node); + case BPF_REFCOUNT: + return sizeof(struct bpf_refcount); default: WARN_ON_ONCE(1); return 0; @@ -341,6 +347,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type) return __alignof__(struct bpf_rb_root); case BPF_RB_NODE: return __alignof__(struct bpf_rb_node); + case BPF_REFCOUNT: + return __alignof__(struct bpf_refcount); default: WARN_ON_ONCE(1); return 0; -- cgit From 1512217c47f0e8ea076dd0e67262e5a668a78f01 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Sat, 15 Apr 2023 13:18:05 -0700 Subject: bpf: Support refcounted local kptrs in existing semantics A local kptr is considered 'refcounted' when it is of a type that has a bpf_refcount field. When such a kptr is created, its refcount should be initialized to 1; when destroyed, the object should be free'd only if a refcount decr results in 0 refcount. Existing logic always frees the underlying memory when destroying a local kptr, and 0-initializes all btf_record fields. This patch adds checks for "is local kptr refcounted?" and new logic for that case in the appropriate places. This patch focuses on changing existing semantics and thus conspicuously does _not_ provide a way for BPF programs in increment refcount. That follows later in the series. __bpf_obj_drop_impl is modified to do the right thing when it sees a refcounted type. Container types for graph nodes (list, tree, stashed in map) are migrated to use __bpf_obj_drop_impl as a destructor for their nodes instead of each having custom destruction code in their _free paths. Now that "drop" isn't a synonym for "free" when the type is refcounted it makes sense to centralize this logic. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230415201811.343116-4-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index be44d765b7a4..b065de2cfcea 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -370,6 +370,9 @@ static inline void bpf_obj_init(const struct btf_record *rec, void *obj) return; for (i = 0; i < rec->cnt; i++) memset(obj + rec->fields[i].offset, 0, rec->fields[i].size); + + if (rec->refcount_off >= 0) + refcount_set((refcount_t *)(obj + rec->refcount_off), 1); } /* 'dst' must be a temporary buffer and should not point to memory that is being -- cgit From d2dcc67df910dd85253a701b6a5b747f955d28f5 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Sat, 15 Apr 2023 13:18:07 -0700 Subject: bpf: Migrate bpf_rbtree_add and bpf_list_push_{front,back} to possibly fail Consider this code snippet: struct node { long key; bpf_list_node l; bpf_rb_node r; bpf_refcount ref; } int some_bpf_prog(void *ctx) { struct node *n = bpf_obj_new(/*...*/), *m; bpf_spin_lock(&glock); bpf_rbtree_add(&some_tree, &n->r, /* ... */); m = bpf_refcount_acquire(n); bpf_rbtree_add(&other_tree, &m->r, /* ... */); bpf_spin_unlock(&glock); /* ... */ } After bpf_refcount_acquire, n and m point to the same underlying memory, and that node's bpf_rb_node field is being used by the some_tree insert, so overwriting it as a result of the second insert is an error. In order to properly support refcounted nodes, the rbtree and list insert functions must be allowed to fail. This patch adds such support. The kfuncs bpf_rbtree_add, bpf_list_push_{front,back} are modified to return an int indicating success/failure, with 0 -> success, nonzero -> failure. bpf_obj_drop on failure ======================= Currently the only reason an insert can fail is the example above: the bpf_{list,rb}_node is already in use. When such a failure occurs, the insert kfuncs will bpf_obj_drop the input node. This allows the insert operations to logically fail without changing their verifier owning ref behavior, namely the unconditional release_reference of the input owning ref. With insert that always succeeds, ownership of the node is always passed to the collection, since the node always ends up in the collection. With a possibly-failed insert w/ bpf_obj_drop, ownership of the node is always passed either to the collection (success), or to bpf_obj_drop (failure). Regardless, it's correct to continue unconditionally releasing the input owning ref, as something is always taking ownership from the calling program on insert. Keeping owning ref behavior unchanged results in a nice default UX for insert functions that can fail. If the program's reaction to a failed insert is "fine, just get rid of this owning ref for me and let me go on with my business", then there's no reason to check for failure since that's default behavior. e.g.: long important_failures = 0; int some_bpf_prog(void *ctx) { struct node *n, *m, *o; /* all bpf_obj_new'd */ bpf_spin_lock(&glock); bpf_rbtree_add(&some_tree, &n->node, /* ... */); bpf_rbtree_add(&some_tree, &m->node, /* ... */); if (bpf_rbtree_add(&some_tree, &o->node, /* ... */)) { important_failures++; } bpf_spin_unlock(&glock); } If we instead chose to pass ownership back to the program on failed insert - by returning NULL on success or an owning ref on failure - programs would always have to do something with the returned ref on failure. The most likely action is probably "I'll just get rid of this owning ref and go about my business", which ideally would look like: if (n = bpf_rbtree_add(&some_tree, &n->node, /* ... */)) bpf_obj_drop(n); But bpf_obj_drop isn't allowed in a critical section and inserts must occur within one, so in reality error handling would become a hard-to-parse mess. For refcounted nodes, we can replicate the "pass ownership back to program on failure" logic with this patch's semantics, albeit in an ugly way: struct node *n = bpf_obj_new(/* ... */), *m; bpf_spin_lock(&glock); m = bpf_refcount_acquire(n); if (bpf_rbtree_add(&some_tree, &n->node, /* ... */)) { /* Do something with m */ } bpf_spin_unlock(&glock); bpf_obj_drop(m); bpf_refcount_acquire is used to simulate "return owning ref on failure". This should be an uncommon occurrence, though. Addition of two verifier-fixup'd args to collection inserts =========================================================== The actual bpf_obj_drop kfunc is bpf_obj_drop_impl(void *, struct btf_struct_meta *), with bpf_obj_drop macro populating the second arg with 0 and the verifier later filling in the arg during insn fixup. Because bpf_rbtree_add and bpf_list_push_{front,back} now might do bpf_obj_drop, these kfuncs need a btf_struct_meta parameter that can be passed to bpf_obj_drop_impl. Similarly, because the 'node' param to those insert functions is the bpf_{list,rb}_node within the node type, and bpf_obj_drop expects a pointer to the beginning of the node, the insert functions need to be able to find the beginning of the node struct. A second verifier-populated param is necessary: the offset of {list,rb}_node within the node type. These two new params allow the insert kfuncs to correctly call __bpf_obj_drop_impl: beginning_of_node = bpf_rb_node_ptr - offset if (already_inserted) __bpf_obj_drop_impl(beginning_of_node, btf_struct_meta->record); Similarly to other kfuncs with "hidden" verifier-populated params, the insert functions are renamed with _impl prefix and a macro is provided for common usage. For example, bpf_rbtree_add kfunc is now bpf_rbtree_add_impl and bpf_rbtree_add is now a macro which sets "hidden" args to 0. Due to the two new args BPF progs will need to be recompiled to work with the new _impl kfuncs. This patch also rewrites the "hidden argument" explanation to more directly say why the BPF program writer doesn't need to populate the arguments with anything meaningful. How does this new logic affect non-owning references? ===================================================== Currently, non-owning refs are valid until the end of the critical section in which they're created. We can make this guarantee because, if a non-owning ref exists, the referent was added to some collection. The collection will drop() its nodes when it goes away, but it can't go away while our program is accessing it, so that's not a problem. If the referent is removed from the collection in the same CS that it was added in, it can't be bpf_obj_drop'd until after CS end. Those are the only two ways to free the referent's memory and neither can happen until after the non-owning ref's lifetime ends. On first glance, having these collection insert functions potentially bpf_obj_drop their input seems like it breaks the "can't be bpf_obj_drop'd until after CS end" line of reasoning. But we care about the memory not being _freed_ until end of CS end, and a previous patch in the series modified bpf_obj_drop such that it doesn't free refcounted nodes until refcount == 0. So the statement can be more accurately rewritten as "can't be free'd until after CS end". We can prove that this rewritten statement holds for any non-owning reference produced by collection insert functions: * If the input to the insert function is _not_ refcounted * We have an owning reference to the input, and can conclude it isn't in any collection * Inserting a node in a collection turns owning refs into non-owning, and since our input type isn't refcounted, there's no way to obtain additional owning refs to the same underlying memory * Because our node isn't in any collection, the insert operation cannot fail, so bpf_obj_drop will not execute * If bpf_obj_drop is guaranteed not to execute, there's no risk of memory being free'd * Otherwise, the input to the insert function is refcounted * If the insert operation fails due to the node's list_head or rb_root already being in some collection, there was some previous successful insert which passed refcount to the collection * We have an owning reference to the input, it must have been acquired via bpf_refcount_acquire, which bumped the refcount * refcount must be >= 2 since there's a valid owning reference and the node is already in a collection * Insert triggering bpf_obj_drop will decr refcount to >= 1, never resulting in a free So although we may do bpf_obj_drop during the critical section, this will never result in memory being free'd, and no changes to non-owning ref logic are needed in this patch. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230415201811.343116-6-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index f03852b89d28..3dd29a53b711 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -464,7 +464,12 @@ struct bpf_insn_aux_data { */ struct bpf_loop_inline_state loop_inline_state; }; - u64 obj_new_size; /* remember the size of type passed to bpf_obj_new to rewrite R1 */ + union { + /* remember the size of type passed to bpf_obj_new to rewrite R1 */ + u64 obj_new_size; + /* remember the offset of node field within type to rewrite */ + u64 insert_off; + }; struct btf_struct_meta *kptr_struct_meta; u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ -- cgit From 3e81740a90626024a9d9c6f9bfa3d36204dafefb Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Sat, 15 Apr 2023 13:18:10 -0700 Subject: bpf: Centralize btf_field-specific initialization logic All btf_fields in an object are 0-initialized by memset in bpf_obj_init. This might not be a valid initial state for some field types, in which case kfuncs that use the type will properly initialize their input if it's been 0-initialized. Some BPF graph collection types and kfuncs do this: bpf_list_{head,node} and bpf_rb_node. An earlier patch in this series added the bpf_refcount field, for which the 0 state indicates that the refcounted object should be free'd. bpf_obj_init treats this field specially, setting refcount to 1 instead of relying on scattered "refcount is 0? Must have just been initialized, let's set to 1" logic in kfuncs. This patch extends this treatment to list and rbtree field types, allowing most scattered initialization logic in kfuncs to be removed. Note that bpf_{list_head,rb_root} may be inside a BPF map, in which case they'll be 0-initialized without passing through the newly-added logic, so scattered initialization logic must remain for these collection root types. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230415201811.343116-9-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b065de2cfcea..18b592fde896 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -355,6 +355,34 @@ static inline u32 btf_field_type_align(enum btf_field_type type) } } +static inline void bpf_obj_init_field(const struct btf_field *field, void *addr) +{ + memset(addr, 0, field->size); + + switch (field->type) { + case BPF_REFCOUNT: + refcount_set((refcount_t *)addr, 1); + break; + case BPF_RB_NODE: + RB_CLEAR_NODE((struct rb_node *)addr); + break; + case BPF_LIST_HEAD: + case BPF_LIST_NODE: + INIT_LIST_HEAD((struct list_head *)addr); + break; + case BPF_RB_ROOT: + /* RB_ROOT_CACHED 0-inits, no need to do anything after memset */ + case BPF_SPIN_LOCK: + case BPF_TIMER: + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + break; + default: + WARN_ON_ONCE(1); + return; + } +} + static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_field_type type) { if (IS_ERR_OR_NULL(rec)) @@ -369,10 +397,7 @@ static inline void bpf_obj_init(const struct btf_record *rec, void *obj) if (IS_ERR_OR_NULL(rec)) return; for (i = 0; i < rec->cnt; i++) - memset(obj + rec->fields[i].offset, 0, rec->fields[i].size); - - if (rec->refcount_off >= 0) - refcount_set((refcount_t *)(obj + rec->refcount_off), 1); + bpf_obj_init_field(&rec->fields[i], obj + rec->fields[i].offset); } /* 'dst' must be a temporary buffer and should not point to memory that is being -- cgit From 7b4ddf3920d247c2949073b9c274301c8131332a Mon Sep 17 00:00:00 2001 From: David Vernet Date: Sun, 16 Apr 2023 03:49:27 -0500 Subject: bpf: Remove KF_KPTR_GET kfunc flag We've managed to improve the UX for kptrs significantly over the last 9 months. All of the existing use cases which previously had KF_KPTR_GET kfuncs (struct bpf_cpumask *, struct task_struct *, and struct cgroup *) have all been updated to be synchronized using RCU. In other words, their KF_KPTR_GET kfuncs have been removed in favor of KF_RCU | KF_ACQUIRE kfuncs, with the pointers themselves also being readable from maps in an RCU read region thanks to the types being RCU safe. While KF_KPTR_GET was a logical starting point for kptrs, it's become clear that they're not the correct abstraction. KF_KPTR_GET is a flag that essentially does nothing other than enforcing that the argument to a function is a pointer to a referenced kptr map value. At first glance, that's a useful thing to guarantee to a kfunc. It gives kfuncs the ability to try and acquire a reference on that kptr without requiring the BPF prog to do something like this: struct kptr_type *in_map, *new = NULL; in_map = bpf_kptr_xchg(&map->value, NULL); if (in_map) { new = bpf_kptr_type_acquire(in_map); in_map = bpf_kptr_xchg(&map->value, in_map); if (in_map) bpf_kptr_type_release(in_map); } That's clearly a pretty ugly (and racy) UX, and if using KF_KPTR_GET is the only alternative, it's better than nothing. However, the problem with any KF_KPTR_GET kfunc lies in the fact that it always requires some kind of synchronization in order to safely do an opportunistic acquire of the kptr in the map. This is because a BPF program running on another CPU could do a bpf_kptr_xchg() on that map value, and free the kptr after it's been read by the KF_KPTR_GET kfunc. For example, the now-removed bpf_task_kptr_get() kfunc did the following: struct task_struct *bpf_task_kptr_get(struct task_struct **pp) { struct task_struct *p; rcu_read_lock(); p = READ_ONCE(*pp); /* If p is non-NULL, it could still be freed by another CPU, * so we have to do an opportunistic refcount_inc_not_zero() * and return NULL if the task will be freed after the * current RCU read region. */ |f (p && !refcount_inc_not_zero(&p->rcu_users)) p = NULL; rcu_read_unlock(); return p; } In other words, the kfunc uses RCU to ensure that the task remains valid after it's been peeked from the map. However, this is completely redundant with just defining a KF_RCU kfunc that itself does a refcount_inc_not_zero(), which is exactly what bpf_task_acquire() now does. So, the question of whether KF_KPTR_GET is useful is actually, "Are there any synchronization mechanisms / safety flags that are required by certain kptrs, but which are not provided by the verifier to kfuncs?" The answer to that question today is "No", because every kptr we currently care about is RCU protected. Even if the answer ever became "yes", the proper way to support that referenced kptr type would be to add support for whatever synchronization mechanism it requires in the verifier, rather than giving kfuncs a flag that says, "Here's a pointer to a referenced kptr in a map, do whatever you need to do." With all that said -- so as to allow us to consolidate the kfunc API, and simplify the verifier a bit, this patch removes KF_KPTR_GET, and all relevant logic from the verifier. Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20230416084928.326135-3-void@manifault.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 813227bff58a..508199e38415 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -18,7 +18,6 @@ #define KF_ACQUIRE (1 << 0) /* kfunc is an acquire function */ #define KF_RELEASE (1 << 1) /* kfunc is a release function */ #define KF_RET_NULL (1 << 2) /* kfunc returns a pointer that may be NULL */ -#define KF_KPTR_GET (1 << 3) /* kfunc returns reference to a kptr */ /* Trusted arguments are those which are guaranteed to be valid when passed to * the kfunc. It is used to enforce that pointers obtained from either acquire * kfuncs, or from the main kernel on a tracepoint or struct_ops callback -- cgit From 59e498a3289f685261c076b998a8a2f8a516874f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 17 Apr 2023 15:49:15 +0200 Subject: bpf: Set skb redirect and from_ingress info in __bpf_tx_skb MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are some use-cases where it is desirable to use bpf_redirect() in combination with ifb device, which currently is not supported, for example, around filtering inbound traffic with BPF to then push it to ifb which holds the qdisc for shaping in contrast to doing that on the egress device. Toke mentions the following case related to OpenWrt: Because there's not always a single egress on the other side. These are mainly home routers, which tend to have one or more WiFi devices bridged to one or more ethernet ports on the LAN side, and a single upstream WAN port. And the objective is to control the total amount of traffic going over the WAN link (in both directions), to deal with bufferbloat in the ISP network (which is sadly still all too prevalent). In this setup, the traffic can be split arbitrarily between the links on the LAN side, and the only "single bottleneck" is the WAN link. So we install both egress and ingress shapers on this, configured to something like 95-98% of the true link bandwidth, thus moving the queues into the qdisc layer in the router. It's usually necessary to set the ingress bandwidth shaper a bit lower than the egress due to being "downstream" of the bottleneck link, but it does work surprisingly well. We usually use something like a matchall filter to put all ingress traffic on the ifb, so doing the redirect from BPF has not been an immediate requirement thus far. However, it does seem a bit odd that this is not possible, and we do have a BPF-based filter that layers on top of this kind of setup, which currently uses u32 as the ingress filter and so it could presumably be improved to use BPF instead if that was available. Reported-by: Toke Høiland-Jørgensen Reported-by: Yafang Shao Reported-by: Tonghao Zhang Signed-off-by: Daniel Borkmann Acked-by: Yafang Shao Acked-by: Toke Høiland-Jørgensen Link: https://git.openwrt.org/?p=project/qosify.git;a=blob;f=README Link: https://lore.kernel.org/bpf/875y9yzbuy.fsf@toke.dk Link: https://lore.kernel.org/r/8cebc8b2b6e967e10cbafe2ffd6795050e74accd.1681739137.git.daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- include/linux/skbuff.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 494a23a976b0..9ff2e3d57329 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -5041,6 +5041,15 @@ static inline void skb_reset_redirect(struct sk_buff *skb) skb->redirected = 0; } +static inline void skb_set_redirected_noclear(struct sk_buff *skb, + bool from_ingress) +{ + skb->redirected = 1; +#ifdef CONFIG_NET_REDIRECT + skb->from_ingress = from_ingress; +#endif +} + static inline bool skb_csum_is_sctp(struct sk_buff *skb) { return skb->csum_not_inet; -- cgit From 84601d6ee68ae820dec97450934797046d62db4b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 21 Apr 2023 19:02:54 +0200 Subject: bpf: add bpf_link support for BPF_NETFILTER programs Add bpf_link support skeleton. To keep this reviewable, no bpf program can be invoked yet, if a program is attached only a c-stub is called and not the actual bpf program. Defaults to 'y' if both netfilter and bpf syscall are enabled in kconfig. Uapi example usage: union bpf_attr attr = { }; attr.link_create.prog_fd = progfd; attr.link_create.attach_type = 0; /* unused */ attr.link_create.netfilter.pf = PF_INET; attr.link_create.netfilter.hooknum = NF_INET_LOCAL_IN; attr.link_create.netfilter.priority = -128; err = bpf(BPF_LINK_CREATE, &attr, sizeof(attr)); ... this would attach progfd to ipv4:input hook. Such hook gets removed automatically if the calling program exits. BPF_NETFILTER program invocation is added in followup change. NF_HOOK_OP_BPF enum will eventually be read from nfnetlink_hook, it allows to tell userspace which program is attached at the given hook when user runs 'nft hook list' command rather than just the priority and not-very-helpful 'this hook runs a bpf prog but I can't tell which one'. Will also be used to disallow registration of two bpf programs with same priority in a followup patch. v4: arm32 cmpxchg only supports 32bit operand s/prio/priority/ v3: restrict prog attachment to ip/ip6 for now, lets lift restrictions if more use cases pop up (arptables, ebtables, netdev ingress/egress etc). Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20230421170300.24115-2-fw@strlen.de Signed-off-by: Alexei Starovoitov --- include/linux/netfilter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index c8e03bcaecaa..0762444e3767 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -80,6 +80,7 @@ typedef unsigned int nf_hookfn(void *priv, enum nf_hook_ops_type { NF_HOOK_OP_UNDEFINED, NF_HOOK_OP_NF_TABLES, + NF_HOOK_OP_BPF, }; struct nf_hook_ops { -- cgit From fd9c663b9ad67dedfc9a3fd3429ddd3e83782b4d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 21 Apr 2023 19:02:55 +0200 Subject: bpf: minimal support for programs hooked into netfilter framework This adds minimal support for BPF_PROG_TYPE_NETFILTER bpf programs that will be invoked via the NF_HOOK() points in the ip stack. Invocation incurs an indirect call. This is not a necessity: Its possible to add 'DEFINE_BPF_DISPATCHER(nf_progs)' and handle the program invocation with the same method already done for xdp progs. This isn't done here to keep the size of this chunk down. Verifier restricts verdicts to either DROP or ACCEPT. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20230421170300.24115-3-fw@strlen.de Signed-off-by: Alexei Starovoitov --- include/linux/bpf_types.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index d4ee3ccd3753..39a999abb0ce 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -79,6 +79,10 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LSM, lsm, #endif BPF_PROG_TYPE(BPF_PROG_TYPE_SYSCALL, bpf_syscall, void *, void *) +#ifdef CONFIG_NETFILTER +BPF_PROG_TYPE(BPF_PROG_TYPE_NETFILTER, netfilter, + struct bpf_nf_ctx, struct bpf_nf_ctx) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) -- cgit From 2b99ef22e0d237e08bfc437e7d051f78f352aeb2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 21 Apr 2023 19:02:59 +0200 Subject: bpf: add test_run support for netfilter program type add glue code so a bpf program can be run using userspace-provided netfilter state and packet/skb. Default is to use ipv4:output hook point, but this can be overridden by userspace. Userspace provided netfilter state is restricted, only hook and protocol families can be overridden and only to ipv4/ipv6. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20230421170300.24115-7-fw@strlen.de Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 18b592fde896..e53ceee1df37 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2264,6 +2264,9 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); +int bpf_prog_test_run_nf(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr); bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); -- cgit