From 45b5623f2d721c25d1a2fdc8c4600fb4b7b61c75 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 2 Dec 2023 09:56:55 -0800 Subject: bpf: rearrange bpf_func_state fields to save a bit of memory It's a trivial rearrangement saving 8 bytes. We have 4 bytes of padding at the end which can be filled with another field without increasing struct bpf_func_state. copy_func_state() logic remains correct without any further changes. BEFORE ====== struct bpf_func_state { struct bpf_reg_state regs[11]; /* 0 1320 */ /* --- cacheline 20 boundary (1280 bytes) was 40 bytes ago --- */ int callsite; /* 1320 4 */ u32 frameno; /* 1324 4 */ u32 subprogno; /* 1328 4 */ u32 async_entry_cnt; /* 1332 4 */ bool in_callback_fn; /* 1336 1 */ /* XXX 7 bytes hole, try to pack */ /* --- cacheline 21 boundary (1344 bytes) --- */ struct tnum callback_ret_range; /* 1344 16 */ bool in_async_callback_fn; /* 1360 1 */ bool in_exception_callback_fn; /* 1361 1 */ /* XXX 2 bytes hole, try to pack */ int acquired_refs; /* 1364 4 */ struct bpf_reference_state * refs; /* 1368 8 */ int allocated_stack; /* 1376 4 */ /* XXX 4 bytes hole, try to pack */ struct bpf_stack_state * stack; /* 1384 8 */ /* size: 1392, cachelines: 22, members: 13 */ /* sum members: 1379, holes: 3, sum holes: 13 */ /* last cacheline: 48 bytes */ }; AFTER ===== struct bpf_func_state { struct bpf_reg_state regs[11]; /* 0 1320 */ /* --- cacheline 20 boundary (1280 bytes) was 40 bytes ago --- */ int callsite; /* 1320 4 */ u32 frameno; /* 1324 4 */ u32 subprogno; /* 1328 4 */ u32 async_entry_cnt; /* 1332 4 */ struct tnum callback_ret_range; /* 1336 16 */ /* --- cacheline 21 boundary (1344 bytes) was 8 bytes ago --- */ bool in_callback_fn; /* 1352 1 */ bool in_async_callback_fn; /* 1353 1 */ bool in_exception_callback_fn; /* 1354 1 */ /* XXX 1 byte hole, try to pack */ int acquired_refs; /* 1356 4 */ struct bpf_reference_state * refs; /* 1360 8 */ struct bpf_stack_state * stack; /* 1368 8 */ int allocated_stack; /* 1376 4 */ /* size: 1384, cachelines: 22, members: 13 */ /* sum members: 1379, holes: 1, sum holes: 1 */ /* padding: 4 */ /* last cacheline: 40 bytes */ }; Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231202175705.885270-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d99a636d36a7..0c0e1bccad45 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -297,8 +297,8 @@ struct bpf_func_state { * void foo(void) { bpf_timer_set_callback(,foo); } */ u32 async_entry_cnt; - bool in_callback_fn; struct tnum callback_ret_range; + bool in_callback_fn; bool in_async_callback_fn; bool in_exception_callback_fn; /* For callback calling functions that limit number of possible @@ -316,8 +316,8 @@ struct bpf_func_state { /* The following fields should be last. 
See copy_func_state() */ int acquired_refs; struct bpf_reference_state *refs; - int allocated_stack; struct bpf_stack_state *stack; + int allocated_stack; }; struct bpf_idx_pair { -- cgit From 8fa4ecd49b81ccd9d1d87f1c8b2260e218644878 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 2 Dec 2023 09:56:58 -0800 Subject: bpf: enforce exact retval range on subprog/callback exit Instead of relying on potentially imprecise tnum representation of expected return value range for callbacks and subprogs, validate that smin/smax range satisfy exact expected range of return values. E.g., if callback would need to return [0, 2] range, tnum can't represent this precisely and instead will allow [0, 3] range. By checking smin/smax range, we can make sure that subprog/callback indeed returns only valid [0, 2] range. Acked-by: Eduard Zingerman Acked-by: Shung-Hsi Yu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231202175705.885270-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 0c0e1bccad45..3378cc753061 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -275,6 +275,11 @@ struct bpf_reference_state { int callback_ref; }; +struct bpf_retval_range { + s32 minval; + s32 maxval; +}; + /* state of the program: * type of all registers and stack info */ @@ -297,7 +302,7 @@ struct bpf_func_state { * void foo(void) { bpf_timer_set_callback(,foo); } */ u32 async_entry_cnt; - struct tnum callback_ret_range; + struct bpf_retval_range callback_ret_range; bool in_callback_fn; bool in_async_callback_fn; bool in_exception_callback_fn; -- cgit From 20c20bd11a0702ce4dc9300c3da58acf551d9725 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 4 Dec 2023 22:04:20 +0800 Subject: bpf: Add map and need_defer parameters to .map_fd_put_ptr() map is the pointer of outer map, and need_defer needs some explanation. need_defer tells the implementation to defer the reference release of the passed element and ensure that the element is still alive before the bpf program, which may manipulate it, exits. The following three cases will invoke map_fd_put_ptr() and different need_defer values will be passed to these callers: 1) release the reference of the old element in the map during map update or map deletion. The release must be deferred, otherwise the bpf program may incur use-after-free problem, so need_defer needs to be true. 2) release the reference of the to-be-added element in the error path of map update. The to-be-added element is not visible to any bpf program, so it is OK to pass false for need_defer parameter. 3) release the references of all elements in the map during map release. Any bpf program which has access to the map must have been exited and released, so need_defer=false will be OK. These two parameters will be used by the following patches to fix the potential use-after-free problem for map-in-map. 
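For illustration, the three call sites above could map onto the new callback signature roughly as follows; the function names and the xchg()-based slot update are hypothetical stand-ins, not the actual fd_array/htab-of-maps callers:

static void outer_map_update(struct bpf_map *map, void **slot, void *new_ptr)
{
    void *old_ptr = xchg(slot, new_ptr);

    /* case 1: the old element may still be referenced by running programs */
    if (old_ptr)
        map->ops->map_fd_put_ptr(map, old_ptr, true /* need_defer */);
}

static void outer_map_update_err_path(struct bpf_map *map, void *new_ptr)
{
    /* case 2: the element was never visible to any bpf program */
    map->ops->map_fd_put_ptr(map, new_ptr, false);
}

static void outer_map_release(struct bpf_map *map, void **slots, u32 nr_slots)
{
    u32 i;

    /* case 3: all programs with access to the map have already exited */
    for (i = 0; i < nr_slots; i++)
        if (slots[i])
            map->ops->map_fd_put_ptr(map, slots[i], false);
}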
Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231204140425.1480317-3-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eb447b0a9423..d273348cfb2f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -106,7 +106,11 @@ struct bpf_map_ops { /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, int fd); - void (*map_fd_put_ptr)(void *ptr); + /* If need_defer is true, the implementation should guarantee that + * the to-be-put element is still alive before the bpf program, which + * may manipulate it, exists. + */ + void (*map_fd_put_ptr)(struct bpf_map *map, void *ptr, bool need_defer); int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, -- cgit From 876673364161da50eed6b472d746ef88242b2368 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 4 Dec 2023 22:04:22 +0800 Subject: bpf: Defer the free of inner map when necessary When updating or deleting an inner map in map array or map htab, the map may still be accessed by non-sleepable program or sleepable program. However bpf_map_fd_put_ptr() decreases the ref-counter of the inner map directly through bpf_map_put(), if the ref-counter is the last one (which is true for most cases), the inner map will be freed by ops->map_free() in a kworker. But for now, most .map_free() callbacks don't use synchronize_rcu() or its variants to wait for the elapse of a RCU grace period, so after the invocation of ops->map_free completes, the bpf program which is accessing the inner map may incur use-after-free problem. Fix the free of inner map by invoking bpf_map_free_deferred() after both one RCU grace period and one tasks trace RCU grace period if the inner map has been removed from the outer map before. The deferment is accomplished by using call_rcu() or call_rcu_tasks_trace() when releasing the last ref-counter of bpf map. The newly-added rcu_head field in bpf_map shares the same storage space with work field to reduce the size of bpf_map. 
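A rough sketch of the release path described above (simplified from what the patch adds to kernel/bpf/syscall.c; details may differ), showing how the rcu/work union is reused across the two grace periods:

static void bpf_map_free_rcu_gp(struct rcu_head *rcu)
{
    /* rcu shares storage with work, so the same space now drives the kworker */
    struct bpf_map *map = container_of(rcu, struct bpf_map, rcu);

    INIT_WORK(&map->work, bpf_map_free_deferred);
    schedule_work(&map->work);
}

static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu)
{
    /* tasks trace RCU grace period elapsed; now wait for a regular one */
    call_rcu(rcu, bpf_map_free_rcu_gp);
}

static void bpf_map_put_sketch(struct bpf_map *map)
{
    if (!atomic64_dec_and_test(&map->refcnt))
        return;

    if (READ_ONCE(map->free_after_mult_rcu_gp)) {
        /* inner map was removed from an outer map; defer across both GPs */
        call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp);
    } else {
        INIT_WORK(&map->work, bpf_map_free_deferred);
        schedule_work(&map->work);
    }
}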
Fixes: bba1dc0b55ac ("bpf: Remove redundant synchronize_rcu.") Fixes: 638e4b825d52 ("bpf: Allows per-cpu maps and map-in-map in sleepable programs") Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231204140425.1480317-5-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d273348cfb2f..de3bd03cbeea 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -276,7 +276,11 @@ struct bpf_map { */ atomic64_t refcnt ____cacheline_aligned; atomic64_t usercnt; - struct work_struct work; + /* rcu is used before freeing and work is only used during freeing */ + union { + struct work_struct work; + struct rcu_head rcu; + }; struct mutex freeze_mutex; atomic64_t writecnt; /* 'Ownership' of program-containing map is claimed by the first program @@ -292,6 +296,7 @@ struct bpf_map { } owner; bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ + bool free_after_mult_rcu_gp; s64 __percpu *elem_count; }; -- cgit From af66bfd3c8538ed21cf72af18426fc4a408665cf Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 4 Dec 2023 22:04:23 +0800 Subject: bpf: Optimize the free of inner map When removing the inner map from the outer map, the inner map will be freed after one RCU grace period and one RCU tasks trace grace period, so it is certain that the bpf program, which may access the inner map, has exited before the inner map is freed. However there is no need to wait for one RCU tasks trace grace period if the outer map is only accessed by non-sleepable program. So adding sleepable_refcnt in bpf_map and increasing sleepable_refcnt when adding the outer map into env->used_maps for sleepable program. Although the max number of bpf program is INT_MAX - 1, the number of bpf programs which are being loaded may be greater than INT_MAX, so using atomic64_t instead of atomic_t for sleepable_refcnt. When removing the inner map from the outer map, using sleepable_refcnt to decide whether or not a RCU tasks trace grace period is needed before freeing the inner map. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231204140425.1480317-6-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index de3bd03cbeea..10e5e4d8a00f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -297,6 +297,8 @@ struct bpf_map { bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ bool free_after_mult_rcu_gp; + bool free_after_rcu_gp; + atomic64_t sleepable_refcnt; s64 __percpu *elem_count; }; -- cgit From 41f6f64e6999a837048b1bd13a2f8742964eca6b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 5 Dec 2023 10:42:39 -0800 Subject: bpf: support non-r10 register spill/fill to/from stack in precision tracking Use instruction (jump) history to record instructions that performed register spill/fill to/from stack, regardless if this was done through read-only r10 register, or any other register after copying r10 into it *and* potentially adjusting offset. To make this work reliably, we push extra per-instruction flags into instruction history, encoding stack slot index (spi) and stack frame number in extra 10 bit flags we take away from prev_idx in instruction history. 
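As a rough illustration of that encoding (helper names are made up; the actual packing sits in the verifier next to the INSN_F_* constants added below):

static u32 insn_stack_access_flags(int frameno, int spi)
{
    return INSN_F_STACK_ACCESS |
           ((spi & INSN_F_SPI_MASK) << INSN_F_SPI_SHIFT) |
           (frameno & INSN_F_FRAMENO_MASK);
}

static int insn_flags_to_frameno(u32 flags)
{
    return flags & INSN_F_FRAMENO_MASK;
}

static int insn_flags_to_spi(u32 flags)
{
    return (flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK;
}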
We don't touch idx field for maximum performance, as it's checked most frequently during backtracking. This change removes basically the last remaining practical limitation of precision backtracking logic in BPF verifier. It fixes known deficiencies, but also opens up new opportunities to reduce number of verified states, explored in the subsequent patches. There are only three differences in selftests' BPF object files according to veristat, all in the positive direction (less states). File Program Insns (A) Insns (B) Insns (DIFF) States (A) States (B) States (DIFF) -------------------------------------- ------------- --------- --------- ------------- ---------- ---------- ------------- test_cls_redirect_dynptr.bpf.linked3.o cls_redirect 2987 2864 -123 (-4.12%) 240 231 -9 (-3.75%) xdp_synproxy_kern.bpf.linked3.o syncookie_tc 82848 82661 -187 (-0.23%) 5107 5073 -34 (-0.67%) xdp_synproxy_kern.bpf.linked3.o syncookie_xdp 85116 84964 -152 (-0.18%) 5162 5130 -32 (-0.62%) Note, I avoided renaming jmp_history to more generic insn_hist to minimize number of lines changed and potential merge conflicts between bpf and bpf-next trees. Notice also cur_hist_entry pointer reset to NULL at the beginning of instruction verification loop. This pointer avoids the problem of relying on last jump history entry's insn_idx to determine whether we already have entry for current instruction or not. It can happen that we added jump history entry because current instruction is_jmp_point(), but also we need to add instruction flags for stack access. In this case, we don't want to entries, so we need to reuse last added entry, if it is present. Relying on insn_idx comparison has the same ambiguity problem as the one that was fixed recently in [0], so we avoid that. [0] https://patchwork.kernel.org/project/netdevbpf/patch/20231110002638.4168352-3-andrii@kernel.org/ Acked-by: Eduard Zingerman Reported-by: Tao Lyu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231205184248.1502704-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3378cc753061..bada59812e00 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -325,12 +325,34 @@ struct bpf_func_state { int allocated_stack; }; -struct bpf_idx_pair { - u32 prev_idx; +#define MAX_CALL_FRAMES 8 + +/* instruction history flags, used in bpf_jmp_history_entry.flags field */ +enum { + /* instruction references stack slot through PTR_TO_STACK register; + * we also store stack's frame number in lower 3 bits (MAX_CALL_FRAMES is 8) + * and accessed stack slot's index in next 6 bits (MAX_BPF_STACK is 512, + * 8 bytes per slot, so slot index (spi) is [0, 63]) + */ + INSN_F_FRAMENO_MASK = 0x7, /* 3 bits */ + + INSN_F_SPI_MASK = 0x3f, /* 6 bits */ + INSN_F_SPI_SHIFT = 3, /* shifted 3 bits to the left */ + + INSN_F_STACK_ACCESS = BIT(9), /* we need 10 bits total */ +}; + +static_assert(INSN_F_FRAMENO_MASK + 1 >= MAX_CALL_FRAMES); +static_assert(INSN_F_SPI_MASK + 1 >= MAX_BPF_STACK / 8); + +struct bpf_jmp_history_entry { u32 idx; + /* insn idx can't be bigger than 1 million */ + u32 prev_idx : 22; + /* special flags, e.g., whether insn is doing register stack spill/load */ + u32 flags : 10; }; -#define MAX_CALL_FRAMES 8 /* Maximum number of register states that can exist at once */ #define BPF_ID_MAP_SIZE ((MAX_BPF_REG + 
MAX_BPF_STACK / BPF_REG_SIZE) * MAX_CALL_FRAMES) struct bpf_verifier_state { @@ -413,7 +435,7 @@ struct bpf_verifier_state { * For most states jmp_history_cnt is [0-3]. * For loops can go up to ~40. */ - struct bpf_idx_pair *jmp_history; + struct bpf_jmp_history_entry *jmp_history; u32 jmp_history_cnt; u32 dfs_depth; u32 callback_unroll_depth; @@ -656,6 +678,7 @@ struct bpf_verifier_env { int cur_stack; } cfg; struct backtrack_state bt; + struct bpf_jmp_history_entry *cur_hist_ent; u32 pass_cnt; /* number of times do_check() was called */ u32 subprog_cnt; /* number of instructions analyzed by the verifier */ -- cgit From 40bba140c60fbb3ee8df6203c82fbd3de9f19d95 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:14 -0800 Subject: bpf: add BPF token delegation mount options to BPF FS Add few new mount options to BPF FS that allow to specify that a given BPF FS instance allows creation of BPF token (added in the next patch), and what sort of operations are allowed under BPF token. As such, we get 4 new mount options, each is a bit mask - `delegate_cmds` allow to specify which bpf() syscall commands are allowed with BPF token derived from this BPF FS instance; - if BPF_MAP_CREATE command is allowed, `delegate_maps` specifies a set of allowable BPF map types that could be created with BPF token; - if BPF_PROG_LOAD command is allowed, `delegate_progs` specifies a set of allowable BPF program types that could be loaded with BPF token; - if BPF_PROG_LOAD command is allowed, `delegate_attachs` specifies a set of allowable BPF program attach types that could be loaded with BPF token; delegate_progs and delegate_attachs are meant to be used together, as full BPF program type is, in general, determined through both program type and program attach type. Currently, these mount options accept the following forms of values: - a special value "any", that enables all possible values of a given bit set; - numeric value (decimal or hexadecimal, determined by kernel automatically) that specifies a bit mask value directly; - all the values for a given mount option are combined, if specified multiple times. E.g., `mount -t bpf nodev /path/to/mount -o delegate_maps=0x1 -o delegate_maps=0x2` will result in a combined 0x3 mask. Ideally, more convenient (for humans) symbolic form derived from corresponding UAPI enums would be accepted (e.g., `-o delegate_progs=kprobe|tracepoint`) and I intend to implement this, but it requires a bunch of UAPI header churn, so I postponed it until this feature lands upstream or at least there is a definite consensus that this feature is acceptable and is going to make it, just to minimize amount of wasted effort and not increase amount of non-essential code to be reviewed. Attentive reader will notice that BPF FS is now marked as FS_USERNS_MOUNT, which theoretically makes it mountable inside non-init user namespace as long as the process has sufficient *namespaced* capabilities within that user namespace. But in reality we still restrict BPF FS to be mountable only by processes with CAP_SYS_ADMIN *in init userns* (extra check in bpf_fill_super()). FS_USERNS_MOUNT is added to allow creating BPF FS context object (i.e., fsopen("bpf")) from inside unprivileged process inside non-init userns, to capture that userns as the owning userns. It will still be required to pass this context object back to privileged process to instantiate and mount it. 
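For illustration, the two-step flow could look roughly like this from userspace, assuming the new mount API wrappers (fsopen()/fsconfig()/fsmount()/move_mount(), glibc >= 2.36 or raw syscalls); the path and option values are made up and error handling is omitted:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/mount.h>  /* new mount API wrappers */

/* step 1: unprivileged process, inside its own user namespace */
    int fs_fd = fsopen("bpf", 0);  /* captures this userns as the owning userns */
    /* fs_fd is then handed to a privileged helper, e.g. via SCM_RIGHTS */

/* step 2: privileged helper (CAP_SYS_ADMIN in init userns) */
    fsconfig(fs_fd, FSCONFIG_SET_STRING, "delegate_cmds", "any", 0);
    fsconfig(fs_fd, FSCONFIG_SET_STRING, "delegate_maps", "0x3", 0);
    fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
    int mnt_fd = fsmount(fs_fd, 0, 0);
    move_mount(mnt_fd, "", AT_FDCWD, "/path/to/bpffs", MOVE_MOUNT_F_EMPTY_PATH);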
This manipulation is important, because capturing non-init userns as the owning userns of BPF FS instance (super block) allows to use that userns to constraint BPF token to that userns later on (see next patch). So creating BPF FS with delegation inside unprivileged userns will restrict derived BPF token objects to only "work" inside that intended userns, making it scoped to a intended "container". Also, setting these delegation options requires capable(CAP_SYS_ADMIN), so unprivileged process cannot set this up without involvement of a privileged process. There is a set of selftests at the end of the patch set that simulates this sequence of steps and validates that everything works as intended. But careful review is requested to make sure there are no missed gaps in the implementation and testing. This somewhat subtle set of aspects is the result of previous discussions ([0]) about various user namespace implications and interactions with BPF token functionality and is necessary to contain BPF token inside intended user namespace. [0] https://lore.kernel.org/bpf/20230704-hochverdient-lehne-eeb9eeef785e@brauner/ Acked-by: Christian Brauner Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 10e5e4d8a00f..d3c9acc593ea 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1581,6 +1581,16 @@ struct bpf_link_primer { u32 id; }; +struct bpf_mount_opts { + umode_t mode; + + /* BPF token-related delegation options */ + u64 delegate_cmds; + u64 delegate_maps; + u64 delegate_progs; + u64 delegate_attachs; +}; + struct bpf_struct_ops_value; struct btf_member; -- cgit From 4527358b76861dfd64ee34aba45d81648fbc8a61 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:15 -0800 Subject: bpf: introduce BPF token object Add new kind of BPF kernel object, BPF token. BPF token is meant to allow delegating privileged BPF functionality, like loading a BPF program or creating a BPF map, from privileged process to a *trusted* unprivileged process, all while having a good amount of control over which privileged operations could be performed using provided BPF token. This is achieved through mounting BPF FS instance with extra delegation mount options, which determine what operations are delegatable, and also constraining it to the owning user namespace (as mentioned in the previous patch). BPF token itself is just a derivative from BPF FS and can be created through a new bpf() syscall command, BPF_TOKEN_CREATE, which accepts BPF FS FD, which can be attained through open() API by opening BPF FS mount point. Currently, BPF token "inherits" delegated command, map types, prog type, and attach type bit sets from BPF FS as is. In the future, having an BPF token as a separate object with its own FD, we can allow to further restrict BPF token's allowable set of things either at the creation time or after the fact, allowing the process to guard itself further from unintentionally trying to load undesired kind of BPF programs. But for now we keep things simple and just copy bit sets as is. 
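For illustration only, deriving a token from such a mount could then look like this (raw bpf(2) syscall with recent UAPI headers; libbpf convenience wrappers are not part of this patch):

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int token_create(const char *bpffs_path)
{
    union bpf_attr attr;
    int bpffs_fd, token_fd;

    bpffs_fd = open(bpffs_path, O_RDONLY);
    if (bpffs_fd < 0)
        return -1;

    memset(&attr, 0, sizeof(attr));
    attr.token_create.bpffs_fd = bpffs_fd;

    token_fd = syscall(__NR_bpf, BPF_TOKEN_CREATE, &attr, sizeof(attr));
    close(bpffs_fd);
    return token_fd;  /* >= 0 on success, -1 with errno set on error */
}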
When BPF token is created from BPF FS mount, we take reference to the BPF super block's owning user namespace, and then use that namespace for checking all the {CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN} capabilities that are normally only checked against init userns (using capable()), but now we check them using ns_capable() instead (if BPF token is provided). See bpf_token_capable() for details. Such setup means that BPF token in itself is not sufficient to grant BPF functionality. User namespaced process has to *also* have necessary combination of capabilities inside that user namespace. So while previously CAP_BPF was useless when granted within user namespace, now it gains a meaning and allows container managers and sys admins to have a flexible control over which processes can and need to use BPF functionality within the user namespace (i.e., container in practice). And BPF FS delegation mount options and derived BPF tokens serve as a per-container "flag" to grant overall ability to use bpf() (plus further restrict on which parts of bpf() syscalls are treated as namespaced). Note also, BPF_TOKEN_CREATE command itself requires ns_capable(CAP_BPF) within the BPF FS owning user namespace, rounding up the ns_capable() story of BPF token. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 41 +++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/bpf.h | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d3c9acc593ea..aa9cf8e5fab1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -51,6 +51,10 @@ struct module; struct bpf_func_state; struct ftrace_ops; struct cgroup; +struct bpf_token; +struct user_namespace; +struct super_block; +struct inode; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -1591,6 +1595,13 @@ struct bpf_mount_opts { u64 delegate_attachs; }; +struct bpf_token { + struct work_struct work; + atomic64_t refcnt; + struct user_namespace *userns; + u64 allowed_cmds; +}; + struct bpf_struct_ops_value; struct btf_member; @@ -2048,6 +2059,7 @@ static inline void bpf_enable_instrumentation(void) migrate_enable(); } +extern const struct super_operations bpf_super_ops; extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; extern const struct file_operations bpf_iter_fops; @@ -2182,6 +2194,8 @@ static inline void bpf_map_dec_elem_count(struct bpf_map *map) extern int sysctl_unprivileged_bpf_disabled; +bool bpf_token_capable(const struct bpf_token *token, int cap); + static inline bool bpf_allow_ptr_leaks(void) { return perfmon_capable(); @@ -2216,8 +2230,17 @@ int bpf_link_new_fd(struct bpf_link *link); struct bpf_link *bpf_link_get_from_fd(u32 ufd); struct bpf_link *bpf_link_get_curr_or_next(u32 *id); +void bpf_token_inc(struct bpf_token *token); +void bpf_token_put(struct bpf_token *token); +int bpf_token_create(union bpf_attr *attr); +struct bpf_token *bpf_token_get_from_fd(u32 ufd); + +bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd); + int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname); int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags); +struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir, + umode_t mode); #define BPF_ITER_FUNC_PREFIX "bpf_iter_" #define DEFINE_BPF_ITER_FUNC(target, 
args...) \ @@ -2580,6 +2603,24 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags) return -EOPNOTSUPP; } +static inline bool bpf_token_capable(const struct bpf_token *token, int cap) +{ + return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN)); +} + +static inline void bpf_token_inc(struct bpf_token *token) +{ +} + +static inline void bpf_token_put(struct bpf_token *token) +{ +} + +static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd) +{ + return ERR_PTR(-EOPNOTSUPP); +} + static inline void __dev_flush(void) { } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e88746ba7d21..d4a567e5bc3c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -847,6 +847,36 @@ union bpf_iter_link_info { * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * + * BPF_TOKEN_CREATE + * Description + * Create BPF token with embedded information about what + * BPF-related functionality it allows: + * - a set of allowed bpf() syscall commands; + * - a set of allowed BPF map types to be created with + * BPF_MAP_CREATE command, if BPF_MAP_CREATE itself is allowed; + * - a set of allowed BPF program types and BPF program attach + * types to be loaded with BPF_PROG_LOAD command, if + * BPF_PROG_LOAD itself is allowed. + * + * BPF token is created (derived) from an instance of BPF FS, + * assuming it has necessary delegation mount options specified. + * This BPF token can be passed as an extra parameter to various + * bpf() syscall commands to grant BPF subsystem functionality to + * unprivileged processes. + * + * When created, BPF token is "associated" with the owning + * user namespace of BPF FS instance (super block) that it was + * derived from, and subsequent BPF operations performed with + * BPF token would be performing capabilities checks (i.e., + * CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN) within + * that user namespace. Without BPF token, such capabilities + * have to be granted in init user namespace, making bpf() + * syscall incompatible with user namespace, for the most part. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * * NOTES * eBPF objects (maps and programs) can be shared between processes. * @@ -901,6 +931,8 @@ enum bpf_cmd { BPF_ITER_CREATE, BPF_LINK_DETACH, BPF_PROG_BIND_MAP, + BPF_TOKEN_CREATE, + __MAX_BPF_CMD, }; enum bpf_map_type { @@ -1712,6 +1744,11 @@ union bpf_attr { __u32 flags; /* extra flags */ } prog_bind_map; + struct { /* struct used by BPF_TOKEN_CREATE command */ + __u32 flags; + __u32 bpffs_fd; + } token_create; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF -- cgit From 688b7270b3cb75e8ac78123d719967db40336e5b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:16 -0800 Subject: bpf: add BPF token support to BPF_MAP_CREATE command Allow providing token_fd for BPF_MAP_CREATE command to allow controlled BPF map creation from unprivileged process through delegated BPF token. Wire through a set of allowed BPF map types to BPF token, derived from BPF FS at BPF token creation time. This, in combination with allowed_cmds allows to create a narrowly-focused BPF token (controlled by privileged agent) with a restrictive set of BPF maps that application can attempt to create. 
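A sketch of the resulting userspace usage, assuming token_fd was obtained via BPF_TOKEN_CREATE as in the earlier sketch and that the token's allowed_cmds/allowed_maps cover BPF_MAP_CREATE and BPF_MAP_TYPE_HASH (same headers as above, raw syscall for clarity):

static int create_hash_map_with_token(int token_fd)
{
    union bpf_attr attr;

    memset(&attr, 0, sizeof(attr));
    attr.map_type = BPF_MAP_TYPE_HASH;
    attr.key_size = 4;
    attr.value_size = 8;
    attr.max_entries = 16;
    attr.map_token_fd = token_fd;  /* new field wired up by this patch */

    return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}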
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index aa9cf8e5fab1..e08e8436df38 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1600,6 +1600,7 @@ struct bpf_token { atomic64_t refcnt; struct user_namespace *userns; u64 allowed_cmds; + u64 allowed_maps; }; struct bpf_struct_ops_value; @@ -2236,6 +2237,7 @@ int bpf_token_create(union bpf_attr *attr); struct bpf_token *bpf_token_get_from_fd(u32 ufd); bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd); +bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type); int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname); int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d4a567e5bc3c..0bba3392b17a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -983,6 +983,7 @@ enum bpf_map_type { BPF_MAP_TYPE_BLOOM_FILTER, BPF_MAP_TYPE_USER_RINGBUF, BPF_MAP_TYPE_CGRP_STORAGE, + __MAX_BPF_MAP_TYPE }; /* Note that tracing related programs such as @@ -1433,6 +1434,7 @@ union bpf_attr { * to using 5 hash functions). */ __u64 map_extra; + __u32 map_token_fd; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ -- cgit From ee54b1a910e4d49c9a104f31ae3f5b979131adf8 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:17 -0800 Subject: bpf: add BPF token support to BPF_BTF_LOAD command Accept BPF token FD in BPF_BTF_LOAD command to allow BTF data loading through delegated BPF token. BTF loading is a pretty straightforward operation, so as long as BPF token is created with allow_cmds granting BPF_BTF_LOAD command, kernel proceeds to parsing BTF data and creating BTF object. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0bba3392b17a..9f9989e0d062 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1616,6 +1616,7 @@ union bpf_attr { * truncated), or smaller (if log buffer wasn't filled completely). */ __u32 btf_log_true_size; + __u32 btf_token_fd; }; struct { -- cgit From e1cef620f598853a90f17701fcb1057a6768f7b8 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:18 -0800 Subject: bpf: add BPF token support to BPF_PROG_LOAD command Add basic support of BPF token to BPF_PROG_LOAD. Wire through a set of allowed BPF program types and attach types, derived from BPF FS at BPF token creation time. Then make sure we perform bpf_token_capable() checks everywhere where it's relevant. 
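Kernel-side, the per-type check boils down to bit tests against the new allowed_progs/allowed_attachs masks; a rough sketch (the in-tree implementation may differ in detail):

bool bpf_token_allow_prog_type(const struct bpf_token *token,
                               enum bpf_prog_type prog_type,
                               enum bpf_attach_type attach_type)
{
    if (!token || prog_type >= __MAX_BPF_PROG_TYPE ||
        attach_type >= __MAX_BPF_ATTACH_TYPE)
        return false;

    return (token->allowed_progs & (1ULL << prog_type)) &&
           (token->allowed_attachs & (1ULL << attach_type));
}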
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-7-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++++ include/uapi/linux/bpf.h | 2 ++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e08e8436df38..20af87b59d70 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1461,6 +1461,7 @@ struct bpf_prog_aux { #ifdef CONFIG_SECURITY void *security; #endif + struct bpf_token *token; struct bpf_prog_offload *offload; struct btf *btf; struct bpf_func_info *func_info; @@ -1601,6 +1602,8 @@ struct bpf_token { struct user_namespace *userns; u64 allowed_cmds; u64 allowed_maps; + u64 allowed_progs; + u64 allowed_attachs; }; struct bpf_struct_ops_value; @@ -2238,6 +2241,9 @@ struct bpf_token *bpf_token_get_from_fd(u32 ufd); bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd); bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type); +bool bpf_token_allow_prog_type(const struct bpf_token *token, + enum bpf_prog_type prog_type, + enum bpf_attach_type attach_type); int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname); int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9f9989e0d062..4df2d025c784 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1028,6 +1028,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_LOOKUP, BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ BPF_PROG_TYPE_NETFILTER, + __MAX_BPF_PROG_TYPE }; enum bpf_attach_type { @@ -1504,6 +1505,7 @@ union bpf_attr { * truncated), or smaller (if log buffer wasn't filled completely). */ __u32 log_true_size; + __u32 prog_token_fd; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ -- cgit From 4cbb270e115bc197ff2046aeb54cc951666b16ec Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:19 -0800 Subject: bpf: take into account BPF token when fetching helper protos Instead of performing unconditional system-wide bpf_capable() and perfmon_capable() calls inside bpf_base_func_proto() function (and other similar ones) to determine eligibility of a given BPF helper for a given program, use previously recorded BPF token during BPF_PROG_LOAD command handling to inform the decision. 
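The pattern this enables looks roughly like the following (heavily abbreviated sketch; the real bpf_base_func_proto() in kernel/bpf/helpers.c lists many more helpers per tier):

const struct bpf_func_proto *
bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
    switch (func_id) {
    case BPF_FUNC_map_lookup_elem:
        return &bpf_map_lookup_elem_proto;
    /* ... helpers available to everyone ... */
    default:
        break;
    }

    if (!bpf_token_capable(prog->aux->token, CAP_BPF))
        return NULL;

    switch (func_id) {
    case BPF_FUNC_spin_lock:
        return &bpf_spin_lock_proto;
    /* ... CAP_BPF-gated helpers ... */
    default:
        break;
    }

    if (!bpf_token_capable(prog->aux->token, CAP_PERFMON))
        return NULL;

    switch (func_id) {
    case BPF_FUNC_trace_printk:
        return bpf_get_trace_printk_proto();
    /* ... CAP_PERFMON-gated helpers ... */
    default:
        return NULL;
    }
}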
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-8-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 20af87b59d70..2a3ab4f3dd8c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2492,7 +2492,8 @@ const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type struct bpf_prog *bpf_prog_by_id(u32 id); struct bpf_link *bpf_link_by_id(u32 id); -const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); +const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog); void bpf_task_storage_free(struct task_struct *task); void bpf_cgrp_storage_free(struct cgroup *cgroup); bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); @@ -2752,7 +2753,7 @@ static inline int btf_struct_access(struct bpf_verifier_log *log, } static inline const struct bpf_func_proto * -bpf_base_func_proto(enum bpf_func_id func_id) +bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { return NULL; } -- cgit From 8062fb12de99b2da33754c6a3be1bfc30d9a35f4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:20 -0800 Subject: bpf: consistently use BPF token throughout BPF verifier logic Remove remaining direct queries to perfmon_capable() and bpf_capable() in BPF verifier logic and instead use BPF token (if available) to make decisions about privileges. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-9-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 16 ++++++++-------- include/linux/filter.h | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2a3ab4f3dd8c..435abad3cc61 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2200,24 +2200,24 @@ extern int sysctl_unprivileged_bpf_disabled; bool bpf_token_capable(const struct bpf_token *token, int cap); -static inline bool bpf_allow_ptr_leaks(void) +static inline bool bpf_allow_ptr_leaks(const struct bpf_token *token) { - return perfmon_capable(); + return bpf_token_capable(token, CAP_PERFMON); } -static inline bool bpf_allow_uninit_stack(void) +static inline bool bpf_allow_uninit_stack(const struct bpf_token *token) { - return perfmon_capable(); + return bpf_token_capable(token, CAP_PERFMON); } -static inline bool bpf_bypass_spec_v1(void) +static inline bool bpf_bypass_spec_v1(const struct bpf_token *token) { - return cpu_mitigations_off() || perfmon_capable(); + return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON); } -static inline bool bpf_bypass_spec_v4(void) +static inline bool bpf_bypass_spec_v4(const struct bpf_token *token) { - return cpu_mitigations_off() || perfmon_capable(); + return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON); } int bpf_map_new_fd(struct bpf_map *map, int flags); diff --git a/include/linux/filter.h b/include/linux/filter.h index a4953fafc8cb..14354605ad26 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1139,7 +1139,7 @@ static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) return false; if (!bpf_jit_harden) return false; - if (bpf_jit_harden == 1 && bpf_capable()) + if (bpf_jit_harden == 1 && bpf_token_capable(prog->aux->token, CAP_BPF)) return false; return true; -- cgit From 
c3dd6e94df7193f33f45d33303f5e85afb2a72dc Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:21 -0800 Subject: bpf,lsm: refactor bpf_prog_alloc/bpf_prog_free LSM hooks Based on upstream discussion ([0]), rework existing bpf_prog_alloc_security LSM hook. Rename it to bpf_prog_load and instead of passing bpf_prog_aux, pass proper bpf_prog pointer for a full BPF program struct. Also, we pass bpf_attr union with all the user-provided arguments for BPF_PROG_LOAD command. This will give LSMs as much information as we can basically provide. The hook is also BPF token-aware now, and optional bpf_token struct is passed as a third argument. bpf_prog_load LSM hook is called after a bunch of sanity checks were performed, bpf_prog and bpf_prog_aux were allocated and filled out, but right before performing full-fledged BPF verification step. bpf_prog_free LSM hook is now accepting struct bpf_prog argument, for consistency. SELinux code is adjusted to all new names, types, and signatures. Note, given that bpf_prog_load (previously bpf_prog_alloc) hook can be used by some LSMs to allocate extra security blob, but also by other LSMs to reject BPF program loading, we need to make sure that bpf_prog_free LSM hook is called after bpf_prog_load/bpf_prog_alloc one *even* if the hook itself returned error. If we don't do that, we run the risk of leaking memory. This seems to be possible today when combining SELinux and BPF LSM, as one example, depending on their relative ordering. Also, for BPF LSM setup, add bpf_prog_load and bpf_prog_free to sleepable LSM hooks list, as they are both executed in sleepable context. Also drop bpf_prog_load hook from untrusted, as there is no issue with refcount or anything else anymore, that originally forced us to add it to untrusted list in c0c852dd1876 ("bpf: Do not mark certain LSM hook arguments as trusted"). We now trigger this hook much later and it should not be an issue anymore. 
[0] https://lore.kernel.org/bpf/9fe88aef7deabbe87d3fc38c4aea3c69.paul@paul-moore.com/ Acked-by: Paul Moore Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-10-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/lsm_hook_defs.h | 5 +++-- include/linux/security.h | 12 +++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index ff217a5ce552..41ec4a7c070e 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -400,8 +400,9 @@ LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode) LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog) LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map) LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map) -LSM_HOOK(int, 0, bpf_prog_alloc_security, struct bpf_prog_aux *aux) -LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free_security, struct bpf_prog_aux *aux) +LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr, + struct bpf_token *token) +LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog) #endif /* CONFIG_BPF_SYSCALL */ LSM_HOOK(int, 0, locked_down, enum lockdown_reason what) diff --git a/include/linux/security.h b/include/linux/security.h index 1d1df326c881..65467eef6678 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2020,15 +2020,16 @@ static inline void securityfs_remove(struct dentry *dentry) union bpf_attr; struct bpf_map; struct bpf_prog; -struct bpf_prog_aux; +struct bpf_token; #ifdef CONFIG_SECURITY extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size); extern int security_bpf_map(struct bpf_map *map, fmode_t fmode); extern int security_bpf_prog(struct bpf_prog *prog); extern int security_bpf_map_alloc(struct bpf_map *map); extern void security_bpf_map_free(struct bpf_map *map); -extern int security_bpf_prog_alloc(struct bpf_prog_aux *aux); -extern void security_bpf_prog_free(struct bpf_prog_aux *aux); +extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, + struct bpf_token *token); +extern void security_bpf_prog_free(struct bpf_prog *prog); #else static inline int security_bpf(int cmd, union bpf_attr *attr, unsigned int size) @@ -2054,12 +2055,13 @@ static inline int security_bpf_map_alloc(struct bpf_map *map) static inline void security_bpf_map_free(struct bpf_map *map) { } -static inline int security_bpf_prog_alloc(struct bpf_prog_aux *aux) +static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, + struct bpf_token *token) { return 0; } -static inline void security_bpf_prog_free(struct bpf_prog_aux *aux) +static inline void security_bpf_prog_free(struct bpf_prog *prog) { } #endif /* CONFIG_SECURITY */ #endif /* CONFIG_BPF_SYSCALL */ -- cgit From 66d636d70a79c1d37e3eea67ab50969e6aaef983 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:22 -0800 Subject: bpf,lsm: refactor bpf_map_alloc/bpf_map_free LSM hooks Similarly to bpf_prog_alloc LSM hook, rename and extend bpf_map_alloc hook into bpf_map_create, taking not just struct bpf_map, but also bpf_attr and bpf_token, to give a fuller context to LSMs. Unlike bpf_prog_alloc, there is no need to move the hook around, as it currently is firing right before allocating BPF map ID and FD, which seems to be a sweet spot. 
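For a feel of the reworked signature, a hypothetical (out-of-tree) LSM now sees the full map, the creation attributes and any BPF token in one place; the policy below is purely illustrative:

static int example_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
                                  struct bpf_token *token)
{
    /* e.g. only allow map-in-map creation when it goes through a token */
    if ((attr->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
         attr->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) && !token)
        return -EPERM;
    return 0;
}

static struct security_hook_list example_hooks[] __ro_after_init = {
    LSM_HOOK_INIT(bpf_map_create, example_bpf_map_create),
};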
But like bpf_prog_alloc/bpf_prog_free combo, make sure that bpf_map_free LSM hook is called even if bpf_map_create hook returned error, as if few LSMs are combined together it could be that one LSM successfully allocated security blob for its needs, while subsequent LSM rejected BPF map creation. The former LSM would still need to free up LSM blob, so we need to ensure security_bpf_map_free() is called regardless of the outcome. Acked-by: Paul Moore Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-11-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/lsm_hook_defs.h | 5 +++-- include/linux/security.h | 6 ++++-- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 41ec4a7c070e..adb25cc63ce3 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -398,8 +398,9 @@ LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule) LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size) LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode) LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog) -LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map) -LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map) +LSM_HOOK(int, 0, bpf_map_create, struct bpf_map *map, union bpf_attr *attr, + struct bpf_token *token) +LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map) LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token) LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog) diff --git a/include/linux/security.h b/include/linux/security.h index 65467eef6678..08fd777cbe94 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2025,7 +2025,8 @@ struct bpf_token; extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size); extern int security_bpf_map(struct bpf_map *map, fmode_t fmode); extern int security_bpf_prog(struct bpf_prog *prog); -extern int security_bpf_map_alloc(struct bpf_map *map); +extern int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, + struct bpf_token *token); extern void security_bpf_map_free(struct bpf_map *map); extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token); @@ -2047,7 +2048,8 @@ static inline int security_bpf_prog(struct bpf_prog *prog) return 0; } -static inline int security_bpf_map_alloc(struct bpf_map *map) +static inline int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, + struct bpf_token *token) { return 0; } -- cgit From d734ca7b33dbf60eb15dcf7c44f3da7073356777 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:23 -0800 Subject: bpf,lsm: add BPF token LSM hooks Wire up bpf_token_create and bpf_token_free LSM hooks, which allow to allocate LSM security blob (we add `void *security` field to struct bpf_token for that), but also control who can instantiate BPF token. This follows existing pattern for BPF map and BPF prog. Also add security_bpf_token_allow_cmd() and security_bpf_token_capable() LSM hooks that allow LSM implementation to control and negate (if necessary) BPF token's delegation of a specific bpf_cmd and capability, respectively. 
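A sketch of how the capability-side hook composes with the token check, mirroring the !CONFIG_BPF_SYSCALL fallback shown earlier but with capable() swapped for ns_capable() against the token's owning userns (the in-tree version may differ in detail):

bool bpf_token_capable(const struct bpf_token *token, int cap)
{
    struct user_namespace *userns = token ? token->userns : &init_user_ns;

    if (!ns_capable(userns, cap) &&
        !(cap != CAP_SYS_ADMIN && ns_capable(userns, CAP_SYS_ADMIN)))
        return false;

    /* LSMs get a chance to veto the delegated capability */
    return security_bpf_token_capable(token, cap) == 0;
}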
Acked-by: Paul Moore Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-12-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ include/linux/lsm_hook_defs.h | 5 +++++ include/linux/security.h | 25 +++++++++++++++++++++++++ 3 files changed, 33 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 435abad3cc61..7a483f6b6d5f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1604,6 +1604,9 @@ struct bpf_token { u64 allowed_maps; u64 allowed_progs; u64 allowed_attachs; +#ifdef CONFIG_SECURITY + void *security; +#endif }; struct bpf_struct_ops_value; diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index adb25cc63ce3..3fdd00b452ac 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -404,6 +404,11 @@ LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map) LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token) LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog) +LSM_HOOK(int, 0, bpf_token_create, struct bpf_token *token, union bpf_attr *attr, + struct path *path) +LSM_HOOK(void, LSM_RET_VOID, bpf_token_free, struct bpf_token *token) +LSM_HOOK(int, 0, bpf_token_cmd, const struct bpf_token *token, enum bpf_cmd cmd) +LSM_HOOK(int, 0, bpf_token_capable, const struct bpf_token *token, int cap) #endif /* CONFIG_BPF_SYSCALL */ LSM_HOOK(int, 0, locked_down, enum lockdown_reason what) diff --git a/include/linux/security.h b/include/linux/security.h index 08fd777cbe94..00809d2d5c38 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -32,6 +32,7 @@ #include #include #include +#include struct linux_binprm; struct cred; @@ -2031,6 +2032,11 @@ extern void security_bpf_map_free(struct bpf_map *map); extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token); extern void security_bpf_prog_free(struct bpf_prog *prog); +extern int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, + struct path *path); +extern void security_bpf_token_free(struct bpf_token *token); +extern int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd); +extern int security_bpf_token_capable(const struct bpf_token *token, int cap); #else static inline int security_bpf(int cmd, union bpf_attr *attr, unsigned int size) @@ -2065,6 +2071,25 @@ static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr * static inline void security_bpf_prog_free(struct bpf_prog *prog) { } + +static inline int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, + struct path *path) +{ + return 0; +} + +static inline void security_bpf_token_free(struct bpf_token *token) +{ } + +static inline int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd) +{ + return 0; +} + +static inline int security_bpf_token_capable(const struct bpf_token *token, int cap) +{ + return 0; +} #endif /* CONFIG_SECURITY */ #endif /* CONFIG_BPF_SYSCALL */ -- cgit From 7065eefb38f16c91e9ace36fb7c873e4c9857c27 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 6 Dec 2023 11:09:20 -0800 Subject: bpf: rename MAX_BPF_LINK_TYPE into __MAX_BPF_LINK_TYPE for consistency To stay consistent with the naming pattern used for similar cases in BPF UAPI (__MAX_BPF_ATTACH_TYPE, etc), rename MAX_BPF_LINK_TYPE into __MAX_BPF_LINK_TYPE. 
Also similar to MAX_BPF_ATTACH_TYPE and MAX_BPF_REG, add: #define MAX_BPF_LINK_TYPE __MAX_BPF_LINK_TYPE Not all __MAX_xxx enums have such #define, so I'm not sure if we should add it or not, but I figured I'll start with a completely backwards compatible way, and we can drop that, if necessary. Also adjust a selftest that used MAX_BPF_LINK_TYPE enum. Suggested-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20231206190920.1651226-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4df2d025c784..e0545201b55f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1108,9 +1108,11 @@ enum bpf_link_type { BPF_LINK_TYPE_TCX = 11, BPF_LINK_TYPE_UPROBE_MULTI = 12, BPF_LINK_TYPE_NETKIT = 13, - MAX_BPF_LINK_TYPE, + __MAX_BPF_LINK_TYPE, }; +#define MAX_BPF_LINK_TYPE __MAX_BPF_LINK_TYPE + enum bpf_perf_event_type { BPF_PERF_EVENT_UNSPEC = 0, BPF_PERF_EVENT_UPROBE = 1, -- cgit From f08a1c658257c73697a819c4ded3a84b6f0ead74 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2023 14:40:48 -0800 Subject: bpf: Let bpf_prog_pack_free handle any pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, bpf_prog_pack_free only can only free pointer to struct bpf_binary_header, which is not flexible. Add a size argument to bpf_prog_pack_free so that it can handle any pointer. Signed-off-by: Song Liu Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich # on s390x Reviewed-by: Björn Töpel Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20231206224054.492250-2-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 14354605ad26..12d907f17d36 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1067,7 +1067,7 @@ struct bpf_binary_header * bpf_jit_binary_pack_hdr(const struct bpf_prog *fp); void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns); -void bpf_prog_pack_free(struct bpf_binary_header *hdr); +void bpf_prog_pack_free(void *ptr, u32 size); static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) { -- cgit From 7a3d9a159b178e87306a6e989071ed9a114a1a31 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2023 14:40:49 -0800 Subject: bpf: Adjust argument names of arch_prepare_bpf_trampoline() We are using "im" for "struct bpf_tramp_image" and "tr" for "struct bpf_trampoline" in most of the code base. The only exception is the prototype and fallback version of arch_prepare_bpf_trampoline(). Update them to match the rest of the code base. We mix "orig_call" and "func_addr" for the argument in different versions of arch_prepare_bpf_trampoline(). s/orig_call/func_addr/g so they match. 
Signed-off-by: Song Liu Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich # on s390x Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20231206224054.492250-3-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7a483f6b6d5f..17eb6d905204 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1098,10 +1098,10 @@ struct bpf_tramp_run_ctx; * fexit = a set of program to run after original function */ struct bpf_tramp_image; -int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, - void *orig_call); + void *func_addr); u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, -- cgit From 82583daa2efc2e336962b231a46bad03a280b3e0 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2023 14:40:50 -0800 Subject: bpf: Add helpers for trampoline image management As BPF trampoline of different archs moves from bpf_jit_[alloc|free]_exec() to bpf_prog_pack_[alloc|free](), we need to use different _alloc, _free for different archs during the transition. Add the following helpers for this transition: void *arch_alloc_bpf_trampoline(unsigned int size); void arch_free_bpf_trampoline(void *image, unsigned int size); void arch_protect_bpf_trampoline(void *image, unsigned int size); void arch_unprotect_bpf_trampoline(void *image, unsigned int size); The fallback version of these helpers require size <= PAGE_SIZE, but they are only called with size == PAGE_SIZE. They will be called with size < PAGE_SIZE when arch_bpf_trampoline_size() helper is introduced later. Signed-off-by: Song Liu Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich # on s390x Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20231206224054.492250-4-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 17eb6d905204..b7fca151cf1b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1102,6 +1102,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr); +void *arch_alloc_bpf_trampoline(unsigned int size); +void arch_free_bpf_trampoline(void *image, unsigned int size); +void arch_protect_bpf_trampoline(void *image, unsigned int size); +void arch_unprotect_bpf_trampoline(void *image, unsigned int size); + u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, -- cgit From 96d1b7c081c0c96cbe8901045f4ff15a2e9974a2 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2023 14:40:52 -0800 Subject: bpf: Add arch_bpf_trampoline_size() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This helper will be used to calculate the size of the trampoline before allocating the memory. arch_prepare_bpf_trampoline() for arm64 and riscv64 can use arch_bpf_trampoline_size() to check the trampoline fits in the image. 
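Roughly, the intended call sequence in the generic trampoline code becomes the following fragment (a sketch only, assuming m, flags, tlinks, func_addr and im come from the surrounding trampoline update path; see kernel/bpf/trampoline.c for the real flow):

    size = arch_bpf_trampoline_size(m, flags, tlinks, func_addr);
    if (size < 0)
        return size;

    image = arch_alloc_bpf_trampoline(size);
    if (!image)
        return -ENOMEM;

    err = arch_prepare_bpf_trampoline(im, image, image + size,
                                      m, flags, tlinks, func_addr);
    if (err < 0) {
        arch_free_bpf_trampoline(image, size);
        return err;
    }
    arch_protect_bpf_trampoline(image, size);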
OTOH, arch_prepare_bpf_trampoline() for s390 has to call the JIT process twice, so it cannot use arch_bpf_trampoline_size(). Signed-off-by: Song Liu Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich # on s390x Acked-by: Jiri Olsa Acked-by: Björn Töpel Tested-by: Björn Töpel # on riscv Link: https://lore.kernel.org/r/20231206224054.492250-6-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b7fca151cf1b..2332ddeb396b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1106,6 +1106,8 @@ void *arch_alloc_bpf_trampoline(unsigned int size); void arch_free_bpf_trampoline(void *image, unsigned int size); void arch_protect_bpf_trampoline(void *image, unsigned int size); void arch_unprotect_bpf_trampoline(void *image, unsigned int size); +int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, + struct bpf_tramp_links *tlinks, void *func_addr); u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); -- cgit From 26ef208c209a0e6eed8942a5d191b39dccfa6e38 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2023 14:40:53 -0800 Subject: bpf: Use arch_bpf_trampoline_size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of blindly allocating PAGE_SIZE for each trampoline, check the size of the trampoline with arch_bpf_trampoline_size(). This size is saved in bpf_tramp_image->size, and used for modmem charge/uncharge. The fallback arch_alloc_bpf_trampoline() still allocates a whole page because we need to use set_memory_* to protect the memory. struct_ops trampoline still uses a whole page for multiple trampolines. With this size check at caller (regular trampoline and struct_ops trampoline), remove arch_bpf_trampoline_size() from arch_prepare_bpf_trampoline() in archs. Also, update bpf_image_ksym_add() to handle symbol of different sizes. Signed-off-by: Song Liu Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich # on s390x Acked-by: Jiri Olsa Acked-by: Björn Töpel Tested-by: Björn Töpel # on riscv Link: https://lore.kernel.org/r/20231206224054.492250-7-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2332ddeb396b..c1a06263a4f3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1141,6 +1141,7 @@ enum bpf_tramp_prog_type { struct bpf_tramp_image { void *image; + int size; struct bpf_ksym ksym; struct percpu_ref pcref; void *ip_after_call; @@ -1325,7 +1326,7 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to); /* Called only from JIT-enabled code, so there's no need for stubs. 
*/ -void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym); +void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); -- cgit From 92e1567ee3e3f6f160e320890ac77eec50bf8e7d Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Thu, 7 Dec 2023 22:25:17 -0500 Subject: bpf: Add some comments to stack representation Add comments to the data structure tracking the stack state, as the mapping between each stack slot and where its state is stored is not entirely obvious. Signed-off-by: Andrei Matei Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20231208032519.260451-2-andreimatei1@gmail.com --- include/linux/bpf_verifier.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index bada59812e00..314b679fb494 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -321,7 +321,17 @@ struct bpf_func_state { /* The following fields should be last. See copy_func_state() */ int acquired_refs; struct bpf_reference_state *refs; + /* The state of the stack. Each element of the array describes BPF_REG_SIZE + * (i.e. 8) bytes worth of stack memory. + * stack[0] represents bytes [*(r10-8)..*(r10-1)] + * stack[1] represents bytes [*(r10-16)..*(r10-9)] + * ... + * stack[allocated_stack/8 - 1] represents [*(r10-allocated_stack)..*(r10-allocated_stack+7)] + */ struct bpf_stack_state *stack; + /* Size of the current stack, in bytes. The stack state is tracked below, in + * `stack`. allocated_stack is always a multiple of BPF_REG_SIZE. + */ int allocated_stack; }; @@ -658,6 +668,10 @@ struct bpf_verifier_env { int exception_callback_subprog; bool explore_alu_limits; bool allow_ptr_leaks; + /* Allow access to uninitialized stack memory. Writes with fixed offset are + * always allowed, so this refers to reads (with fixed or variable offset), + * to writes with variable offset and to indirect (helper) accesses. + */ bool allow_uninit_stack; bool bpf_capable; bool bypass_spec_v1; -- cgit From 2ebe81c814355d000fe49d9c4213983844dcb32b Mon Sep 17 00:00:00 2001 From: Aleksander Lobakin Date: Wed, 6 Dec 2023 21:59:19 +0100 Subject: net, xdp: Allow metadata > 32 32 bytes may not be enough for some custom metadata. Relax the restriction: allow metadata larger than 32 bytes and make __skb_metadata_differs() work with bigger lengths. Now the size of metadata is only limited by the fact that it is stored as a u8 in skb_shared_info, so the maximum possible value is 255. Size still has to be aligned to 4, so the actual upper limit becomes 252. Most driver implementations will offer less; none can offer more. Other important conditions, such as having enough space for xdp_frame building, are already checked in bpf_xdp_adjust_meta().
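For a sense of what the relaxed limit enables on the BPF side, a hedged sketch of an XDP program reserving a metadata area larger than 32 bytes; the struct layout is invented for illustration, and the area must stay 4-byte aligned and within the u8 meta_len limit (at most 252 bytes):

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  struct my_meta {                     /* hypothetical per-packet metadata, 48 bytes */
          __u64 rx_timestamp;
          __u32 rx_hash;
          __u32 flow_id;
          __u8  pad[32];               /* pushes the total past the old 32-byte cap */
  };

  SEC("xdp")
  int xdp_big_meta(struct xdp_md *ctx)
  {
          struct my_meta *meta;
          void *data;

          /* Grow the metadata area in front of the packet. */
          if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)))
                  return XDP_PASS;

          data = (void *)(long)ctx->data;
          meta = (void *)(long)ctx->data_meta;
          if ((void *)(meta + 1) > data)  /* verifier-mandated bounds check */
                  return XDP_PASS;

          meta->flow_id = 42;             /* fill in whatever the consumer expects */
          return XDP_PASS;
  }

  char _license[] SEC("license") = "GPL";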
Signed-off-by: Aleksander Lobakin Signed-off-by: Larysa Zaremba Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/eb87653c-8ff8-447d-a7a1-25961f60518a@kernel.org Link: https://lore.kernel.org/bpf/20231206205919.404415-3-larysa.zaremba@intel.com --- include/linux/skbuff.h | 13 ++++++++----- include/net/xdp.h | 7 ++++++- 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b370eb8d70f7..df6ef42639d8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4247,10 +4247,13 @@ static inline bool __skb_metadata_differs(const struct sk_buff *skb_a, { const void *a = skb_metadata_end(skb_a); const void *b = skb_metadata_end(skb_b); - /* Using more efficient varaiant than plain call to memcmp(). */ -#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 u64 diffs = 0; + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || + BITS_PER_LONG != 64) + goto slow; + + /* Using more efficient variant than plain call to memcmp(). */ switch (meta_len) { #define __it(x, op) (x -= sizeof(u##op)) #define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op)) @@ -4270,11 +4273,11 @@ static inline bool __skb_metadata_differs(const struct sk_buff *skb_a, fallthrough; case 4: diffs |= __it_diff(a, b, 32); break; + default: +slow: + return memcmp(a - meta_len, b - meta_len, meta_len); } return diffs; -#else - return memcmp(a - meta_len, b - meta_len, meta_len); -#endif } static inline bool skb_metadata_differs(const struct sk_buff *skb_a, diff --git a/include/net/xdp.h b/include/net/xdp.h index 349c36fb5fd8..5d3673afc037 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -369,7 +369,12 @@ xdp_data_meta_unsupported(const struct xdp_buff *xdp) static inline bool xdp_metalen_invalid(unsigned long metalen) { - return (metalen & (sizeof(__u32) - 1)) || (metalen > 32); + unsigned long meta_max; + + meta_max = type_max(typeof_member(struct skb_shared_info, meta_len)); + BUILD_BUG_ON(!__builtin_constant_p(meta_max)); + + return !IS_ALIGNED(metalen, sizeof(u32)) || metalen > meta_max; } struct xdp_attachment_info { -- cgit From 1a1ad782dcbbacd9e8d4e2e7ff1bf14d1db80727 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 4 Dec 2023 15:39:21 -0800 Subject: bpf: tidy up exception callback management a bit Use the fact that we are passing subprog index around and have a corresponding struct bpf_subprog_info in bpf_verifier_env for each subprogram. We don't need to separately pass around a flag whether subprog is exception callback or not, each relevant verifier function can determine this using provided subprog index if we maintain bpf_subprog_info properly. Also move out exception callback-specific logic from btf_prepare_func_args(), keeping it generic. We can enforce all these restriction right before exception callback verification pass. We add out parameter, arg_cnt, for now, but this will be unnecessary with subsequent refactoring and will be removed. 
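To illustrate the pattern this change moves toward, a rough sketch; the function names are illustrative, not necessarily the ones used in verifier.c. A verifier routine handed a subprog index can consult the per-subprog info kept in bpf_verifier_env instead of having an "is this the exception callback?" flag threaded through its callers:

  #include <linux/bpf_verifier.h>

  static bool subprog_is_exception_cb(struct bpf_verifier_env *env, int subprog)
  {
          return env->subprog_info[subprog].is_exception_cb;
  }

  static int check_subprog_exit(struct bpf_verifier_env *env, int subprog)
  {
          if (subprog_is_exception_cb(env, subprog)) {
                  /* exception-callback-specific checks would go here */
                  return 0;
          }
          /* regular subprog checks elided */
          return 0;
  }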
Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231204233931.49758-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c1a06263a4f3..0bd4889e917a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2494,7 +2494,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *reg, bool is_ex_cb); + struct bpf_reg_state *reg, u32 *nargs); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, struct btf *btf, const struct btf_type *t); const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt, -- cgit From 406a6fa44bfbc8563f0612b08d43df2fa65e8bc5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 4 Dec 2023 15:39:22 -0800 Subject: bpf: use bitfields for simple per-subprog bool flags We have a bunch of bool flags for each subprog. Instead of wasting bytes for them, use bitfields. Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231204233931.49758-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 314b679fb494..c2819a6579a5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -611,12 +611,12 @@ struct bpf_subprog_info { u32 start; /* insn idx of function entry point */ u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u16 stack_depth; /* max. stack depth used by this function */ - bool has_tail_call; - bool tail_call_reachable; - bool has_ld_abs; - bool is_cb; - bool is_async_cb; - bool is_exception_cb; + bool has_tail_call: 1; + bool tail_call_reachable: 1; + bool has_ld_abs: 1; + bool is_cb: 1; + bool is_async_cb: 1; + bool is_exception_cb: 1; }; struct bpf_verifier_env; -- cgit From 750e785796bb72423b97cac21ecd0fa3b3b65610 Mon Sep 17 00:00:00 2001 From: Jie Jiang Date: Tue, 12 Dec 2023 09:39:23 +0000 Subject: bpf: Support uid and gid when mounting bpffs Parse uid and gid in bpf_parse_param() so that they can be passed in as the `data` parameter when mounting bpffs. This will be useful when we want to control which user/group has control over the mounted bpffs, otherwise a separate chown() call will be needed.
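For reference, a hedged userspace sketch of what this enables: passing ownership in the mount data string instead of a follow-up chown(). The uid/gid/mode values below are arbitrary:

  #include <sys/mount.h>

  static int mount_bpffs_owned(void)
  {
          /* "uid" and "gid" are the options parsed by bpf_parse_param();
           * "mode" was already supported. */
          return mount("bpffs", "/sys/fs/bpf", "bpf", 0,
                       "uid=1000,gid=1000,mode=0770");
  }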
Signed-off-by: Jie Jiang Signed-off-by: Andrii Nakryiko Acked-by: Mike Frysinger Acked-by: Christian Brauner Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231212093923.497838-1-jiejiang@chromium.org --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0bd4889e917a..c87c608a3689 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1595,6 +1595,8 @@ struct bpf_link_primer { }; struct bpf_mount_opts { + kuid_t uid; + kgid_t gid; umode_t mode; /* BPF token-related delegation options */ -- cgit From 0e6a7b09597011985d7aad3b747c43e9b2a43555 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Tue, 5 Dec 2023 22:08:35 +0100 Subject: ice: Support RX hash XDP hint RX hash XDP hint requests both hash value and type. Type is XDP-specific, so we need a separate way to map these values to the hardware ptypes, so create a lookup table. Instead of creating a new long list, reuse contents of ice_decode_rx_desc_ptype[] through preprocessor. Current hash type enum does not contain ICMP packet type, but ice devices support it, so also add a new type into core code. Then use previously refactored code and create a function that allows XDP code to read RX hash. Signed-off-by: Larysa Zaremba Link: https://lore.kernel.org/r/20231205210847.28460-7-larysa.zaremba@intel.com Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 5d3673afc037..b7d6fe61381f 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -432,6 +432,7 @@ enum xdp_rss_hash_type { XDP_RSS_L4_UDP = BIT(5), XDP_RSS_L4_SCTP = BIT(6), XDP_RSS_L4_IPSEC = BIT(7), /* L4 based hash include IPSEC SPI */ + XDP_RSS_L4_ICMP = BIT(8), /* Second part: RSS hash type combinations used for driver HW mapping */ XDP_RSS_TYPE_NONE = 0, @@ -447,11 +448,13 @@ enum xdp_rss_hash_type { XDP_RSS_TYPE_L4_IPV4_UDP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV4_SCTP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, XDP_RSS_TYPE_L4_IPV4_IPSEC = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, + XDP_RSS_TYPE_L4_IPV4_ICMP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_ICMP, XDP_RSS_TYPE_L4_IPV6_TCP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_TCP, XDP_RSS_TYPE_L4_IPV6_UDP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV6_SCTP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, XDP_RSS_TYPE_L4_IPV6_IPSEC = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, + XDP_RSS_TYPE_L4_IPV6_ICMP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_ICMP, XDP_RSS_TYPE_L4_IPV6_TCP_EX = XDP_RSS_TYPE_L4_IPV6_TCP | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_IPV6_UDP_EX = XDP_RSS_TYPE_L4_IPV6_UDP | XDP_RSS_L3_DYNHDR, -- cgit From b4e352ff1169ebce930c734630f9587b1677d163 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Tue, 5 Dec 2023 22:08:36 +0100 Subject: xsk: add functions to fill control buffer Commit 94ecc5ca4dbf ("xsk: Add cb area to struct xdp_buff_xsk") has added a buffer for custom data to xdp_buff_xsk. Particularly, this memory is used for data, consumed by XDP hints kfuncs. It does not always change on a per-packet basis and some parts can be set for example, at the same time as RX queue info. Add functions to fill all cbs in xsk_buff_pool with the same metadata. 
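A hedged sketch of the intended driver-side usage, typically done alongside xsk_pool_set_rxq_info() at queue setup time; the private struct, field, and function names are invented for illustration:

  #include <linux/stddef.h>
  #include <net/xdp_sock_drv.h>

  struct my_xsk_buff_priv {            /* hypothetical layout of the cb area */
          void *rx_ring;
  };

  static void my_pool_fill_cb(struct xsk_buff_pool *pool, void *rx_ring)
  {
          struct xsk_cb_desc desc = {
                  .src   = &rx_ring,
                  .off   = offsetof(struct my_xsk_buff_priv, rx_ring),
                  .bytes = sizeof(rx_ring),
          };

          /* Copies the same bytes into the cb area of every buffer in the pool. */
          xsk_pool_fill_cb(pool, &desc);
  }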
Signed-off-by: Maciej Fijalkowski Signed-off-by: Larysa Zaremba Acked-by: Magnus Karlsson Link: https://lore.kernel.org/r/20231205210847.28460-8-larysa.zaremba@intel.com Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock_drv.h | 17 +++++++++++++++++ include/net/xsk_buff_pool.h | 2 ++ 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 81e02de3f453..b62bb8525a5f 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -14,6 +14,12 @@ #ifdef CONFIG_XDP_SOCKETS +struct xsk_cb_desc { + void *src; + u8 off; + u8 bytes; +}; + void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries); bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max); @@ -47,6 +53,12 @@ static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, xp_set_rxq_info(pool, rxq); } +static inline void xsk_pool_fill_cb(struct xsk_buff_pool *pool, + struct xsk_cb_desc *desc) +{ + xp_fill_cb(pool, desc); +} + static inline unsigned int xsk_pool_get_napi_id(struct xsk_buff_pool *pool) { #ifdef CONFIG_NET_RX_BUSY_POLL @@ -274,6 +286,11 @@ static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, { } +static inline void xsk_pool_fill_cb(struct xsk_buff_pool *pool, + struct xsk_cb_desc *desc) +{ +} + static inline unsigned int xsk_pool_get_napi_id(struct xsk_buff_pool *pool) { return 0; diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 8d48d37ab7c0..99dd7376df6a 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -12,6 +12,7 @@ struct xsk_buff_pool; struct xdp_rxq_info; +struct xsk_cb_desc; struct xsk_queue; struct xdp_desc; struct xdp_umem; @@ -135,6 +136,7 @@ static inline void xp_init_xskb_dma(struct xdp_buff_xsk *xskb, struct xsk_buff_p /* AF_XDP ZC drivers, via xdp_sock_buff.h */ void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq); +void xp_fill_cb(struct xsk_buff_pool *pool, struct xsk_cb_desc *desc); int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, unsigned long attrs, struct page **pages, u32 nr_pages); void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs); -- cgit From e6795330f88b4f643c649a02662d47b779340535 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Tue, 5 Dec 2023 22:08:38 +0100 Subject: xdp: Add VLAN tag hint Implement functionality that enables drivers to expose VLAN tag to XDP code. 
VLAN tag is represented by 2 variables: - protocol ID, which is passed to bpf code in BE - VLAN TCI, in host byte order Acked-by: Stanislav Fomichev Signed-off-by: Larysa Zaremba Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/20231205210847.28460-10-larysa.zaremba@intel.com Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 6 ++++++ include/uapi/linux/netdev.h | 3 +++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index b7d6fe61381f..8cd04a74dba5 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -404,6 +404,10 @@ void xdp_attachment_setup(struct xdp_attachment_info *info, NETDEV_XDP_RX_METADATA_HASH, \ bpf_xdp_metadata_rx_hash, \ xmo_rx_hash) \ + XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_VLAN_TAG, \ + NETDEV_XDP_RX_METADATA_VLAN_TAG, \ + bpf_xdp_metadata_rx_vlan_tag, \ + xmo_rx_vlan_tag) \ enum xdp_rx_metadata { #define XDP_METADATA_KFUNC(name, _, __, ___) name, @@ -465,6 +469,8 @@ struct xdp_metadata_ops { int (*xmo_rx_timestamp)(const struct xdp_md *ctx, u64 *timestamp); int (*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash, enum xdp_rss_hash_type *rss_type); + int (*xmo_rx_vlan_tag)(const struct xdp_md *ctx, __be16 *vlan_proto, + u16 *vlan_tci); }; #ifdef CONFIG_NET diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 6244c0164976..966638b08ccf 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -44,10 +44,13 @@ enum netdev_xdp_act { * timestamp via bpf_xdp_metadata_rx_timestamp(). * @NETDEV_XDP_RX_METADATA_HASH: Device is capable of exposing receive packet * hash via bpf_xdp_metadata_rx_hash(). + * @NETDEV_XDP_RX_METADATA_VLAN_TAG: Device is capable of exposing receive + * packet VLAN tag via bpf_xdp_metadata_rx_vlan_tag(). */ enum netdev_xdp_rx_metadata { NETDEV_XDP_RX_METADATA_TIMESTAMP = 1, NETDEV_XDP_RX_METADATA_HASH = 2, + NETDEV_XDP_RX_METADATA_VLAN_TAG = 4, }; /** -- cgit From 537fec0733c4a72e2a2b69fee365459c5b75d92e Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Tue, 5 Dec 2023 22:08:42 +0100 Subject: net: make vlan_get_tag() return -ENODATA instead of -EINVAL __vlan_hwaccel_get_tag() is used in veth XDP hints implementation, its return value (-EINVAL if skb is not VLAN tagged) is passed to bpf code, but XDP hints specification requires drivers to return -ENODATA, if a hint cannot be provided for a particular packet. Solve this inconsistency by changing error return value of __vlan_hwaccel_get_tag() from -EINVAL to -ENODATA, do the same thing to __vlan_get_tag(), because this function is supposed to follow the same convention. This, in turn, makes -ENODATA the only non-zero value vlan_get_tag() can return. We can do this with no side effects, because none of the users of the 3 above-mentioned functions rely on the exact value. 
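From the consumer side, a hedged sketch of an XDP program reading the new hint; the kfunc declaration mirrors the xmo_rx_vlan_tag() prototype above, and per the convention reinforced here, a negative return (-ENODATA) simply means the frame carries no usable VLAN tag:

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  extern int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx,
                                          __be16 *vlan_proto,
                                          __u16 *vlan_tci) __ksym;

  SEC("xdp")
  int xdp_read_vlan_tag(struct xdp_md *ctx)
  {
          __be16 proto;
          __u16 tci;

          if (bpf_xdp_metadata_rx_vlan_tag(ctx, &proto, &tci) < 0)
                  return XDP_PASS;     /* untagged frame or hint unsupported */

          /* proto is big endian, tci is host byte order; use them here */
          return XDP_PASS;
  }

  char _license[] SEC("license") = "GPL";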
Suggested-by: Jesper Dangaard Brouer Acked-by: Stanislav Fomichev Signed-off-by: Larysa Zaremba Link: https://lore.kernel.org/r/20231205210847.28460-14-larysa.zaremba@intel.com Signed-off-by: Alexei Starovoitov --- include/linux/if_vlan.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 3028af87716e..c1645c86eed9 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -540,7 +540,7 @@ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) struct vlan_ethhdr *veth = skb_vlan_eth_hdr(skb); if (!eth_type_vlan(veth->h_vlan_proto)) - return -EINVAL; + return -ENODATA; *vlan_tci = ntohs(veth->h_vlan_TCI); return 0; @@ -561,7 +561,7 @@ static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb, return 0; } else { *vlan_tci = 0; - return -EINVAL; + return -ENODATA; } } -- cgit From 7978bad4b6b9265a1e808a5f679ee428d1dd6523 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Tue, 5 Dec 2023 22:08:43 +0100 Subject: mlx5: implement VLAN tag XDP hint Implement the newly added .xmo_rx_vlan_tag() hint function. Reviewed-by: Tariq Toukan Signed-off-by: Larysa Zaremba Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/20231205210847.28460-15-larysa.zaremba@intel.com Signed-off-by: Alexei Starovoitov --- include/linux/mlx5/device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 820bca965fb6..01275c6e8468 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -918,7 +918,7 @@ static inline u8 get_cqe_tls_offload(struct mlx5_cqe64 *cqe) return (cqe->tls_outer_l3_tunneled >> 3) & 0x3; } -static inline bool cqe_has_vlan(struct mlx5_cqe64 *cqe) +static inline bool cqe_has_vlan(const struct mlx5_cqe64 *cqe) { return cqe->l4_l3_hdr_type & 0x1; } -- cgit From 04d25ccea2b3199269b7e500da33023b51418fde Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 12 Dec 2023 20:37:35 -0800 Subject: net, xdp: Correct grammar Use the correct verb form in 2 places in the XDP rx-queue comment. Signed-off-by: Randy Dunlap Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/20231213043735.30208-1-rdunlap@infradead.org --- include/net/xdp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 8cd04a74dba5..e6770dd40c91 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -16,7 +16,7 @@ * * The XDP RX-queue info (xdp_rxq_info) is associated with the driver * level RX-ring queues. It is information that is specific to how - * the driver have configured a given RX-ring queue. + * the driver has configured a given RX-ring queue. * * Each xdp_buff frame received in the driver carries a (pointer) * reference to this xdp_rxq_info structure. This provides the XDP @@ -32,7 +32,7 @@ * The struct is not directly tied to the XDP prog. A new XDP prog * can be attached as long as it doesn't change the underlying * RX-ring. If the RX-ring does change significantly, the NIC driver - * naturally need to stop the RX-ring before purging and reallocating + * naturally needs to stop the RX-ring before purging and reallocating * memory. In that process the driver MUST call unregister (which * also applies for driver shutdown and unload). The register API is * also mandatory during RX-ring setup. 
-- cgit From 8f0ec8c681755f523cf842bfe350ea40609b83a9 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Thu, 14 Dec 2023 15:49:02 -0700 Subject: bpf: xfrm: Add bpf_xdp_get_xfrm_state() kfunc This commit adds an unstable kfunc helper to access internal xfrm_state associated with an SA. This is intended to be used for the upcoming IPsec pcpu work to assign special pcpu SAs to a particular CPU. In other words: for custom software RSS. That being said, the function that this kfunc wraps is fairly generic and used for a lot of xfrm tasks. I'm sure people will find uses elsewhere over time. This commit also adds a corresponding bpf_xdp_xfrm_state_release() kfunc to release the refcnt acquired by bpf_xdp_get_xfrm_state(). The verifier will require that all acquired xfrm_state's are released. Co-developed-by: Antony Antony Signed-off-by: Antony Antony Acked-by: Steffen Klassert Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/a29699c42f5fad456b875c98dd11c6afc3ffb707.1702593901.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- include/net/xfrm.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index c9bb0f892f55..1d107241b901 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2190,4 +2190,13 @@ static inline int register_xfrm_interface_bpf(void) #endif +#if IS_ENABLED(CONFIG_DEBUG_INFO_BTF) +int register_xfrm_state_bpf(void); +#else +static inline int register_xfrm_state_bpf(void) +{ + return 0; +} +#endif + #endif /* _NET_XFRM_H */ -- cgit From 4382159696c9af67ee047ed55f2dbf05480f52f6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Dec 2023 10:12:17 +0100 Subject: cfi: Flip headers Normal include order is that linux/foo.h should include asm/foo.h, CFI has it the wrong way around. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sami Tolvanen Link: https://lore.kernel.org/r/20231215092707.231038174@infradead.org Signed-off-by: Alexei Starovoitov --- include/asm-generic/Kbuild | 1 + include/asm-generic/cfi.h | 5 +++++ include/linux/cfi.h | 1 + 3 files changed, 7 insertions(+) create mode 100644 include/asm-generic/cfi.h (limited to 'include') diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild index def242528b1d..d436bee4d129 100644 --- a/include/asm-generic/Kbuild +++ b/include/asm-generic/Kbuild @@ -11,6 +11,7 @@ mandatory-y += bitops.h mandatory-y += bug.h mandatory-y += bugs.h mandatory-y += cacheflush.h +mandatory-y += cfi.h mandatory-y += checksum.h mandatory-y += compat.h mandatory-y += current.h diff --git a/include/asm-generic/cfi.h b/include/asm-generic/cfi.h new file mode 100644 index 000000000000..41fac3537bf9 --- /dev/null +++ b/include/asm-generic/cfi.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_GENERIC_CFI_H +#define __ASM_GENERIC_CFI_H + +#endif /* __ASM_GENERIC_CFI_H */ diff --git a/include/linux/cfi.h b/include/linux/cfi.h index 3552ec82b725..2309d74e77e6 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -9,6 +9,7 @@ #include #include +#include #ifdef CONFIG_CFI_CLANG enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, -- cgit From 4f9087f16651aca4a5f32da840a53f6660f0579a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Dec 2023 10:12:18 +0100 Subject: x86/cfi,bpf: Fix BPF JIT call The current BPF call convention is __nocfi, except when it calls !JIT things, then it calls regular C functions. It so happens that with FineIBT the __nocfi and C calling conventions are incompatible. 
Specifically __nocfi will call at func+0, while FineIBT will have endbr-poison there, which is not a valid indirect target, causing #CP. Notably this only triggers on IBT-enabled hardware, which is probably why this hasn't been reported (also, most people will have JIT on anyway). Implement proper CFI prologues for the BPF JIT codegen and drop __nocfi for x86. Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20231215092707.345270396@infradead.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 12 ++++++++++-- include/linux/cfi.h | 7 +++++++ 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c87c608a3689..9d84c376851a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -29,6 +29,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -1211,7 +1212,11 @@ struct bpf_dispatcher { #endif }; -static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func( +#ifndef __bpfcall +#define __bpfcall __nocfi +#endif + +static __always_inline __bpfcall unsigned int bpf_dispatcher_nop_func( const void *ctx, const struct bpf_insn *insnsi, bpf_func_t bpf_func) @@ -1303,7 +1308,7 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func #define DEFINE_BPF_DISPATCHER(name) \ __BPF_DISPATCHER_SC(name); \ - noinline __nocfi unsigned int bpf_dispatcher_##name##_func( \ + noinline __bpfcall unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ const struct bpf_insn *insnsi, \ bpf_func_t bpf_func) \ @@ -1453,6 +1458,9 @@ struct bpf_prog_aux { struct bpf_kfunc_desc_tab *kfunc_tab; struct bpf_kfunc_btf_tab *kfunc_btf_tab; u32 size_poke_tab; +#ifdef CONFIG_FINEIBT + struct bpf_ksym ksym_prefix; +#endif struct bpf_ksym ksym; const struct bpf_prog_ops *ops; struct bpf_map **used_maps; diff --git a/include/linux/cfi.h b/include/linux/cfi.h index 2309d74e77e6..1ed2d96c0cfc 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -11,6 +11,13 @@ #include #include +#ifndef cfi_get_offset +static inline int cfi_get_offset(void) +{ + return 0; +} +#endif + #ifdef CONFIG_CFI_CLANG enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, unsigned long *target, u32 type); -- cgit From 2cd3e3772e41377f32d6eea643e0590774e9187c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Dec 2023 10:12:20 +0100 Subject: x86/cfi,bpf: Fix bpf_struct_ops CFI BPF struct_ops uses __arch_prepare_bpf_trampoline() to write trampolines for indirect function calls. These trampolines must have matching CFI. In order to obtain the correct CFI hash for the various methods, add a matching structure that contains stub functions; the compiler will generate correct CFI, which we can pilfer for the trampolines. Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20231215092707.566977112@infradead.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9d84c376851a..db46b3359bf5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1060,6 +1060,17 @@ struct btf_func_model { */ #define BPF_TRAMP_F_TAIL_CALL_CTX BIT(7) +/* + * Indicate the trampoline should be suitable to receive indirect calls; + * without this indirectly calling the generated code can result in #UD/#CP, + * depending on the CFI options. + * + * Used by bpf_struct_ops.
+ * + * Incompatible with FENTRY usage, overloads @func_addr argument. + */ +#define BPF_TRAMP_F_INDIRECT BIT(8) + /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. */ @@ -1697,6 +1708,7 @@ struct bpf_struct_ops { struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS]; u32 type_id; u32 value_id; + void *cfi_stubs; }; #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) @@ -1710,6 +1722,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, struct bpf_tramp_link *link, const struct btf_func_model *model, + void *stub_func, void *image, void *image_end); static inline bool bpf_try_module_get(const void *data, struct module *owner) { -- cgit From e9d13b9d2f99ccf7afeab490d97eaa5ac9846598 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Dec 2023 10:12:21 +0100 Subject: cfi: Add CFI_NOSEAL() Add a CFI_NOSEAL() helper to mark functions that need to retain their CFI information, despite not otherwise leaking their address. Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20231215092707.669401084@infradead.org Signed-off-by: Alexei Starovoitov --- include/linux/cfi.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/cfi.h b/include/linux/cfi.h index 1ed2d96c0cfc..f0df518e11dd 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -46,4 +46,8 @@ static inline void module_cfi_finalize(const Elf_Ehdr *hdr, #endif /* CONFIG_ARCH_USES_CFI_TRAPS */ #endif /* CONFIG_MODULES */ +#ifndef CFI_NOSEAL +#define CFI_NOSEAL(x) +#endif + #endif /* _LINUX_CFI_H */ -- cgit From 852486b35f344887786d63250946dd921a05d7e8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 15 Dec 2023 10:12:23 +0100 Subject: x86/cfi,bpf: Fix bpf_exception_cb() signature As per the earlier patches, BPF sub-programs have bpf_callback_t signature and CFI expects callers to have matching signature. This is violated by bpf_prog_aux::bpf_exception_cb(). [peterz: Changelog] Reported-by: Peter Zijlstra Signed-off-by: Alexei Starovoitov Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/CAADnVQ+Z7UcXXBBhMubhcMM=R-dExk-uHtfOLtoLxQ1XxEpqEA@mail.gmail.com Link: https://lore.kernel.org/r/20231215092707.910319166@infradead.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index db46b3359bf5..5e694934cf37 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1484,7 +1484,7 @@ struct bpf_prog_aux { int cgroup_atype; /* enum cgroup_bpf_attach_type */ struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; char name[BPF_OBJ_NAME_LEN]; - unsigned int (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp); + u64 (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp, u64, u64); #ifdef CONFIG_SECURITY void *security; #endif -- cgit
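Taken together with the JIT changes above, the practical rule is that anything the kernel calls as a BPF subprogram must use the five-u64 bpf_callback_t shape so the kCFI/FineIBT type hash matches. A minimal, illustrative sketch of a callback conforming to the new prototype; the function and its installer are made up for illustration:

  #include <linux/bpf.h>

  /* Matches: u64 (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp, u64, u64) */
  static u64 example_exception_cb(u64 cookie, u64 sp, u64 bp, u64 unused1, u64 unused2)
  {
          return 0;
  }

  static void set_exception_cb(struct bpf_prog_aux *aux)
  {
          aux->bpf_exception_cb = example_exception_cb;  /* type-compatible, CFI-safe */
  }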