From 31379397dcc364a59ce764fabb131b645c43e340 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 5 May 2021 15:25:29 +0200 Subject: bpf: Forbid trampoline attach for functions with variable arguments We can't currently allow to attach functions with variable arguments. The problem is that we should save all the registers for arguments, which is probably doable, but if caller uses more than 6 arguments, we need stack data, which will be wrong, because of the extra stack frame we do in bpf trampoline, so we could crash. Also currently there's malformed trampoline code generated for such functions at the moment as described in: https://lore.kernel.org/bpf/20210429212834.82621-1-jolsa@kernel.org/ Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210505132529.401047-1-jolsa@kernel.org --- kernel/bpf/btf.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 0600ed325fa0..f982a9f0dbc4 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5206,6 +5206,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, m->ret_size = ret; for (i = 0; i < nargs; i++) { + if (i == nargs - 1 && args[i].type == 0) { + bpf_log(log, + "The function %s with variable args is unsupported.\n", + tname); + return -EINVAL; + } ret = __get_type_size(btf, args[i].type, &t); if (ret < 0) { bpf_log(log, @@ -5213,6 +5219,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]); return -EINVAL; } + if (ret == 0) { + bpf_log(log, + "The function %s has malformed void argument.\n", + tname); + return -EINVAL; + } m->arg_size[i] = ret; } m->nr_args = nargs; -- cgit From 049c4e13714ecbca567b4d5f6d563f05d431c80e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 10 May 2021 13:10:44 +0000 Subject: bpf: Fix alu32 const subreg bound tracking on bitwise operations Fix a bug in the verifier's scalar32_min_max_*() functions which leads to incorrect tracking of 32 bit bounds for the simulation of and/or/xor bitops. When both the src & dst subreg is a known constant, then the assumption is that scalar_min_max_*() will take care to update bounds correctly. However, this is not the case, for example, consider a register R2 which has a tnum of 0xffffffff00000000, meaning, lower 32 bits are known constant and in this case of value 0x00000001. R2 is then and'ed with a register R3 which is a 64 bit known constant, here, 0x100000002. What can be seen in line '10:' is that 32 bit bounds reach an invalid state where {u,s}32_min_value > {u,s}32_max_value. The reason is scalar32_min_max_*() delegates 32 bit bounds updates to scalar_min_max_*(), however, that really only takes place when both the 64 bit src & dst register is a known constant. Given scalar32_min_max_*() is intended to be designed as closely as possible to scalar_min_max_*(), update the 32 bit bounds in this situation through __mark_reg32_known() which will set all {u,s}32_{min,max}_value to the correct constant, which is 0x00000000 after the fix (given 0x00000001 & 0x00000002 in 32 bit space). This is possible given var32_off already holds the final value as dst_reg->var_off is updated before calling scalar32_min_max_*(). Before fix, invalid tracking of R2: [...] 
9: R0_w=inv1337 R1=ctx(id=0,off=0,imm=0) R2_w=inv(id=0,smin_value=-9223372036854775807 (0x8000000000000001),smax_value=9223372032559808513 (0x7fffffff00000001),umin_value=1,umax_value=0xffffffff00000001,var_off=(0x1; 0xffffffff00000000),s32_min_value=1,s32_max_value=1,u32_min_value=1,u32_max_value=1) R3_w=inv4294967298 R10=fp0 9: (5f) r2 &= r3 10: R0_w=inv1337 R1=ctx(id=0,off=0,imm=0) R2_w=inv(id=0,smin_value=0,smax_value=4294967296 (0x100000000),umin_value=0,umax_value=0x100000000,var_off=(0x0; 0x100000000),s32_min_value=1,s32_max_value=0,u32_min_value=1,u32_max_value=0) R3_w=inv4294967298 R10=fp0 [...] After fix, correct tracking of R2: [...] 9: R0_w=inv1337 R1=ctx(id=0,off=0,imm=0) R2_w=inv(id=0,smin_value=-9223372036854775807 (0x8000000000000001),smax_value=9223372032559808513 (0x7fffffff00000001),umin_value=1,umax_value=0xffffffff00000001,var_off=(0x1; 0xffffffff00000000),s32_min_value=1,s32_max_value=1,u32_min_value=1,u32_max_value=1) R3_w=inv4294967298 R10=fp0 9: (5f) r2 &= r3 10: R0_w=inv1337 R1=ctx(id=0,off=0,imm=0) R2_w=inv(id=0,smin_value=0,smax_value=4294967296 (0x100000000),umin_value=0,umax_value=0x100000000,var_off=(0x0; 0x100000000),s32_min_value=0,s32_max_value=0,u32_min_value=0,u32_max_value=0) R3_w=inv4294967298 R10=fp0 [...] Fixes: 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking") Fixes: 2921c90d4718 ("bpf: Fix a verifier failure with xor") Reported-by: Manfred Paul (@_manfp) Reported-by: Thadeu Lima de Souza Cascardo Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 757476c91c98..9352a1b7de2d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7084,11 +7084,10 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, s32 smin_val = src_reg->s32_min_value; u32 umax_val = src_reg->u32_max_value; - /* Assuming scalar64_min_max_and will be called so its safe - * to skip updating register for known 32-bit case. - */ - if (src_known && dst_known) + if (src_known && dst_known) { + __mark_reg32_known(dst_reg, var32_off.value); return; + } /* We get our minimum from the var_off, since that's inherently * bitwise. Our maximum is the minimum of the operands' maxima. @@ -7108,7 +7107,6 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, dst_reg->s32_min_value = dst_reg->u32_min_value; dst_reg->s32_max_value = dst_reg->u32_max_value; } - } static void scalar_min_max_and(struct bpf_reg_state *dst_reg, @@ -7155,11 +7153,10 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, s32 smin_val = src_reg->s32_min_value; u32 umin_val = src_reg->u32_min_value; - /* Assuming scalar64_min_max_or will be called so it is safe - * to skip updating register for known case. - */ - if (src_known && dst_known) + if (src_known && dst_known) { + __mark_reg32_known(dst_reg, var32_off.value); return; + } /* We get our maximum from the var_off, and our minimum is the * maximum of the operands' minima @@ -7224,11 +7221,10 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg, struct tnum var32_off = tnum_subreg(dst_reg->var_off); s32 smin_val = src_reg->s32_min_value; - /* Assuming scalar64_min_max_xor will be called so it is safe - * to skip updating register for known case. 
- */ - if (src_known && dst_known) + if (src_known && dst_known) { + __mark_reg32_known(dst_reg, var32_off.value); return; + } /* We get both minimum and maximum from the var32_off. */ dst_reg->u32_min_value = var32_off.value; -- cgit From 4b81ccebaeee885ab1aa1438133f2991e3a2b6ea Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Tue, 27 Apr 2021 10:12:12 -0300 Subject: bpf, ringbuf: Deny reserve of buffers larger than ringbuf A BPF program might try to reserve a buffer larger than the ringbuf size. If the consumer pointer is way ahead of the producer, that would be successfully reserved, allowing the BPF program to read or write out of the ringbuf allocated area. Reported-by: Ryota Shiga (Flatt Security) Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Alexei Starovoitov --- kernel/bpf/ringbuf.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index f25b719ac786..b86d80c9cd59 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -315,6 +315,9 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) return NULL; len = round_up(size + BPF_RINGBUF_HDR_SZ, 8); + if (len > rb->mask + 1) + return NULL; + cons_pos = smp_load_acquire(&rb->consumer_pos); if (in_nmi()) { -- cgit From 04ea3086c4d73da7009de1e84962a904139af219 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 4 May 2021 16:38:00 -0700 Subject: bpf: Prevent writable memory-mapping of read-only ringbuf pages Only the very first page of BPF ringbuf that contains consumer position counter is supposed to be mapped as writeable by user-space. Producer position is read-only and can be modified only by the kernel code. BPF ringbuf data pages are read-only as well and are not meant to be modified by user-code to maintain integrity of per-record headers. This patch allows to map only consumer position page as writeable and everything else is restricted to be read-only. remap_vmalloc_range() internally adds VM_DONTEXPAND, so all the established memory mappings can't be extended, which prevents any future violations through mremap()'ing. 
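For orientation, a rough user-space sketch of what this change permits and rejects, assuming a BPF_MAP_TYPE_RINGBUF map with max_entries = 4096 on a 4K-page system; the two valid mappings mirror what a libbpf-style ring buffer consumer sets up, and the function and variable names are illustrative:

  #include <sys/mman.h>
  #include <unistd.h>

  /* map_fd: file descriptor of a BPF_MAP_TYPE_RINGBUF map, created elsewhere */
  static int map_ringbuf_pages(int map_fd)
  {
          long page_sz = sysconf(_SC_PAGESIZE);
          void *cons, *prod, *bad;

          /* Writable mapping of the first page (consumer_pos) is still allowed. */
          cons = mmap(NULL, page_sz, PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, 0);

          /* producer_pos page plus the data pages (mapped twice) are read-only. */
          prod = mmap(NULL, page_sz + 2 * 4096, PROT_READ, MAP_SHARED, map_fd, page_sz);

          /* A writable mapping that starts at a non-zero offset or extends past
           * the first page is now rejected with EPERM. */
          bad = mmap(NULL, 2 * page_sz, PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, 0);

          return (cons != MAP_FAILED && prod != MAP_FAILED && bad == MAP_FAILED) ? 0 : -1;
  }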
Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Reported-by: Ryota Shiga (Flatt Security) Reported-by: Thadeu Lima de Souza Cascardo Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- kernel/bpf/ringbuf.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index b86d80c9cd59..84b3b35fc0d0 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -221,25 +221,20 @@ static int ringbuf_map_get_next_key(struct bpf_map *map, void *key, return -ENOTSUPP; } -static size_t bpf_ringbuf_mmap_page_cnt(const struct bpf_ringbuf *rb) -{ - size_t data_pages = (rb->mask + 1) >> PAGE_SHIFT; - - /* consumer page + producer page + 2 x data pages */ - return RINGBUF_POS_PAGES + 2 * data_pages; -} - static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) { struct bpf_ringbuf_map *rb_map; - size_t mmap_sz; rb_map = container_of(map, struct bpf_ringbuf_map, map); - mmap_sz = bpf_ringbuf_mmap_page_cnt(rb_map->rb) << PAGE_SHIFT; - - if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > mmap_sz) - return -EINVAL; + if (vma->vm_flags & VM_WRITE) { + /* allow writable mapping for the consumer_pos only */ + if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EPERM; + } else { + vma->vm_flags &= ~VM_MAYWRITE; + } + /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); } -- cgit From b24abcff918a5cbf44b0c982bd3477a93e8e4911 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 11 May 2021 22:35:16 +0200 Subject: bpf, kconfig: Add consolidated menu entry for bpf with core options Right now, all core BPF related options are scattered in different Kconfig locations mainly due to historic reasons. Moving forward, lets add a proper subsystem entry under ... General setup ---> BPF subsystem ---> ... in order to have all knobs in a single location and thus ease BPF related configuration. Networking related bits such as sockmap are out of scope for the general setup and therefore better suited to remain in net/Kconfig. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/f23f58765a4d59244ebd8037da7b6a6b2fb58446.1620765074.git.daniel@iogearbox.net --- kernel/bpf/Kconfig | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 kernel/bpf/Kconfig (limited to 'kernel/bpf') diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig new file mode 100644 index 000000000000..b4edaefc6255 --- /dev/null +++ b/kernel/bpf/Kconfig @@ -0,0 +1,78 @@ +# SPDX-License-Identifier: GPL-2.0-only + +# BPF interpreter that, for example, classic socket filters depend on. +config BPF + bool + +# Used by archs to tell that they support BPF JIT compiler plus which +# flavour. Only one of the two can be selected for a specific arch since +# eBPF JIT supersedes the cBPF JIT. + +# Classic BPF JIT (cBPF) +config HAVE_CBPF_JIT + bool + +# Extended BPF JIT (eBPF) +config HAVE_EBPF_JIT + bool + +# Used by archs to tell that they want the BPF JIT compiler enabled by +# default for kernels that were compiled with BPF JIT support. 
+config ARCH_WANT_DEFAULT_BPF_JIT + bool + +menu "BPF subsystem" + +config BPF_SYSCALL + bool "Enable bpf() system call" + select BPF + select IRQ_WORK + select TASKS_TRACE_RCU + select BINARY_PRINTF + select NET_SOCK_MSG if INET + default n + help + Enable the bpf() system call that allows to manipulate BPF programs + and maps via file descriptors. + +config BPF_JIT + bool "Enable BPF Just In Time compiler" + depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT + depends on MODULES + help + BPF programs are normally handled by a BPF interpreter. This option + allows the kernel to generate native code when a program is loaded + into the kernel. This will significantly speed-up processing of BPF + programs. + + Note, an admin should enable this feature changing: + /proc/sys/net/core/bpf_jit_enable + /proc/sys/net/core/bpf_jit_harden (optional) + /proc/sys/net/core/bpf_jit_kallsyms (optional) + +config BPF_JIT_ALWAYS_ON + bool "Permanently enable BPF JIT and remove BPF interpreter" + depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT + help + Enables BPF JIT and removes BPF interpreter to avoid speculative + execution of BPF instructions by the interpreter. + +config BPF_JIT_DEFAULT_ON + def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON + depends on HAVE_EBPF_JIT && BPF_JIT + +source "kernel/bpf/preload/Kconfig" + +config BPF_LSM + bool "Enable BPF LSM Instrumentation" + depends on BPF_EVENTS + depends on BPF_SYSCALL + depends on SECURITY + depends on BPF_JIT + help + Enables instrumentation of the security hooks with BPF programs for + implementing dynamic MAC and Audit Policies. + + If you are unsure how to answer this question, answer N. + +endmenu # "BPF subsystem" -- cgit From 08389d888287c3823f80b0216766b71e17f0aba5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 11 May 2021 22:35:17 +0200 Subject: bpf: Add kconfig knob for disabling unpriv bpf by default Add a kconfig knob which allows for unprivileged bpf to be disabled by default. If set, the knob sets /proc/sys/kernel/unprivileged_bpf_disabled to value of 2. This still allows a transition of 2 -> {0,1} through an admin. Similarly, this also still keeps 1 -> {1} behavior intact, so that once set to permanently disabled, it cannot be undone aside from a reboot. We've also added extra2 with max of 2 for the procfs handler, so that an admin still has a chance to toggle between 0 <-> 2. Either way, as an additional alternative, applications can make use of CAP_BPF that we added a while ago. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/74ec548079189e4e4dffaeb42b8987bb3c852eee.1620765074.git.daniel@iogearbox.net --- kernel/bpf/Kconfig | 10 ++++++++++ kernel/bpf/syscall.c | 3 ++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index b4edaefc6255..26b591e23f16 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -61,6 +61,16 @@ config BPF_JIT_DEFAULT_ON def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON depends on HAVE_EBPF_JIT && BPF_JIT +config BPF_UNPRIV_DEFAULT_OFF + bool "Disable unprivileged BPF by default" + depends on BPF_SYSCALL + help + Disables unprivileged BPF by default by setting the corresponding + /proc/sys/kernel/unprivileged_bpf_disabled knob to 2. An admin can + still reenable it by setting it to 0 later on, or permanently + disable it by setting it to 1 (from which no other transition to + 0 is possible anymore). 
+ source "kernel/bpf/preload/Kconfig" config BPF_LSM diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 941ca06d9dfa..ea04b0deb5ce 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -50,7 +50,8 @@ static DEFINE_SPINLOCK(map_idr_lock); static DEFINE_IDR(link_idr); static DEFINE_SPINLOCK(link_idr_lock); -int sysctl_unprivileged_bpf_disabled __read_mostly; +int sysctl_unprivileged_bpf_disabled __read_mostly = + IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0; static const struct bpf_map_ops * const bpf_map_types[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) -- cgit From 35e3815fa8102fab4dee75f3547472c66581125d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 29 Apr 2021 13:47:12 +0200 Subject: bpf: Add deny list of btf ids check for tracing programs The recursion check in __bpf_prog_enter and __bpf_prog_exit leaves some (not inlined) functions unprotected: In __bpf_prog_enter: - migrate_disable is called before prog->active is checked In __bpf_prog_exit: - migrate_enable,rcu_read_unlock_strict are called after prog->active is decreased When attaching trampoline to them we get panic like: traps: PANIC: double fault, error_code: 0x0 double fault: 0000 [#1] SMP PTI RIP: 0010:__bpf_prog_enter+0x4/0x50 ... Call Trace: bpf_trampoline_6442466513_0+0x18/0x1000 migrate_disable+0x5/0x50 __bpf_prog_enter+0x9/0x50 bpf_trampoline_6442466513_0+0x18/0x1000 migrate_disable+0x5/0x50 __bpf_prog_enter+0x9/0x50 bpf_trampoline_6442466513_0+0x18/0x1000 migrate_disable+0x5/0x50 __bpf_prog_enter+0x9/0x50 bpf_trampoline_6442466513_0+0x18/0x1000 migrate_disable+0x5/0x50 ... Fixing this by adding deny list of btf ids for tracing programs and checking btf id during program verification. Adding above functions to this list. Suggested-by: Alexei Starovoitov Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210429114712.43783-1-jolsa@kernel.org --- kernel/bpf/verifier.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9352a1b7de2d..c58598ef4b5b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13196,6 +13196,17 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, return 0; } +BTF_SET_START(btf_id_deny) +BTF_ID_UNUSED +#ifdef CONFIG_SMP +BTF_ID(func, migrate_disable) +BTF_ID(func, migrate_enable) +#endif +#if !defined CONFIG_PREEMPT_RCU && !defined CONFIG_TINY_RCU +BTF_ID(func, rcu_read_unlock_strict) +#endif +BTF_SET_END(btf_id_deny) + static int check_attach_btf_id(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; @@ -13255,6 +13266,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) ret = bpf_lsm_verify_prog(&env->log, prog); if (ret < 0) return ret; + } else if (prog->type == BPF_PROG_TYPE_TRACING && + btf_id_set_contains(&btf_id_deny, btf_id)) { + return -EINVAL; } key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id); -- cgit From e2d5b2bb769fa5f500760caba76436ba3a10a895 Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Tue, 11 May 2021 10:10:54 +0200 Subject: bpf: Fix nested bpf_bprintf_prepare with more per-cpu buffers The bpf_seq_printf, bpf_trace_printk and bpf_snprintf helpers share one per-cpu buffer that they use to store temporary data (arguments to bprintf). They "get" that buffer with try_get_fmt_tmp_buf and "put" it by the end of their scope with bpf_bprintf_cleanup. 
If one of these helpers gets called within the scope of one of these helpers, for example: a first bpf program gets called, uses bpf_trace_printk which calls raw_spin_lock_irqsave which is traced by another bpf program that calls bpf_snprintf, then the second "get" fails. Essentially, these helpers are not re-entrant. They would return -EBUSY and print a warning message once. This patch triples the number of bprintf buffers to allow three levels of nesting. This is very similar to what was done for tracepoints in "9594dc3c7e7 bpf: fix nested bpf tracepoints with per-cpu data" Fixes: d9c9e4db186a ("bpf: Factorize bpf_trace_printk and bpf_seq_printf") Reported-by: syzbot+63122d0bc347f18c1884@syzkaller.appspotmail.com Signed-off-by: Florent Revest Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210511081054.2125874-1-revest@chromium.org --- kernel/bpf/helpers.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 544773970dbc..ef658a9ea5c9 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -696,34 +696,35 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, */ #define MAX_PRINTF_BUF_LEN 512 -struct bpf_printf_buf { - char tmp_buf[MAX_PRINTF_BUF_LEN]; +/* Support executing three nested bprintf helper calls on a given CPU */ +struct bpf_bprintf_buffers { + char tmp_bufs[3][MAX_PRINTF_BUF_LEN]; }; -static DEFINE_PER_CPU(struct bpf_printf_buf, bpf_printf_buf); -static DEFINE_PER_CPU(int, bpf_printf_buf_used); +static DEFINE_PER_CPU(struct bpf_bprintf_buffers, bpf_bprintf_bufs); +static DEFINE_PER_CPU(int, bpf_bprintf_nest_level); static int try_get_fmt_tmp_buf(char **tmp_buf) { - struct bpf_printf_buf *bufs; - int used; + struct bpf_bprintf_buffers *bufs; + int nest_level; preempt_disable(); - used = this_cpu_inc_return(bpf_printf_buf_used); - if (WARN_ON_ONCE(used > 1)) { - this_cpu_dec(bpf_printf_buf_used); + nest_level = this_cpu_inc_return(bpf_bprintf_nest_level); + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bufs->tmp_bufs))) { + this_cpu_dec(bpf_bprintf_nest_level); preempt_enable(); return -EBUSY; } - bufs = this_cpu_ptr(&bpf_printf_buf); - *tmp_buf = bufs->tmp_buf; + bufs = this_cpu_ptr(&bpf_bprintf_bufs); + *tmp_buf = bufs->tmp_bufs[nest_level - 1]; return 0; } void bpf_bprintf_cleanup(void) { - if (this_cpu_read(bpf_printf_buf_used)) { - this_cpu_dec(bpf_printf_buf_used); + if (this_cpu_read(bpf_bprintf_nest_level)) { + this_cpu_dec(bpf_bprintf_nest_level); preempt_enable(); } } -- cgit From 6bdacdb48e94ff26c03c6eeeef48c03c5e2f7dd4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 12 May 2021 20:57:14 +0200 Subject: bpf: Fix BPF_JIT kconfig symbol dependency Randy reported a randconfig build error recently on i386: ld: arch/x86/net/bpf_jit_comp32.o: in function `do_jit': bpf_jit_comp32.c:(.text+0x28c9): undefined reference to `__bpf_call_base' ld: arch/x86/net/bpf_jit_comp32.o: in function `bpf_int_jit_compile': bpf_jit_comp32.c:(.text+0x3694): undefined reference to `bpf_jit_blind_constants' ld: bpf_jit_comp32.c:(.text+0x3719): undefined reference to `bpf_jit_binary_free' ld: bpf_jit_comp32.c:(.text+0x3745): undefined reference to `bpf_jit_binary_alloc' ld: bpf_jit_comp32.c:(.text+0x37d3): undefined reference to `bpf_jit_prog_release_other' [...] 
The cause was that b24abcff918a ("bpf, kconfig: Add consolidated menu entry for bpf with core options") moved BPF_JIT from net/Kconfig into kernel/bpf/Kconfig and previously BPF_JIT was guarded by a 'if NET'. However, there is no actual dependency on NET, it's just that menuconfig NET selects BPF. And the latter in turn causes kernel/bpf/core.o to be built which contains above symbols. Randy's randconfig didn't have NET set, and BPF wasn't either, but BPF_JIT otoh was. Detangle this by making BPF_JIT depend on BPF instead. arm64 was the only arch that pulled in its JIT in net/ via obj-$(CONFIG_NET), all others unconditionally pull this dir in via obj-y. Do the same since CONFIG_NET guard there is really useless as we compiled the JIT via obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o anyway. Fixes: b24abcff918a ("bpf, kconfig: Add consolidated menu entry for bpf with core options") Reported-by: Randy Dunlap Signed-off-by: Daniel Borkmann Acked-by: Randy Dunlap Tested-by: Randy Dunlap --- kernel/bpf/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index 26b591e23f16..bd04f4a44c01 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -37,6 +37,7 @@ config BPF_SYSCALL config BPF_JIT bool "Enable BPF Just In Time compiler" + depends on BPF depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT depends on MODULES help -- cgit From 8afcc19fbf083a8459284d9a29b4b5ac1cb2396c Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Mon, 17 May 2021 11:28:29 +0200 Subject: bpf: Clarify a bpf_bprintf_prepare macro The per-cpu buffers contain bprintf data rather than printf arguments. The macro name and comment were a bit confusing, this rewords them in a clearer way. Signed-off-by: Florent Revest Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210517092830.1026418-1-revest@chromium.org --- kernel/bpf/helpers.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ef658a9ea5c9..3a5ab614cbb0 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -692,13 +692,14 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, return -EINVAL; } -/* Per-cpu temp buffers which can be used by printf-like helpers for %s or %p +/* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary + * arguments representation. 
*/ -#define MAX_PRINTF_BUF_LEN 512 +#define MAX_BPRINTF_BUF_LEN 512 /* Support executing three nested bprintf helper calls on a given CPU */ struct bpf_bprintf_buffers { - char tmp_bufs[3][MAX_PRINTF_BUF_LEN]; + char tmp_bufs[3][MAX_BPRINTF_BUF_LEN]; }; static DEFINE_PER_CPU(struct bpf_bprintf_buffers, bpf_bprintf_bufs); static DEFINE_PER_CPU(int, bpf_bprintf_nest_level); @@ -761,7 +762,7 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, if (num_args && try_get_fmt_tmp_buf(&tmp_buf)) return -EBUSY; - tmp_buf_end = tmp_buf + MAX_PRINTF_BUF_LEN; + tmp_buf_end = tmp_buf + MAX_BPRINTF_BUF_LEN; *bin_args = (u32 *)tmp_buf; } -- cgit From 0af02eb2a7d76ca85a1ecaf4b3775e2c86408fab Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Mon, 17 May 2021 11:28:30 +0200 Subject: bpf: Avoid using ARRAY_SIZE on an uninitialized pointer The cppcheck static code analysis reported the following error: if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bufs->tmp_bufs))) { ^ ARRAY_SIZE is a macro that expands to sizeofs, so bufs is not actually dereferenced at runtime, and the code is actually safe. But to keep things tidy, this patch removes the need for a call to ARRAY_SIZE by extracting the size of the array into a macro. Cppcheck should no longer be confused and the code ends up being a bit cleaner. Fixes: e2d5b2bb769f ("bpf: Fix nested bpf_bprintf_prepare with more per-cpu buffers") Reported-by: kernel test robot Signed-off-by: Florent Revest Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210517092830.1026418-2-revest@chromium.org --- kernel/bpf/helpers.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 3a5ab614cbb0..73443498d88f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -698,8 +698,9 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, #define MAX_BPRINTF_BUF_LEN 512 /* Support executing three nested bprintf helper calls on a given CPU */ +#define MAX_BPRINTF_NEST_LEVEL 3 struct bpf_bprintf_buffers { - char tmp_bufs[3][MAX_BPRINTF_BUF_LEN]; + char tmp_bufs[MAX_BPRINTF_NEST_LEVEL][MAX_BPRINTF_BUF_LEN]; }; static DEFINE_PER_CPU(struct bpf_bprintf_buffers, bpf_bprintf_bufs); static DEFINE_PER_CPU(int, bpf_bprintf_nest_level); @@ -711,7 +712,7 @@ static int try_get_fmt_tmp_buf(char **tmp_buf) preempt_disable(); nest_level = this_cpu_inc_return(bpf_bprintf_nest_level); - if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bufs->tmp_bufs))) { + if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) { this_cpu_dec(bpf_bprintf_nest_level); preempt_enable(); return -EBUSY; -- cgit From ceb11679d9fcf3fdb358a310a38760fcbe9b63ed Mon Sep 17 00:00:00 2001 From: Yinjun Zhang Date: Thu, 20 May 2021 10:58:34 +0200 Subject: bpf, offload: Reorder offload callback 'prepare' in verifier Commit 4976b718c355 ("bpf: Introduce pseudo_btf_id") switched the order of resolve_pseudo_ldimm(), in which some pseudo instructions are rewritten. Thus those rewritten instructions cannot be passed to driver via 'prepare' offload callback. Reorder the 'prepare' offload callback to fix it. 
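For reference, a minimal sketch of the kind of pseudo instruction that resolve_pseudo_ldimm64() rewrites, using the insn helper macros from the kernel's samples/selftests; map_fd is assumed to be a valid map file descriptor:

  struct bpf_insn insns[] = {
          /* ld_imm64 with src_reg = BPF_PSEUDO_MAP_FD and imm = map_fd; the
           * verifier replaces the immediate with the in-kernel bpf_map address,
           * which is the form the offload driver's 'prepare' callback needs
           * to see. */
          BPF_LD_MAP_FD(BPF_REG_1, map_fd),
          BPF_MOV64_IMM(BPF_REG_0, 0),
          BPF_EXIT_INSN(),
  };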
Fixes: 4976b718c355 ("bpf: Introduce pseudo_btf_id") Signed-off-by: Yinjun Zhang Signed-off-by: Simon Horman Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210520085834.15023-1-simon.horman@netronome.com --- kernel/bpf/verifier.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c58598ef4b5b..09849e43f035 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13368,12 +13368,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; - if (bpf_prog_is_dev_bound(env->prog->aux)) { - ret = bpf_prog_offload_verifier_prep(env->prog); - if (ret) - goto skip_full_check; - } - env->explored_states = kvcalloc(state_htab_size(env), sizeof(struct bpf_verifier_state_list *), GFP_USER); @@ -13401,6 +13395,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (ret < 0) goto skip_full_check; + if (bpf_prog_is_dev_bound(env->prog->aux)) { + ret = bpf_prog_offload_verifier_prep(env->prog); + if (ret) + goto skip_full_check; + } + ret = check_cfg(env); if (ret < 0) goto skip_full_check; -- cgit From 5c9d706f61336d9f7f285df64c734af778c70f39 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 25 May 2021 20:35:29 +0200 Subject: bpf: Fix BPF_LSM kconfig symbol dependency Similarly as 6bdacdb48e94 ("bpf: Fix BPF_JIT kconfig symbol dependency") we need to detangle the hard BPF_LSM dependency on NET. This was previously implicit by its dependency on BPF_JIT which itself was dependent on NET (but without any actual/real hard dependency code-wise). Given the latter was lifted, so should be the former as BPF_LSMs could well exist on net-less systems. This therefore also fixes a randconfig build error recently reported by Randy: ld: kernel/bpf/bpf_lsm.o: in function `bpf_lsm_func_proto': bpf_lsm.c:(.text+0x1a0): undefined reference to `bpf_sk_storage_get_proto' ld: bpf_lsm.c:(.text+0x1b8): undefined reference to `bpf_sk_storage_delete_proto' [...] Fixes: b24abcff918a ("bpf, kconfig: Add consolidated menu entry for bpf with core options") Reported-by: Randy Dunlap Signed-off-by: Daniel Borkmann Acked-by: Randy Dunlap Tested-by: Randy Dunlap --- kernel/bpf/bpf_lsm.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 5efb2b24012c..da471bf01b97 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -107,10 +107,12 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_inode_storage_get_proto; case BPF_FUNC_inode_storage_delete: return &bpf_inode_storage_delete_proto; +#ifdef CONFIG_NET case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; +#endif /* CONFIG_NET */ case BPF_FUNC_spin_lock: return &bpf_spin_lock_proto; case BPF_FUNC_spin_unlock: -- cgit From 3d0220f6861d713213b015b582e9f21e5b28d2e0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 21 May 2021 10:17:36 +0000 Subject: bpf: Wrap aux data inside bpf_sanitize_info container Add a container structure struct bpf_sanitize_info which holds the current aux info, and update call-sites to sanitize_ptr_alu() to pass it in. This is needed for passing in additional state later on. 
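The additional state arrives in a follow-up patch further below; for orientation, the container there ends up as:

  struct bpf_sanitize_info {
          struct bpf_insn_aux_data aux;
          bool mask_to_left;
  };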
Signed-off-by: Daniel Borkmann Reviewed-by: Piotr Krysiuk Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 09849e43f035..98690f5367f9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6486,15 +6486,19 @@ static bool sanitize_needed(u8 opcode) return opcode == BPF_ADD || opcode == BPF_SUB; } +struct bpf_sanitize_info { + struct bpf_insn_aux_data aux; +}; + static int sanitize_ptr_alu(struct bpf_verifier_env *env, struct bpf_insn *insn, const struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg, struct bpf_reg_state *dst_reg, - struct bpf_insn_aux_data *tmp_aux, + struct bpf_sanitize_info *info, const bool commit_window) { - struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : tmp_aux; + struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : &info->aux; struct bpf_verifier_state *vstate = env->cur_state; bool off_is_imm = tnum_is_const(off_reg->var_off); bool off_is_neg = off_reg->smin_value < 0; @@ -6523,8 +6527,8 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, /* In commit phase we narrow the masking window based on * the observed pointer move after the simulated operation. */ - alu_state = tmp_aux->alu_state; - alu_limit = abs(tmp_aux->alu_limit - alu_limit); + alu_state = info->aux.alu_state; + alu_limit = abs(info->aux.alu_limit - alu_limit); } else { alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0; alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0; @@ -6685,7 +6689,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value, umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; - struct bpf_insn_aux_data tmp_aux = {}; + struct bpf_sanitize_info info = {}; u8 opcode = BPF_OP(insn->code); u32 dst = insn->dst_reg; int ret; @@ -6754,7 +6758,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (sanitize_needed(opcode)) { ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg, - &tmp_aux, false); + &info, false); if (ret < 0) return sanitize_err(env, insn, ret, off_reg, dst_reg); } @@ -6895,7 +6899,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; if (sanitize_needed(opcode)) { ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg, - &tmp_aux, true); + &info, true); if (ret < 0) return sanitize_err(env, insn, ret, off_reg, dst_reg); } -- cgit From bb01a1bba579b4b1c5566af24d95f1767859771e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 21 May 2021 10:19:22 +0000 Subject: bpf: Fix mask direction swap upon off reg sign change Masking direction as indicated via mask_to_left is considered to be calculated once and then used to derive pointer limits. Thus, this needs to be placed into bpf_sanitize_info instead so we can pass it to sanitize_ptr_alu() call after the pointer move. Piotr noticed a corner case where the off reg causes masking direction change which then results in an incorrect final aux->alu_limit. 
Fixes: 7fedb63a8307 ("bpf: Tighten speculative pointer arithmetic mask") Reported-by: Piotr Krysiuk Signed-off-by: Daniel Borkmann Reviewed-by: Piotr Krysiuk Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 98690f5367f9..8574cb60915a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6409,18 +6409,10 @@ enum { }; static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, - const struct bpf_reg_state *off_reg, - u32 *alu_limit, u8 opcode) + u32 *alu_limit, bool mask_to_left) { - bool off_is_neg = off_reg->smin_value < 0; - bool mask_to_left = (opcode == BPF_ADD && off_is_neg) || - (opcode == BPF_SUB && !off_is_neg); u32 max = 0, ptr_limit = 0; - if (!tnum_is_const(off_reg->var_off) && - (off_reg->smin_value < 0) != (off_reg->smax_value < 0)) - return REASON_BOUNDS; - switch (ptr_reg->type) { case PTR_TO_STACK: /* Offset 0 is out-of-bounds, but acceptable start for the @@ -6488,6 +6480,7 @@ static bool sanitize_needed(u8 opcode) struct bpf_sanitize_info { struct bpf_insn_aux_data aux; + bool mask_to_left; }; static int sanitize_ptr_alu(struct bpf_verifier_env *env, @@ -6519,7 +6512,16 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, if (vstate->speculative) goto do_sim; - err = retrieve_ptr_limit(ptr_reg, off_reg, &alu_limit, opcode); + if (!commit_window) { + if (!tnum_is_const(off_reg->var_off) && + (off_reg->smin_value < 0) != (off_reg->smax_value < 0)) + return REASON_BOUNDS; + + info->mask_to_left = (opcode == BPF_ADD && off_is_neg) || + (opcode == BPF_SUB && !off_is_neg); + } + + err = retrieve_ptr_limit(ptr_reg, &alu_limit, info->mask_to_left); if (err < 0) return err; -- cgit From a7036191277f9fa68d92f2071ddc38c09b1e5ee5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 4 May 2021 08:58:25 +0000 Subject: bpf: No need to simulate speculative domain for immediates In 801c6058d14a ("bpf: Fix leakage of uninitialized bpf stack under speculation") we replaced masking logic with direct loads of immediates if the register is a known constant. Given in this case we do not apply any masking, there is also no reason for the operation to be truncated under the speculative domain. Therefore, there is also zero reason for the verifier to branch-off and simulate this case, it only needs to do it for unknown but bounded scalars. As a side-effect, this also enables few test cases that were previously rejected due to simulation under zero truncation. Signed-off-by: Daniel Borkmann Reviewed-by: Piotr Krysiuk Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8574cb60915a..94ba5163d4c5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6545,8 +6545,12 @@ do_sim: /* If we're in commit phase, we're done here given we already * pushed the truncated dst_reg into the speculative verification * stack. + * + * Also, when register is a known constant, we rewrite register-based + * operation to immediate-based, and thus do not need masking (and as + * a consequence, do not need to simulate the zero-truncation either). 
*/ - if (commit_window) + if (commit_window || off_is_imm) return 0; /* Simulate and find potential out-of-bounds access under -- cgit From ff40e51043af63715ab413995ff46996ecf9583f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 28 May 2021 09:16:31 +0000 Subject: bpf, lockdown, audit: Fix buggy SELinux lockdown permission checks Commit 59438b46471a ("security,lockdown,selinux: implement SELinux lockdown") added an implementation of the locked_down LSM hook to SELinux, with the aim to restrict which domains are allowed to perform operations that would breach lockdown. This is indirectly also getting audit subsystem involved to report events. The latter is problematic, as reported by Ondrej and Serhei, since it can bring down the whole system via audit: 1) The audit events that are triggered due to calls to security_locked_down() can OOM kill a machine, see below details [0]. 2) It also seems to be causing a deadlock via avc_has_perm()/slow_avc_audit() when trying to wake up kauditd, for example, when using trace_sched_switch() tracepoint, see details in [1]. Triggering this was not via some hypothetical corner case, but with existing tools like runqlat & runqslower from bcc, for example, which make use of this tracepoint. Rough call sequence goes like: rq_lock(rq) -> -------------------------+ trace_sched_switch() -> | bpf_prog_xyz() -> +-> deadlock selinux_lockdown() -> | audit_log_end() -> | wake_up_interruptible() -> | try_to_wake_up() -> | rq_lock(rq) --------------+ What's worse is that the intention of 59438b46471a to further restrict lockdown settings for specific applications in respect to the global lockdown policy is completely broken for BPF. The SELinux policy rule for the current lockdown check looks something like this: allow : lockdown { }; However, this doesn't match with the 'current' task where the security_locked_down() is executed, example: httpd does a syscall. There is a tracing program attached to the syscall which triggers a BPF program to run, which ends up doing a bpf_probe_read_kernel{,_str}() helper call. The selinux_lockdown() hook does the permission check against 'current', that is, httpd in this example. httpd has literally zero relation to this tracing program, and it would be nonsensical having to write an SELinux policy rule against httpd to let the tracing helper pass. The policy in this case needs to be against the entity that is installing the BPF program. For example, if bpftrace would generate a histogram of syscall counts by user space application: bpftrace -e 'tracepoint:raw_syscalls:sys_enter { @[comm] = count(); }' bpftrace would then go and generate a BPF program from this internally. One way of doing it [for the sake of the example] could be to call bpf_get_current_task() helper and then access current->comm via one of bpf_probe_read_kernel{,_str}() helpers. So the program itself has nothing to do with httpd or any other random app doing a syscall here. The BPF program _explicitly initiated_ the lockdown check. The allow/deny policy belongs in the context of bpftrace: meaning, you want to grant bpftrace access to use these helpers, but other tracers on the system like my_random_tracer _not_. Therefore fix all three issues at the same time by taking a completely different approach for the security_locked_down() hook, that is, move the check into the program verification phase where we actually retrieve the BPF func proto. This also reliably gets the task (current) that is trying to install the BPF tracing program, e.g. 
bpftrace/bcc/perf/systemtap/etc, and it also fixes the OOM since we're moving this out of the BPF helper's fast-path which can be called several millions of times per second. The check is then also in line with other security_locked_down() hooks in the system where the enforcement is performed at open/load time, for example, open_kcore() for /proc/kcore access or module_sig_check() for module signatures just to pick few random ones. What's out of scope in the fix as well as in other security_locked_down() hook locations /outside/ of BPF subsystem is that if the lockdown policy changes on the fly there is no retrospective action. This requires a different discussion, potentially complex infrastructure, and it's also not clear whether this can be solved generically. Either way, it is out of scope for a suitable stable fix which this one is targeting. Note that the breakage is specifically on 59438b46471a where it started to rely on 'current' as UAPI behavior, and _not_ earlier infrastructure such as 9d1f8be5cf42 ("bpf: Restrict bpf when kernel lockdown is in confidentiality mode"). [0] https://bugzilla.redhat.com/show_bug.cgi?id=1955585, Jakub Hrozek says: I starting seeing this with F-34. When I run a container that is traced with BPF to record the syscalls it is doing, auditd is flooded with messages like: type=AVC msg=audit(1619784520.593:282387): avc: denied { confidentiality } for pid=476 comm="auditd" lockdown_reason="use of bpf to read kernel RAM" scontext=system_u:system_r:auditd_t:s0 tcontext=system_u:system_r:auditd_t:s0 tclass=lockdown permissive=0 This seems to be leading to auditd running out of space in the backlog buffer and eventually OOMs the machine. [...] auditd running at 99% CPU presumably processing all the messages, eventually I get: Apr 30 12:20:42 fedora kernel: audit: backlog limit exceeded Apr 30 12:20:42 fedora kernel: audit: backlog limit exceeded Apr 30 12:20:42 fedora kernel: audit: audit_backlog=2152579 > audit_backlog_limit=64 Apr 30 12:20:42 fedora kernel: audit: audit_backlog=2152626 > audit_backlog_limit=64 Apr 30 12:20:42 fedora kernel: audit: audit_backlog=2152694 > audit_backlog_limit=64 Apr 30 12:20:42 fedora kernel: audit: audit_lost=6878426 audit_rate_limit=0 audit_backlog_limit=64 Apr 30 12:20:45 fedora kernel: oci-seccomp-bpf invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=-1000 Apr 30 12:20:45 fedora kernel: CPU: 0 PID: 13284 Comm: oci-seccomp-bpf Not tainted 5.11.12-300.fc34.x86_64 #1 Apr 30 12:20:45 fedora kernel: Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-2.fc32 04/01/2014 [...] [1] https://lore.kernel.org/linux-audit/CANYvDQN7H5tVp47fbYcRasv4XF07eUbsDwT_eDCHXJUj43J7jQ@mail.gmail.com/, Serhei Makarov says: Upstream kernel 5.11.0-rc7 and later was found to deadlock during a bpf_probe_read_compat() call within a sched_switch tracepoint. The problem is reproducible with the reg_alloc3 testcase from SystemTap's BPF backend testsuite on x86_64 as well as the runqlat, runqslower tools from bcc on ppc64le. Example stack trace: [...] [ 730.868702] stack backtrace: [ 730.869590] CPU: 1 PID: 701 Comm: in:imjournal Not tainted, 5.12.0-0.rc2.20210309git144c79ef3353.166.fc35.x86_64 #1 [ 730.871605] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 [ 730.873278] Call Trace: [ 730.873770] dump_stack+0x7f/0xa1 [ 730.874433] check_noncircular+0xdf/0x100 [ 730.875232] __lock_acquire+0x1202/0x1e10 [ 730.876031] ? 
__lock_acquire+0xfc0/0x1e10 [ 730.876844] lock_acquire+0xc2/0x3a0 [ 730.877551] ? __wake_up_common_lock+0x52/0x90 [ 730.878434] ? lock_acquire+0xc2/0x3a0 [ 730.879186] ? lock_is_held_type+0xa7/0x120 [ 730.880044] ? skb_queue_tail+0x1b/0x50 [ 730.880800] _raw_spin_lock_irqsave+0x4d/0x90 [ 730.881656] ? __wake_up_common_lock+0x52/0x90 [ 730.882532] __wake_up_common_lock+0x52/0x90 [ 730.883375] audit_log_end+0x5b/0x100 [ 730.884104] slow_avc_audit+0x69/0x90 [ 730.884836] avc_has_perm+0x8b/0xb0 [ 730.885532] selinux_lockdown+0xa5/0xd0 [ 730.886297] security_locked_down+0x20/0x40 [ 730.887133] bpf_probe_read_compat+0x66/0xd0 [ 730.887983] bpf_prog_250599c5469ac7b5+0x10f/0x820 [ 730.888917] trace_call_bpf+0xe9/0x240 [ 730.889672] perf_trace_run_bpf_submit+0x4d/0xc0 [ 730.890579] perf_trace_sched_switch+0x142/0x180 [ 730.891485] ? __schedule+0x6d8/0xb20 [ 730.892209] __schedule+0x6d8/0xb20 [ 730.892899] schedule+0x5b/0xc0 [ 730.893522] exit_to_user_mode_prepare+0x11d/0x240 [ 730.894457] syscall_exit_to_user_mode+0x27/0x70 [ 730.895361] entry_SYSCALL_64_after_hwframe+0x44/0xae [...] Fixes: 59438b46471a ("security,lockdown,selinux: implement SELinux lockdown") Reported-by: Ondrej Mosnacek Reported-by: Jakub Hrozek Reported-by: Serhei Makarov Reported-by: Jiri Olsa Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Tested-by: Jiri Olsa Cc: Paul Moore Cc: James Morris Cc: Jerome Marchand Cc: Frank Eigler Cc: Linus Torvalds Link: https://lore.kernel.org/bpf/01135120-8bf7-df2e-cff0-1d73f1f841c3@iogearbox.net --- kernel/bpf/helpers.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 73443498d88f..a2f1f15ce432 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "../../lib/kstrtox.h" @@ -1069,11 +1070,13 @@ bpf_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: - return &bpf_probe_read_kernel_proto; + return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + NULL : &bpf_probe_read_kernel_proto; case BPF_FUNC_probe_read_user_str: return &bpf_probe_read_user_str_proto; case BPF_FUNC_probe_read_kernel_str: - return &bpf_probe_read_kernel_str_proto; + return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + NULL : &bpf_probe_read_kernel_str_proto; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; case BPF_FUNC_snprintf: -- cgit From d203b0fd863a2261e5d00b97f3d060c4c2a6db71 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 28 May 2021 13:03:30 +0000 Subject: bpf: Inherit expanded/patched seen count from old aux data Instead of relying on current env->pass_cnt, use the seen count from the old aux data in adjust_insn_aux_data(), and expand it to the new range of patched instructions. This change is valid given we always expand 1:n with n>=1, so what applies to the old/original instruction needs to apply for the replacement as well. Not relying on env->pass_cnt is a prerequisite for a later change where we want to avoid marking an instruction seen when verified under speculative execution path. 
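As one concrete example of such a 1:n expansion (not introduced by this patch), the verifier's fixup pass guards runtime divisions against a zero divisor by rewriting a single instruction into a short sequence, roughly:

  /* original instruction: r2 /= r3 */
  BPF_RAW_INSN(BPF_JMP | BPF_JNE | BPF_K, BPF_REG_3, 0, 2, 0), /* if r3 != 0 goto +2 */
  BPF_ALU32_REG(BPF_XOR, BPF_REG_2, BPF_REG_2),                /* r2 = 0 */
  BPF_JMP_IMM(BPF_JA, 0, 0, 1),                                /* goto +1 */
  BPF_ALU64_REG(BPF_DIV, BPF_REG_2, BPF_REG_3),                /* r2 /= r3 */

With this change, all replacement instructions inherit the seen count of the original instruction rather than env->pass_cnt.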
Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Reviewed-by: Benedict Schlueter Reviewed-by: Piotr Krysiuk Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 94ba5163d4c5..f93c7befb5dc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11366,6 +11366,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, { struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; struct bpf_insn *insn = new_prog->insnsi; + u32 old_seen = old_data[off].seen; u32 prog_len; int i; @@ -11386,7 +11387,8 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, memcpy(new_data + off + cnt - 1, old_data + off, sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); for (i = off; i < off + cnt - 1; i++) { - new_data[i].seen = env->pass_cnt; + /* Expand insni[off]'s seen count to the patched range. */ + new_data[i].seen = old_seen; new_data[i].zext_dst = insn_has_def32(env, insn + i); } env->insn_aux_data = new_data; -- cgit From fe9a5ca7e370e613a9a75a13008a3845ea759d6e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 28 May 2021 13:47:27 +0000 Subject: bpf: Do not mark insn as seen under speculative path verification ... in such circumstances, we do not want to mark the instruction as seen given the goal is still to jmp-1 rewrite/sanitize dead code, if it is not reachable from the non-speculative path verification. We do however want to verify it for safety regardless. With the patch as-is all the insns that have been marked as seen before the patch will also be marked as seen after the patch (just with a potentially different non-zero count). An upcoming patch will also verify paths that are unreachable in the non-speculative domain, hence this extension is needed. Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Reviewed-by: Benedict Schlueter Reviewed-by: Piotr Krysiuk Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f93c7befb5dc..af88d9b9c014 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6572,6 +6572,19 @@ do_sim: return !ret ? REASON_STACK : 0; } +static void sanitize_mark_insn_seen(struct bpf_verifier_env *env) +{ + struct bpf_verifier_state *vstate = env->cur_state; + + /* If we simulate paths under speculation, we don't update the + * insn as 'seen' such that when we verify unreachable paths in + * the non-speculative domain, sanitize_dead_code() can still + * rewrite/sanitize them. 
+ */ + if (!vstate->speculative) + env->insn_aux_data[env->insn_idx].seen = env->pass_cnt; +} + static int sanitize_err(struct bpf_verifier_env *env, const struct bpf_insn *insn, int reason, const struct bpf_reg_state *off_reg, @@ -10630,7 +10643,7 @@ static int do_check(struct bpf_verifier_env *env) } regs = cur_regs(env); - env->insn_aux_data[env->insn_idx].seen = env->pass_cnt; + sanitize_mark_insn_seen(env); prev_insn_idx = env->insn_idx; if (class == BPF_ALU || class == BPF_ALU64) { @@ -10857,7 +10870,7 @@ process_bpf_exit: return err; env->insn_idx++; - env->insn_aux_data[env->insn_idx].seen = env->pass_cnt; + sanitize_mark_insn_seen(env); } else { verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; @@ -12712,6 +12725,9 @@ static void free_states(struct bpf_verifier_env *env) * insn_aux_data was touched. These variables are compared to clear temporary * data from failed pass. For testing and experiments do_check_common() can be * run multiple times even when prior attempt to verify is unsuccessful. + * + * Note that special handling is needed on !env->bypass_spec_v1 if this is + * ever called outside of error path with subsequent program rejection. */ static void sanitize_insn_aux_data(struct bpf_verifier_env *env) { -- cgit From 9183671af6dbf60a1219371d4ed73e23f43b49db Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 28 May 2021 15:47:32 +0000 Subject: bpf: Fix leakage under speculation on mispredicted branches The verifier only enumerates valid control-flow paths and skips paths that are unreachable in the non-speculative domain. And so it can miss issues under speculative execution on mispredicted branches. For example, a type confusion has been demonstrated with the following crafted program: // r0 = pointer to a map array entry // r6 = pointer to readable stack slot // r9 = scalar controlled by attacker 1: r0 = *(u64 *)(r0) // cache miss 2: if r0 != 0x0 goto line 4 3: r6 = r9 4: if r0 != 0x1 goto line 6 5: r9 = *(u8 *)(r6) 6: // leak r9 Since line 3 runs iff r0 == 0 and line 5 runs iff r0 == 1, the verifier concludes that the pointer dereference on line 5 is safe. But: if the attacker trains both the branches to fall-through, such that the following is speculatively executed ... r6 = r9 r9 = *(u8 *)(r6) // leak r9 ... then the program will dereference an attacker-controlled value and could leak its content under speculative execution via side-channel. This requires to mistrain the branch predictor, which can be rather tricky, because the branches are mutually exclusive. However such training can be done at congruent addresses in user space using different branches that are not mutually exclusive. That is, by training branches in user space ... A: if r0 != 0x0 goto line C B: ... C: if r0 != 0x0 goto line D D: ... ... such that addresses A and C collide to the same CPU branch prediction entries in the PHT (pattern history table) as those of the BPF program's lines 2 and 4, respectively. A non-privileged attacker could simply brute force such collisions in the PHT until observing the attack succeeding. Alternative methods to mistrain the branch predictor are also possible that avoid brute forcing the collisions in the PHT. A reliable attack has been demonstrated, for example, using the following crafted program: // r0 = pointer to a [control] map array entry // r7 = *(u64 *)(r0 + 0), training/attack phase // r8 = *(u64 *)(r0 + 8), oob address // [...] 
// r0 = pointer to a [data] map array entry 1: if r7 == 0x3 goto line 3 2: r8 = r0 // crafted sequence of conditional jumps to separate the conditional // branch in line 193 from the current execution flow 3: if r0 != 0x0 goto line 5 4: if r0 == 0x0 goto exit 5: if r0 != 0x0 goto line 7 6: if r0 == 0x0 goto exit [...] 187: if r0 != 0x0 goto line 189 188: if r0 == 0x0 goto exit // load any slowly-loaded value (due to cache miss in phase 3) ... 189: r3 = *(u64 *)(r0 + 0x1200) // ... and turn it into known zero for verifier, while preserving slowly- // loaded dependency when executing: 190: r3 &= 1 191: r3 &= 2 // speculatively bypassed phase dependency 192: r7 += r3 193: if r7 == 0x3 goto exit 194: r4 = *(u8 *)(r8 + 0) // leak r4 As can be seen, in training phase (phase != 0x3), the condition in line 1 turns into false and therefore r8 with the oob address is overridden with the valid map value address, which in line 194 we can read out without issues. However, in attack phase, line 2 is skipped, and due to the cache miss in line 189 where the map value is (zeroed and later) added to the phase register, the condition in line 193 takes the fall-through path due to prior branch predictor training, where under speculation, it'll load the byte at oob address r8 (unknown scalar type at that point) which could then be leaked via side-channel. One way to mitigate these is to 'branch off' an unreachable path, meaning, the current verification path keeps following the is_branch_taken() path and we push the other branch to the verification stack. Given this is unreachable from the non-speculative domain, this branch's vstate is explicitly marked as speculative. This is needed for two reasons: i) if this path is solely seen from speculative execution, then we later on still want the dead code elimination to kick in in order to sanitize these instructions with jmp-1s, and ii) to ensure that paths walked in the non-speculative domain are not pruned from earlier walks of paths walked in the speculative domain. Additionally, for robustness, we mark the registers which have been part of the conditional as unknown in the speculative path given there should be no assumptions made on their content. The fix in here mitigates type confusion attacks described earlier due to i) all code paths in the BPF program being explored and ii) existing verifier logic already ensuring that given memory access instruction references one specific data structure. An alternative to this fix that has also been looked at in this scope was to mark aux->alu_state at the jump instruction with a BPF_JMP_TAKEN state as well as direction encoding (always-goto, always-fallthrough, unknown), such that mixing of different always-* directions themselves as well as mixing of always-* with unknown directions would cause a program rejection by the verifier, e.g. programs with constructs like 'if ([...]) { x = 0; } else { x = 1; }' with subsequent 'if (x == 1) { [...] }'. For unprivileged, this would result in only single direction always-* taken paths, and unknown taken paths being allowed, such that the former could be patched from a conditional jump to an unconditional jump (ja). Compared to this approach here, it would have two downsides: i) valid programs that otherwise are not performing any pointer arithmetic, etc, would potentially be rejected/broken, and ii) we are required to turn off path pruning for unprivileged, where both can be avoided in this work through pushing the invalid branch to the verification stack. 
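For reference, the first crafted sequence above expressed with the kernel's insn macros; as in the description, r0 is assumed to hold a map value pointer, r6 a pointer to a readable stack slot and r9 an attacker-controlled scalar:

  BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), /* 1: r0 = *(u64 *)(r0) */
  BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),        /* 2: if r0 != 0x0 goto 4 */
  BPF_MOV64_REG(BPF_REG_6, BPF_REG_9),          /* 3: r6 = r9 */
  BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 1),        /* 4: if r0 != 0x1 goto 6 */
  BPF_LDX_MEM(BPF_B, BPF_REG_9, BPF_REG_6, 0),  /* 5: r9 = *(u8 *)(r6) */

With the fix, under !env->bypass_spec_v1 the verifier also pushes the speculative fall-through of the branches in lines 2 and 4; on that path r6 holds the unknown scalar from r9, so the load in line 5 no longer verifies and the program is rejected instead of being considered safe.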
The issue was originally discovered by Adam and Ofek, and later independently discovered and reported as a result of Benedict and Piotr's research work. Fixes: b2157399cc98 ("bpf: prevent out-of-bounds speculation") Reported-by: Adam Morrison Reported-by: Ofek Kirzner Reported-by: Benedict Schlueter Reported-by: Piotr Krysiuk Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Reviewed-by: Benedict Schlueter Reviewed-by: Piotr Krysiuk Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 44 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index af88d9b9c014..c6a27574242d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6483,6 +6483,27 @@ struct bpf_sanitize_info { bool mask_to_left; }; +static struct bpf_verifier_state * +sanitize_speculative_path(struct bpf_verifier_env *env, + const struct bpf_insn *insn, + u32 next_idx, u32 curr_idx) +{ + struct bpf_verifier_state *branch; + struct bpf_reg_state *regs; + + branch = push_stack(env, next_idx, curr_idx, true); + if (branch && insn) { + regs = branch->frame[branch->curframe]->regs; + if (BPF_SRC(insn->code) == BPF_K) { + mark_reg_unknown(env, regs, insn->dst_reg); + } else if (BPF_SRC(insn->code) == BPF_X) { + mark_reg_unknown(env, regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->src_reg); + } + } + return branch; +} + static int sanitize_ptr_alu(struct bpf_verifier_env *env, struct bpf_insn *insn, const struct bpf_reg_state *ptr_reg, @@ -6566,7 +6587,8 @@ do_sim: tmp = *dst_reg; *dst_reg = *ptr_reg; } - ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true); + ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1, + env->insn_idx); if (!ptr_is_dst_reg && ret) *dst_reg = tmp; return !ret ? REASON_STACK : 0; @@ -8763,14 +8785,28 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (err) return err; } + if (pred == 1) { - /* only follow the goto, ignore fall-through */ + /* Only follow the goto, ignore fall-through. If needed, push + * the fall-through branch for simulation under speculative + * execution. + */ + if (!env->bypass_spec_v1 && + !sanitize_speculative_path(env, insn, *insn_idx + 1, + *insn_idx)) + return -EFAULT; *insn_idx += insn->off; return 0; } else if (pred == 0) { - /* only follow fall-through branch, since - * that's where the program will go + /* Only follow the fall-through branch, since that's where the + * program will go. If needed, push the goto branch for + * simulation under speculative execution. */ + if (!env->bypass_spec_v1 && + !sanitize_speculative_path(env, insn, + *insn_idx + insn->off + 1, + *insn_idx)) + return -EFAULT; return 0; } -- cgit
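For completeness on the 'jmp-1 rewrite' referenced by the patches above: dead code that was never marked as seen is later replaced by the verifier with a self-referencing jump. A rough, simplified sketch of that pre-existing mechanism (not part of the patches in this series):

  static void sanitize_dead_code(struct bpf_verifier_env *env)
  {
          struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
          struct bpf_insn trap = BPF_JMP_IMM(BPF_JA, 0, 0, -1);
          struct bpf_insn *insn = env->prog->insnsi;
          const int insn_cnt = env->prog->len;
          int i;

          for (i = 0; i < insn_cnt; i++) {
                  if (aux_data[i].seen)
                          continue;
                  /* unreachable in the non-speculative domain: turn into 'ja -1' */
                  memcpy(insn + i, &trap, sizeof(trap));
          }
  }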