summaryrefslogtreecommitdiff
path: root/kernel/bpf
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/bpf')
-rw-r--r--kernel/bpf/Kconfig5
-rw-r--r--kernel/bpf/Makefile21
-rw-r--r--kernel/bpf/arena.c593
-rw-r--r--kernel/bpf/arraymap.c105
-rw-r--r--kernel/bpf/bloom_filter.c13
-rw-r--r--kernel/bpf/bpf_cgrp_storage.c21
-rw-r--r--kernel/bpf/bpf_inode_storage.c42
-rw-r--r--kernel/bpf/bpf_iter.c5
-rw-r--r--kernel/bpf/bpf_local_storage.c175
-rw-r--r--kernel/bpf/bpf_lsm.c93
-rw-r--r--kernel/bpf/bpf_struct_ops.c943
-rw-r--r--kernel/bpf/bpf_struct_ops_types.h12
-rw-r--r--kernel/bpf/bpf_task_storage.c23
-rw-r--r--kernel/bpf/btf.c1495
-rw-r--r--kernel/bpf/btf_iter.c2
-rw-r--r--kernel/bpf/btf_relocate.c2
-rw-r--r--kernel/bpf/cgroup.c36
-rw-r--r--kernel/bpf/core.c188
-rw-r--r--kernel/bpf/cpumap.c50
-rw-r--r--kernel/bpf/cpumask.c7
-rw-r--r--kernel/bpf/crypto.c393
-rw-r--r--kernel/bpf/devmap.c84
-rw-r--r--kernel/bpf/disasm.c28
-rw-r--r--kernel/bpf/dispatcher.c3
-rw-r--r--kernel/bpf/hashtab.c225
-rw-r--r--kernel/bpf/helpers.c844
-rw-r--r--kernel/bpf/inode.c277
-rw-r--r--kernel/bpf/kmem_cache_iter.c238
-rw-r--r--kernel/bpf/local_storage.c4
-rw-r--r--kernel/bpf/log.c99
-rw-r--r--kernel/bpf/lpm_trie.c158
-rw-r--r--kernel/bpf/map_in_map.c42
-rw-r--r--kernel/bpf/map_iter.c4
-rw-r--r--kernel/bpf/memalloc.c46
-rw-r--r--kernel/bpf/range_tree.c270
-rw-r--r--kernel/bpf/range_tree.h21
-rw-r--r--kernel/bpf/relo_core.c2
-rw-r--r--kernel/bpf/reuseport_array.c2
-rw-r--r--kernel/bpf/ringbuf.c49
-rw-r--r--kernel/bpf/stackmap.c140
-rw-r--r--kernel/bpf/syscall.c919
-rw-r--r--kernel/bpf/sysfs_btf.c18
-rw-r--r--kernel/bpf/task_iter.c15
-rw-r--r--kernel/bpf/token.c255
-rw-r--r--kernel/bpf/trampoline.c84
-rw-r--r--kernel/bpf/verifier.c4862
46 files changed, 9942 insertions, 2971 deletions
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index 6a906ff93006..17067dcb4386 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -3,6 +3,7 @@
# BPF interpreter that, for example, classic socket filters depend on.
config BPF
bool
+ select CRYPTO_LIB_SHA1
# Used by archs to tell that they support BPF JIT compiler plus which
# flavour. Only one of the two can be selected for a specific arch since
@@ -27,7 +28,7 @@ config BPF_SYSCALL
bool "Enable bpf() system call"
select BPF
select IRQ_WORK
- select TASKS_RCU if PREEMPTION
+ select NEED_TASKS_RCU
select TASKS_TRACE_RCU
select BINARY_PRINTF
select NET_SOCK_MSG if NET
@@ -42,7 +43,7 @@ config BPF_JIT
bool "Enable BPF Just In Time compiler"
depends on BPF
depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT
- depends on MODULES
+ select EXECMEM
help
BPF programs are normally handled by a BPF interpreter. This option
allows the kernel to generate native code when a program is loaded
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index f526b7573e97..410028633621 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -4,9 +4,9 @@ ifneq ($(CONFIG_BPF_JIT_ALWAYS_ON),y)
# ___bpf_prog_run() needs GCSE disabled on x86; see 3193c0836f203 for details
cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
endif
-CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
+CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy)
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
@@ -15,6 +15,9 @@ obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o
+ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy)
+obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o
+endif
obj-$(CONFIG_BPF_JIT) += dispatcher.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
@@ -41,8 +44,18 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
obj-$(CONFIG_BPF_SYSCALL) += cpumask.o
obj-${CONFIG_BPF_LSM} += bpf_lsm.o
endif
+ifneq ($(CONFIG_CRYPTO),)
+obj-$(CONFIG_BPF_SYSCALL) += crypto.o
+endif
obj-$(CONFIG_BPF_PRELOAD) += preload/
obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
-$(obj)/relo_core.o: $(srctree)/tools/lib/bpf/relo_core.c FORCE
- $(call if_changed_rule,cc_o_c)
+obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o
+obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o
+
+CFLAGS_REMOVE_percpu_freelist.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_queue_stack_maps.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_lpm_trie.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_ringbuf.o = $(CC_FLAGS_FTRACE)
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
new file mode 100644
index 000000000000..095a9554e1de
--- /dev/null
+++ b/kernel/bpf/arena.c
@@ -0,0 +1,593 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/err.h>
+#include "linux/filter.h"
+#include <linux/btf_ids.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include "range_tree.h"
+
+/*
+ * bpf_arena is a sparsely populated shared memory region between bpf program and
+ * user space process.
+ *
+ * For example on x86-64 the values could be:
+ * user_vm_start 7f7d26200000 // picked by mmap()
+ * kern_vm_start ffffc90001e69000 // picked by get_vm_area()
+ * For user space all pointers within the arena are normal 8-byte addresses.
+ * In this example 7f7d26200000 is the address of the first page (pgoff=0).
+ * The bpf program will access it as: kern_vm_start + lower_32bit_of_user_ptr
+ * (u32)7f7d26200000 -> 26200000
+ * hence
+ * ffffc90001e69000 + 26200000 == ffffc90028069000 is "pgoff=0" within 4Gb
+ * kernel memory region.
+ *
+ * BPF JITs generate the following code to access arena:
+ * mov eax, eax // eax has lower 32-bit of user pointer
+ * mov word ptr [rax + r12 + off], bx
+ * where r12 == kern_vm_start and off is s16.
+ * Hence allocate 4Gb + GUARD_SZ/2 on each side.
+ *
+ * Initially kernel vm_area and user vma are not populated.
+ * User space can fault-in any address which will insert the page
+ * into kernel and user vma.
+ * bpf program can allocate a page via bpf_arena_alloc_pages() kfunc
+ * which will insert it into kernel vm_area.
+ * The later fault-in from user space will populate that page into user vma.
+ */
+
+/* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */
+#define GUARD_SZ round_up(1ull << sizeof_field(struct bpf_insn, off) * 8, PAGE_SIZE << 1)
+#define KERN_VM_SZ (SZ_4G + GUARD_SZ)
+
+struct bpf_arena {
+ struct bpf_map map;
+ u64 user_vm_start;
+ u64 user_vm_end;
+ struct vm_struct *kern_vm;
+ struct range_tree rt;
+ struct list_head vma_list;
+ struct mutex lock;
+};
+
+u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
+{
+ return arena ? (u64) (long) arena->kern_vm->addr + GUARD_SZ / 2 : 0;
+}
+
+u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
+{
+ return arena ? arena->user_vm_start : 0;
+}
+
+static long arena_map_peek_elem(struct bpf_map *map, void *value)
+{
+ return -EOPNOTSUPP;
+}
+
+static long arena_map_push_elem(struct bpf_map *map, void *value, u64 flags)
+{
+ return -EOPNOTSUPP;
+}
+
+static long arena_map_pop_elem(struct bpf_map *map, void *value)
+{
+ return -EOPNOTSUPP;
+}
+
+static long arena_map_delete_elem(struct bpf_map *map, void *value)
+{
+ return -EOPNOTSUPP;
+}
+
+static int arena_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ return -EOPNOTSUPP;
+}
+
+static long compute_pgoff(struct bpf_arena *arena, long uaddr)
+{
+ return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT;
+}
+
+static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
+{
+ struct vm_struct *kern_vm;
+ int numa_node = bpf_map_attr_numa_node(attr);
+ struct bpf_arena *arena;
+ u64 vm_range;
+ int err = -ENOMEM;
+
+ if (!bpf_jit_supports_arena())
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (attr->key_size || attr->value_size || attr->max_entries == 0 ||
+ /* BPF_F_MMAPABLE must be set */
+ !(attr->map_flags & BPF_F_MMAPABLE) ||
+ /* No unsupported flags present */
+ (attr->map_flags & ~(BPF_F_SEGV_ON_FAULT | BPF_F_MMAPABLE | BPF_F_NO_USER_CONV)))
+ return ERR_PTR(-EINVAL);
+
+ if (attr->map_extra & ~PAGE_MASK)
+ /* If non-zero the map_extra is an expected user VMA start address */
+ return ERR_PTR(-EINVAL);
+
+ vm_range = (u64)attr->max_entries * PAGE_SIZE;
+ if (vm_range > SZ_4G)
+ return ERR_PTR(-E2BIG);
+
+ if ((attr->map_extra >> 32) != ((attr->map_extra + vm_range - 1) >> 32))
+ /* user vma must not cross 32-bit boundary */
+ return ERR_PTR(-ERANGE);
+
+ kern_vm = get_vm_area(KERN_VM_SZ, VM_SPARSE | VM_USERMAP);
+ if (!kern_vm)
+ return ERR_PTR(-ENOMEM);
+
+ arena = bpf_map_area_alloc(sizeof(*arena), numa_node);
+ if (!arena)
+ goto err;
+
+ arena->kern_vm = kern_vm;
+ arena->user_vm_start = attr->map_extra;
+ if (arena->user_vm_start)
+ arena->user_vm_end = arena->user_vm_start + vm_range;
+
+ INIT_LIST_HEAD(&arena->vma_list);
+ bpf_map_init_from_attr(&arena->map, attr);
+ range_tree_init(&arena->rt);
+ err = range_tree_set(&arena->rt, 0, attr->max_entries);
+ if (err) {
+ bpf_map_area_free(arena);
+ goto err;
+ }
+ mutex_init(&arena->lock);
+
+ return &arena->map;
+err:
+ free_vm_area(kern_vm);
+ return ERR_PTR(err);
+}
+
+static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
+{
+ struct page *page;
+ pte_t pte;
+
+ pte = ptep_get(ptep);
+ if (!pte_present(pte)) /* sanity check */
+ return 0;
+ page = pte_page(pte);
+ /*
+ * We do not update pte here:
+ * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug
+ * 2. TLB flushing is batched or deferred. Even if we clear pte,
+ * the TLB entries can stick around and continue to permit access to
+ * the freed page. So it all relies on 1.
+ */
+ __free_page(page);
+ return 0;
+}
+
+static void arena_map_free(struct bpf_map *map)
+{
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ /*
+ * Check that user vma-s are not around when bpf map is freed.
+ * mmap() holds vm_file which holds bpf_map refcnt.
+ * munmap() must have happened on vma followed by arena_vm_close()
+ * which would clear arena->vma_list.
+ */
+ if (WARN_ON_ONCE(!list_empty(&arena->vma_list)))
+ return;
+
+ /*
+ * free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area().
+ * It unmaps everything from vmalloc area and clears pgtables.
+ * Call apply_to_existing_page_range() first to find populated ptes and
+ * free those pages.
+ */
+ apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
+ KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
+ free_vm_area(arena->kern_vm);
+ range_tree_destroy(&arena->rt);
+ bpf_map_area_free(arena);
+}
+
+static void *arena_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return ERR_PTR(-EINVAL);
+}
+
+static long arena_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ return -EOPNOTSUPP;
+}
+
+static int arena_map_check_btf(const struct bpf_map *map, const struct btf *btf,
+ const struct btf_type *key_type, const struct btf_type *value_type)
+{
+ return 0;
+}
+
+static u64 arena_map_mem_usage(const struct bpf_map *map)
+{
+ return 0;
+}
+
+struct vma_list {
+ struct vm_area_struct *vma;
+ struct list_head head;
+ refcount_t mmap_count;
+};
+
+static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma)
+{
+ struct vma_list *vml;
+
+ vml = kmalloc(sizeof(*vml), GFP_KERNEL);
+ if (!vml)
+ return -ENOMEM;
+ refcount_set(&vml->mmap_count, 1);
+ vma->vm_private_data = vml;
+ vml->vma = vma;
+ list_add(&vml->head, &arena->vma_list);
+ return 0;
+}
+
+static void arena_vm_open(struct vm_area_struct *vma)
+{
+ struct vma_list *vml = vma->vm_private_data;
+
+ refcount_inc(&vml->mmap_count);
+}
+
+static void arena_vm_close(struct vm_area_struct *vma)
+{
+ struct bpf_map *map = vma->vm_file->private_data;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+ struct vma_list *vml = vma->vm_private_data;
+
+ if (!refcount_dec_and_test(&vml->mmap_count))
+ return;
+ guard(mutex)(&arena->lock);
+ /* update link list under lock */
+ list_del(&vml->head);
+ vma->vm_private_data = NULL;
+ kfree(vml);
+}
+
+static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
+{
+ struct bpf_map *map = vmf->vma->vm_file->private_data;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+ struct page *page;
+ long kbase, kaddr;
+ int ret;
+
+ kbase = bpf_arena_get_kern_vm_start(arena);
+ kaddr = kbase + (u32)(vmf->address);
+
+ guard(mutex)(&arena->lock);
+ page = vmalloc_to_page((void *)kaddr);
+ if (page)
+ /* already have a page vmap-ed */
+ goto out;
+
+ if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
+ /* User space requested to segfault when page is not allocated by bpf prog */
+ return VM_FAULT_SIGSEGV;
+
+ ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
+ if (ret)
+ return VM_FAULT_SIGSEGV;
+
+ /* Account into memcg of the process that created bpf_arena */
+ ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page);
+ if (ret) {
+ range_tree_set(&arena->rt, vmf->pgoff, 1);
+ return VM_FAULT_SIGSEGV;
+ }
+
+ ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page);
+ if (ret) {
+ range_tree_set(&arena->rt, vmf->pgoff, 1);
+ __free_page(page);
+ return VM_FAULT_SIGSEGV;
+ }
+out:
+ page_ref_add(page, 1);
+ vmf->page = page;
+ return 0;
+}
+
+static const struct vm_operations_struct arena_vm_ops = {
+ .open = arena_vm_open,
+ .close = arena_vm_close,
+ .fault = arena_vm_fault,
+};
+
+static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ struct bpf_map *map = filp->private_data;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+ long ret;
+
+ if (pgoff)
+ return -EINVAL;
+ if (len > SZ_4G)
+ return -E2BIG;
+
+ /* if user_vm_start was specified at arena creation time */
+ if (arena->user_vm_start) {
+ if (len > arena->user_vm_end - arena->user_vm_start)
+ return -E2BIG;
+ if (len != arena->user_vm_end - arena->user_vm_start)
+ return -EINVAL;
+ if (addr != arena->user_vm_start)
+ return -EINVAL;
+ }
+
+ ret = mm_get_unmapped_area(current->mm, filp, addr, len * 2, 0, flags);
+ if (IS_ERR_VALUE(ret))
+ return ret;
+ if ((ret >> 32) == ((ret + len - 1) >> 32))
+ return ret;
+ if (WARN_ON_ONCE(arena->user_vm_start))
+ /* checks at map creation time should prevent this */
+ return -EFAULT;
+ return round_up(ret, SZ_4G);
+}
+
+static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+{
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ guard(mutex)(&arena->lock);
+ if (arena->user_vm_start && arena->user_vm_start != vma->vm_start)
+ /*
+ * If map_extra was not specified at arena creation time then
+ * 1st user process can do mmap(NULL, ...) to pick user_vm_start
+ * 2nd user process must pass the same addr to mmap(addr, MAP_FIXED..);
+ * or
+ * specify addr in map_extra and
+ * use the same addr later with mmap(addr, MAP_FIXED..);
+ */
+ return -EBUSY;
+
+ if (arena->user_vm_end && arena->user_vm_end != vma->vm_end)
+ /* all user processes must have the same size of mmap-ed region */
+ return -EBUSY;
+
+ /* Earlier checks should prevent this */
+ if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > SZ_4G || vma->vm_pgoff))
+ return -EFAULT;
+
+ if (remember_vma(arena, vma))
+ return -ENOMEM;
+
+ arena->user_vm_start = vma->vm_start;
+ arena->user_vm_end = vma->vm_end;
+ /*
+ * bpf_map_mmap() checks that it's being mmaped as VM_SHARED and
+ * clears VM_MAYEXEC. Set VM_DONTEXPAND as well to avoid
+ * potential change of user_vm_start.
+ */
+ vm_flags_set(vma, VM_DONTEXPAND);
+ vma->vm_ops = &arena_vm_ops;
+ return 0;
+}
+
+static int arena_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off)
+{
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ if ((u64)off > arena->user_vm_end - arena->user_vm_start)
+ return -ERANGE;
+ *imm = (unsigned long)arena->user_vm_start;
+ return 0;
+}
+
+BTF_ID_LIST_SINGLE(bpf_arena_map_btf_ids, struct, bpf_arena)
+const struct bpf_map_ops arena_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc = arena_map_alloc,
+ .map_free = arena_map_free,
+ .map_direct_value_addr = arena_map_direct_value_addr,
+ .map_mmap = arena_map_mmap,
+ .map_get_unmapped_area = arena_get_unmapped_area,
+ .map_get_next_key = arena_map_get_next_key,
+ .map_push_elem = arena_map_push_elem,
+ .map_peek_elem = arena_map_peek_elem,
+ .map_pop_elem = arena_map_pop_elem,
+ .map_lookup_elem = arena_map_lookup_elem,
+ .map_update_elem = arena_map_update_elem,
+ .map_delete_elem = arena_map_delete_elem,
+ .map_check_btf = arena_map_check_btf,
+ .map_mem_usage = arena_map_mem_usage,
+ .map_btf_id = &bpf_arena_map_btf_ids[0],
+};
+
+static u64 clear_lo32(u64 val)
+{
+ return val & ~(u64)~0U;
+}
+
+/*
+ * Allocate pages and vmap them into kernel vmalloc area.
+ * Later the pages will be mmaped into user space vma.
+ */
+static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id)
+{
+ /* user_vm_end/start are fixed before bpf prog runs */
+ long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
+ u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena);
+ struct page **pages;
+ long pgoff = 0;
+ u32 uaddr32;
+ int ret, i;
+
+ if (page_cnt > page_cnt_max)
+ return 0;
+
+ if (uaddr) {
+ if (uaddr & ~PAGE_MASK)
+ return 0;
+ pgoff = compute_pgoff(arena, uaddr);
+ if (pgoff > page_cnt_max - page_cnt)
+ /* requested address will be outside of user VMA */
+ return 0;
+ }
+
+ /* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */
+ pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL);
+ if (!pages)
+ return 0;
+
+ guard(mutex)(&arena->lock);
+
+ if (uaddr) {
+ ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
+ if (ret)
+ goto out_free_pages;
+ ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
+ } else {
+ ret = pgoff = range_tree_find(&arena->rt, page_cnt);
+ if (pgoff >= 0)
+ ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
+ }
+ if (ret)
+ goto out_free_pages;
+
+ ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO,
+ node_id, page_cnt, pages);
+ if (ret)
+ goto out;
+
+ uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE);
+ /* Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1
+ * will not overflow 32-bit. Lower 32-bit need to represent
+ * contiguous user address range.
+ * Map these pages at kern_vm_start base.
+ * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
+ * lower 32-bit and it's ok.
+ */
+ ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32,
+ kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages);
+ if (ret) {
+ for (i = 0; i < page_cnt; i++)
+ __free_page(pages[i]);
+ goto out;
+ }
+ kvfree(pages);
+ return clear_lo32(arena->user_vm_start) + uaddr32;
+out:
+ range_tree_set(&arena->rt, pgoff, page_cnt);
+out_free_pages:
+ kvfree(pages);
+ return 0;
+}
+
+/*
+ * If page is present in vmalloc area, unmap it from vmalloc area,
+ * unmap it from all user space vma-s,
+ * and free it.
+ */
+static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
+{
+ struct vma_list *vml;
+
+ list_for_each_entry(vml, &arena->vma_list, head)
+ zap_page_range_single(vml->vma, uaddr,
+ PAGE_SIZE * page_cnt, NULL);
+}
+
+static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
+{
+ u64 full_uaddr, uaddr_end;
+ long kaddr, pgoff, i;
+ struct page *page;
+
+ /* only aligned lower 32-bit are relevant */
+ uaddr = (u32)uaddr;
+ uaddr &= PAGE_MASK;
+ full_uaddr = clear_lo32(arena->user_vm_start) + uaddr;
+ uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT));
+ if (full_uaddr >= uaddr_end)
+ return;
+
+ page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT;
+
+ guard(mutex)(&arena->lock);
+
+ pgoff = compute_pgoff(arena, uaddr);
+ /* clear range */
+ range_tree_set(&arena->rt, pgoff, page_cnt);
+
+ if (page_cnt > 1)
+ /* bulk zap if multiple pages being freed */
+ zap_pages(arena, full_uaddr, page_cnt);
+
+ kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr;
+ for (i = 0; i < page_cnt; i++, kaddr += PAGE_SIZE, full_uaddr += PAGE_SIZE) {
+ page = vmalloc_to_page((void *)kaddr);
+ if (!page)
+ continue;
+ if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */
+ /* Optimization for the common case of page_cnt==1:
+ * If page wasn't mapped into some user vma there
+ * is no need to call zap_pages which is slow. When
+ * page_cnt is big it's faster to do the batched zap.
+ */
+ zap_pages(arena, full_uaddr, 1);
+ vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE);
+ __free_page(page);
+ }
+}
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt,
+ int node_id, u64 flags)
+{
+ struct bpf_map *map = p__map;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
+ return NULL;
+
+ return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id);
+}
+
+__bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
+{
+ struct bpf_map *map = p__map;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign)
+ return;
+ arena_free_pages(arena, (long)ptr__ign, page_cnt);
+}
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(arena_kfuncs)
+BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE)
+BTF_KFUNCS_END(arena_kfuncs)
+
+static const struct btf_kfunc_id_set common_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &arena_kfuncs,
+};
+
+static int __init kfunc_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
+}
+late_initcall(kfunc_init);
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 0bdbbbeab155..eb28c0f219ee 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -73,6 +73,9 @@ int array_map_alloc_check(union bpf_attr *attr)
/* avoid overflow on round_up(map->value_size) */
if (attr->value_size > INT_MAX)
return -E2BIG;
+ /* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */
+ if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE)
+ return -E2BIG;
return 0;
}
@@ -82,7 +85,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
int numa_node = bpf_map_attr_numa_node(attr);
u32 elem_size, index_mask, max_entries;
- bool bypass_spec_v1 = bpf_bypass_spec_v1();
+ bool bypass_spec_v1 = bpf_bypass_spec_v1(NULL);
u64 array_size, mask64;
struct bpf_array *array;
@@ -246,6 +249,38 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
return this_cpu_ptr(array->pptrs[index & array->index_mask]);
}
+/* emit BPF instructions equivalent to C code of percpu_array_map_lookup_elem() */
+static int percpu_array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_insn *insn = insn_buf;
+
+ if (!bpf_jit_supports_percpu_insn())
+ return -EOPNOTSUPP;
+
+ if (map->map_flags & BPF_F_INNER_MAP)
+ return -EOPNOTSUPP;
+
+ BUILD_BUG_ON(offsetof(struct bpf_array, map) != 0);
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct bpf_array, pptrs));
+
+ *insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 0);
+ if (!map->bypass_spec_v1) {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 6);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_0, array->index_mask);
+ } else {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 5);
+ }
+
+ *insn++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
+ *insn++ = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1);
+ *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
+ *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
+ *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *insn++ = BPF_MOV64_IMM(BPF_REG_0, 0);
+ return insn - insn_buf;
+}
+
static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
@@ -396,17 +431,22 @@ static void *array_map_vmalloc_addr(struct bpf_array *array)
return (void *)round_down((unsigned long)array, PAGE_SIZE);
}
-static void array_map_free_timers(struct bpf_map *map)
+static void array_map_free_timers_wq(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
- /* We don't reset or free fields other than timer on uref dropping to zero. */
- if (!btf_record_has_field(map->record, BPF_TIMER))
- return;
-
- for (i = 0; i < array->map.max_entries; i++)
- bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i));
+ /* We don't reset or free fields other than timer and workqueue
+ * on uref dropping to zero.
+ */
+ if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE)) {
+ for (i = 0; i < array->map.max_entries; i++) {
+ if (btf_record_has_field(map->record, BPF_TIMER))
+ bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i));
+ if (btf_record_has_field(map->record, BPF_WORKQUEUE))
+ bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i));
+ }
+ }
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
@@ -457,7 +497,7 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key,
if (map->btf_key_type_id)
seq_printf(m, "%u: ", *(u32 *)key);
btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
rcu_read_unlock();
}
@@ -478,7 +518,7 @@ static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key,
seq_printf(m, "\tcpu%d: ", cpu);
btf_type_seq_show(map->btf, map->btf_value_type_id,
per_cpu_ptr(pptr, cpu), m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
}
seq_puts(m, "}\n");
@@ -563,7 +603,7 @@ static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos)
array = container_of(map, struct bpf_array, map);
index = info->index & array->index_mask;
if (info->percpu_value_buf)
- return array->pptrs[index];
+ return (void *)(uintptr_t)array->pptrs[index];
return array_map_elem_ptr(array, index);
}
@@ -582,7 +622,7 @@ static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
array = container_of(map, struct bpf_array, map);
index = info->index & array->index_mask;
if (info->percpu_value_buf)
- return array->pptrs[index];
+ return (void *)(uintptr_t)array->pptrs[index];
return array_map_elem_ptr(array, index);
}
@@ -595,7 +635,7 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
struct bpf_iter_meta meta;
struct bpf_prog *prog;
int off = 0, cpu = 0;
- void __percpu **pptr;
+ void __percpu *pptr;
u32 size;
meta.seq = seq;
@@ -611,7 +651,7 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
if (!info->percpu_value_buf) {
ctx.value = v;
} else {
- pptr = v;
+ pptr = (void __percpu *)(uintptr_t)v;
size = array->elem_size;
for_each_possible_cpu(cpu) {
copy_map_value_long(map, info->percpu_value_buf + off,
@@ -695,13 +735,13 @@ static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback
u64 ret = 0;
void *val;
+ cant_migrate();
+
if (flags != 0)
return -EINVAL;
is_percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
array = container_of(map, struct bpf_array, map);
- if (is_percpu)
- migrate_disable();
for (i = 0; i < map->max_entries; i++) {
if (is_percpu)
val = this_cpu_ptr(array->pptrs[i]);
@@ -716,8 +756,6 @@ static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback
break;
}
- if (is_percpu)
- migrate_enable();
return num_elems;
}
@@ -750,7 +788,7 @@ const struct bpf_map_ops array_map_ops = {
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
- .map_release_uref = array_map_free_timers,
+ .map_release_uref = array_map_free_timers_wq,
.map_lookup_elem = array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
@@ -776,6 +814,7 @@ const struct bpf_map_ops percpu_array_map_ops = {
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = percpu_array_map_lookup_elem,
+ .map_gen_lookup = percpu_array_map_gen_lookup,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
.map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem,
@@ -906,22 +945,44 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map,
struct file *map_file, int fd)
{
struct bpf_prog *prog = bpf_prog_get(fd);
+ bool is_extended;
if (IS_ERR(prog))
return prog;
- if (!bpf_prog_map_compatible(map, prog)) {
+ if (prog->type == BPF_PROG_TYPE_EXT ||
+ !bpf_prog_map_compatible(map, prog)) {
bpf_prog_put(prog);
return ERR_PTR(-EINVAL);
}
+ mutex_lock(&prog->aux->ext_mutex);
+ is_extended = prog->aux->is_extended;
+ if (!is_extended)
+ prog->aux->prog_array_member_cnt++;
+ mutex_unlock(&prog->aux->ext_mutex);
+ if (is_extended) {
+ /* Extended prog can not be tail callee. It's to prevent a
+ * potential infinite loop like:
+ * tail callee prog entry -> tail callee prog subprog ->
+ * freplace prog entry --tailcall-> tail callee prog entry.
+ */
+ bpf_prog_put(prog);
+ return ERR_PTR(-EBUSY);
+ }
+
return prog;
}
static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
+ struct bpf_prog *prog = ptr;
+
+ mutex_lock(&prog->aux->ext_mutex);
+ prog->aux->prog_array_member_cnt--;
+ mutex_unlock(&prog->aux->ext_mutex);
/* bpf_prog is freed after one RCU or tasks trace grace period */
- bpf_prog_put(ptr);
+ bpf_prog_put(prog);
}
static u32 prog_fd_array_sys_lookup_elem(void *ptr)
@@ -955,7 +1016,7 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
prog_id = prog_fd_array_sys_lookup_elem(ptr);
btf_type_seq_show(map->btf, map->btf_value_type_id,
&prog_id, m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
}
}
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
index addf3dd57b59..35e1ddca74d2 100644
--- a/kernel/bpf/bloom_filter.c
+++ b/kernel/bpf/bloom_filter.c
@@ -80,6 +80,18 @@ static int bloom_map_get_next_key(struct bpf_map *map, void *key, void *next_key
return -EOPNOTSUPP;
}
+/* Called from syscall */
+static int bloom_map_alloc_check(union bpf_attr *attr)
+{
+ if (attr->value_size > KMALLOC_MAX_SIZE)
+ /* if value_size is bigger, the user space won't be able to
+ * access the elements.
+ */
+ return -E2BIG;
+
+ return 0;
+}
+
static struct bpf_map *bloom_map_alloc(union bpf_attr *attr)
{
u32 bitset_bytes, bitset_mask, nr_hash_funcs, nr_bits;
@@ -191,6 +203,7 @@ static u64 bloom_map_mem_usage(const struct bpf_map *map)
BTF_ID_LIST_SINGLE(bpf_bloom_map_btf_ids, struct, bpf_bloom_filter)
const struct bpf_map_ops bloom_filter_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = bloom_map_alloc_check,
.map_alloc = bloom_map_alloc,
.map_free = bloom_map_free,
.map_get_next_key = bloom_map_get_next_key,
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index 28efd0a3f220..54ff2a85d4c0 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -15,22 +15,20 @@ static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy);
static void bpf_cgrp_storage_lock(void)
{
- migrate_disable();
+ cant_migrate();
this_cpu_inc(bpf_cgrp_storage_busy);
}
static void bpf_cgrp_storage_unlock(void)
{
this_cpu_dec(bpf_cgrp_storage_busy);
- migrate_enable();
}
static bool bpf_cgrp_storage_trylock(void)
{
- migrate_disable();
+ cant_migrate();
if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) {
this_cpu_dec(bpf_cgrp_storage_busy);
- migrate_enable();
return false;
}
return true;
@@ -47,17 +45,18 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup)
{
struct bpf_local_storage *local_storage;
+ migrate_disable();
rcu_read_lock();
local_storage = rcu_dereference(cgroup->bpf_cgrp_storage);
- if (!local_storage) {
- rcu_read_unlock();
- return;
- }
+ if (!local_storage)
+ goto out;
bpf_cgrp_storage_lock();
bpf_local_storage_destroy(local_storage);
bpf_cgrp_storage_unlock();
+out:
rcu_read_unlock();
+ migrate_enable();
}
static struct bpf_local_storage_data *
@@ -107,7 +106,7 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
bpf_cgrp_storage_lock();
sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
- value, map_flags, GFP_ATOMIC);
+ value, map_flags, false, GFP_ATOMIC);
bpf_cgrp_storage_unlock();
cgroup_put(cgroup);
return PTR_ERR_OR_ZERO(sdata);
@@ -154,7 +153,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
static void cgroup_storage_map_free(struct bpf_map *map)
{
- bpf_local_storage_map_free(map, &cgroup_cache, NULL);
+ bpf_local_storage_map_free(map, &cgroup_cache, &bpf_cgrp_storage_busy);
}
/* *gfp_flags* is a hidden argument provided by the verifier */
@@ -181,7 +180,7 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
if (!percpu_ref_is_dying(&cgroup->self.refcnt) &&
(flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
- value, BPF_NOEXIST, gfp_flags);
+ value, BPF_NOEXIST, false, gfp_flags);
unlock:
bpf_cgrp_storage_unlock();
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index b0ef45db207c..15a3eb9b02d9 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -16,7 +16,6 @@
#include <uapi/linux/btf.h>
#include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
-#include <linux/fdtable.h>
#include <linux/rcupdate_trace.h>
DEFINE_BPF_STORAGE_CACHE(inode_cache);
@@ -63,28 +62,28 @@ void bpf_inode_storage_free(struct inode *inode)
if (!bsb)
return;
+ migrate_disable();
rcu_read_lock();
local_storage = rcu_dereference(bsb->storage);
- if (!local_storage) {
- rcu_read_unlock();
- return;
- }
+ if (!local_storage)
+ goto out;
bpf_local_storage_destroy(local_storage);
+out:
rcu_read_unlock();
+ migrate_enable();
}
static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_local_storage_data *sdata;
- struct fd f = fdget_raw(*(int *)key);
+ CLASS(fd_raw, f)(*(int *)key);
- if (!f.file)
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
- sdata = inode_storage_lookup(file_inode(f.file), map, true);
- fdput(f);
+ sdata = inode_storage_lookup(file_inode(fd_file(f)), map, true);
return sdata ? sdata->data : NULL;
}
@@ -92,19 +91,16 @@ static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags)
{
struct bpf_local_storage_data *sdata;
- struct fd f = fdget_raw(*(int *)key);
+ CLASS(fd_raw, f)(*(int *)key);
- if (!f.file)
+ if (fd_empty(f))
return -EBADF;
- if (!inode_storage_ptr(file_inode(f.file))) {
- fdput(f);
+ if (!inode_storage_ptr(file_inode(fd_file(f))))
return -EBADF;
- }
- sdata = bpf_local_storage_update(file_inode(f.file),
+ sdata = bpf_local_storage_update(file_inode(fd_file(f)),
(struct bpf_local_storage_map *)map,
- value, map_flags, GFP_ATOMIC);
- fdput(f);
+ value, map_flags, false, GFP_ATOMIC);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -123,15 +119,11 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
{
- struct fd f = fdget_raw(*(int *)key);
- int err;
+ CLASS(fd_raw, f)(*(int *)key);
- if (!f.file)
+ if (fd_empty(f))
return -EBADF;
-
- err = inode_storage_delete(file_inode(f.file), map);
- fdput(f);
- return err;
+ return inode_storage_delete(file_inode(fd_file(f)), map);
}
/* *gfp_flags* is a hidden argument provided by the verifier */
@@ -162,7 +154,7 @@ BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
sdata = bpf_local_storage_update(
inode, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST, gfp_flags);
+ BPF_NOEXIST, false, gfp_flags);
return IS_ERR(sdata) ? (unsigned long)NULL :
(unsigned long)sdata->data;
}
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 0fae79164187..106735145948 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -283,7 +283,6 @@ static int iter_release(struct inode *inode, struct file *file)
const struct file_operations bpf_iter_fops = {
.open = iter_open,
- .llseek = no_llseek,
.read = bpf_seq_read,
.release = iter_release,
};
@@ -548,7 +547,7 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
return -ENOENT;
/* Only allow sleepable program for resched-able iterator */
- if (prog->aux->sleepable && !bpf_iter_target_support_resched(tinfo))
+ if (prog->sleepable && !bpf_iter_target_support_resched(tinfo))
return -EINVAL;
link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
@@ -697,7 +696,7 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
struct bpf_run_ctx run_ctx, *old_run_ctx;
int ret;
- if (prog->aux->sleepable) {
+ if (prog->sleepable) {
rcu_read_lock_trace();
migrate_disable();
might_fault();
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 146824cc9689..fa56c30833ff 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -73,7 +73,7 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
- void *value, bool charge_mem, gfp_t gfp_flags)
+ void *value, bool charge_mem, bool swap_uptrs, gfp_t gfp_flags)
{
struct bpf_local_storage_elem *selem;
@@ -81,9 +81,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
return NULL;
if (smap->bpf_ma) {
- migrate_disable();
selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags);
- migrate_enable();
if (selem)
/* Keep the original bpf_map_kzalloc behavior
* before started using the bpf_mem_cache_alloc.
@@ -99,9 +97,12 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
}
if (selem) {
- if (value)
+ if (value) {
+ /* No need to call check_and_init_map_value as memory is zero init */
copy_map_value(&smap->map, SDATA(selem)->data, value);
- /* No need to call check_and_init_map_value as memory is zero init */
+ if (swap_uptrs)
+ bpf_obj_swap_uptrs(smap->map.record, SDATA(selem)->data, value);
+ }
return selem;
}
@@ -171,17 +172,14 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
return;
}
- if (smap) {
- migrate_disable();
+ if (smap)
bpf_mem_cache_free(&smap->storage_ma, local_storage);
- migrate_enable();
- } else {
+ else
/* smap could be NULL if the selem that triggered
* this 'local_storage' creation had been long gone.
* In this case, directly do call_rcu().
*/
call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
- }
}
/* rcu tasks trace callback for bpf_ma == false */
@@ -209,8 +207,15 @@ static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
static void bpf_selem_free_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_map *smap;
selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+ /* The bpf_local_storage_map_free will wait for rcu_barrier */
+ smap = rcu_dereference_check(SDATA(selem)->smap, 1);
+
+ migrate_disable();
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+ migrate_enable();
bpf_mem_cache_raw_free(selem);
}
@@ -226,23 +231,50 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
struct bpf_local_storage_map *smap,
bool reuse_now)
{
- bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
-
if (!smap->bpf_ma) {
+ /* Only task storage has uptrs and task storage
+ * has moved to bpf_mem_alloc. Meaning smap->bpf_ma == true
+ * for task storage, so this bpf_obj_free_fields() won't unpin
+ * any uptr.
+ */
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
__bpf_selem_free(selem, reuse_now);
return;
}
- if (!reuse_now) {
- call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
- } else {
+ if (reuse_now) {
+ /* reuse_now == true only happens when the storage owner
+ * (e.g. task_struct) is being destructed or the map itself
+ * is being destructed (ie map_free). In both cases,
+ * no bpf prog can have a hold on the selem. It is
+ * safe to unpin the uptrs and free the selem now.
+ */
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
/* Instead of using the vanilla call_rcu(),
* bpf_mem_cache_free will be able to reuse selem
* immediately.
*/
- migrate_disable();
bpf_mem_cache_free(&smap->selem_ma, selem);
- migrate_enable();
+ return;
+ }
+
+ call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
+}
+
+static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
+{
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_map *smap;
+ struct hlist_node *n;
+
+ /* The "_safe" iteration is needed.
+ * The loop is not removing the selem from the list
+ * but bpf_selem_free will use the selem->rcu_head
+ * which is union-ized with the selem->free_node.
+ */
+ hlist_for_each_entry_safe(selem, n, list, free_node) {
+ smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+ bpf_selem_free(selem, smap, reuse_now);
}
}
@@ -252,7 +284,7 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
*/
static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
struct bpf_local_storage_elem *selem,
- bool uncharge_mem, bool reuse_now)
+ bool uncharge_mem, struct hlist_head *free_selem_list)
{
struct bpf_local_storage_map *smap;
bool free_local_storage;
@@ -296,7 +328,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
SDATA(selem))
RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
- bpf_selem_free(selem, smap, reuse_now);
+ hlist_add_head(&selem->free_node, free_selem_list);
if (rcu_access_pointer(local_storage->smap) == smap)
RCU_INIT_POINTER(local_storage->smap, NULL);
@@ -318,7 +350,7 @@ static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage,
*
* If the local_storage->list is already empty, the caller will not
* care about the bpf_ma value also because the caller is not
- * responsibile to free the local_storage.
+ * responsible to free the local_storage.
*/
if (storage_smap)
@@ -345,6 +377,7 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
struct bpf_local_storage_map *storage_smap;
struct bpf_local_storage *local_storage;
bool bpf_ma, free_local_storage = false;
+ HLIST_HEAD(selem_free_list);
unsigned long flags;
if (unlikely(!selem_linked_to_storage_lockless(selem)))
@@ -360,9 +393,11 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
raw_spin_lock_irqsave(&local_storage->lock, flags);
if (likely(selem_linked_to_storage(selem)))
free_local_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, true, reuse_now);
+ local_storage, selem, true, &selem_free_list);
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ bpf_selem_free_list(&selem_free_list, reuse_now);
+
if (free_local_storage)
bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now);
}
@@ -414,47 +449,21 @@ void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now)
bpf_selem_unlink_storage(selem, reuse_now);
}
-/* If cacheit_lockit is false, this lookup function is lockless */
-struct bpf_local_storage_data *
-bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
- struct bpf_local_storage_map *smap,
- bool cacheit_lockit)
+void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_elem *selem)
{
- struct bpf_local_storage_data *sdata;
- struct bpf_local_storage_elem *selem;
-
- /* Fast path (cache hit) */
- sdata = rcu_dereference_check(local_storage->cache[smap->cache_idx],
- bpf_rcu_lock_held());
- if (sdata && rcu_access_pointer(sdata->smap) == smap)
- return sdata;
-
- /* Slow path (cache miss) */
- hlist_for_each_entry_rcu(selem, &local_storage->list, snode,
- rcu_read_lock_trace_held())
- if (rcu_access_pointer(SDATA(selem)->smap) == smap)
- break;
-
- if (!selem)
- return NULL;
-
- sdata = SDATA(selem);
- if (cacheit_lockit) {
- unsigned long flags;
-
- /* spinlock is needed to avoid racing with the
- * parallel delete. Otherwise, publishing an already
- * deleted sdata to the cache will become a use-after-free
- * problem in the next bpf_local_storage_lookup().
- */
- raw_spin_lock_irqsave(&local_storage->lock, flags);
- if (selem_linked_to_storage(selem))
- rcu_assign_pointer(local_storage->cache[smap->cache_idx],
- sdata);
- raw_spin_unlock_irqrestore(&local_storage->lock, flags);
- }
+ unsigned long flags;
- return sdata;
+ /* spinlock is needed to avoid racing with the
+ * parallel delete. Otherwise, publishing an already
+ * deleted sdata to the cache will become a use-after-free
+ * problem in the next bpf_local_storage_lookup().
+ */
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
+ if (selem_linked_to_storage(selem))
+ rcu_assign_pointer(local_storage->cache[smap->cache_idx], SDATA(selem));
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
}
static int check_flags(const struct bpf_local_storage_data *old_sdata,
@@ -484,15 +493,11 @@ int bpf_local_storage_alloc(void *owner,
if (err)
return err;
- if (smap->bpf_ma) {
- migrate_disable();
+ if (smap->bpf_ma)
storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags);
- migrate_enable();
- } else {
+ else
storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
gfp_flags | __GFP_NOWARN);
- }
-
if (!storage) {
err = -ENOMEM;
goto uncharge;
@@ -550,11 +555,12 @@ uncharge:
*/
struct bpf_local_storage_data *
bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
- void *value, u64 map_flags, gfp_t gfp_flags)
+ void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags)
{
struct bpf_local_storage_data *old_sdata = NULL;
struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
struct bpf_local_storage *local_storage;
+ HLIST_HEAD(old_selem_free_list);
unsigned long flags;
int err;
@@ -576,7 +582,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (err)
return ERR_PTR(err);
- selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
+ selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags);
if (!selem)
return ERR_PTR(-ENOMEM);
@@ -610,7 +616,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
/* A lookup has just been done before and concluded a new selem is
* needed. The chance of an unnecessary alloc is unlikely.
*/
- alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
+ alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags);
if (!alloc_selem)
return ERR_PTR(-ENOMEM);
@@ -650,11 +656,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (old_sdata) {
bpf_selem_unlink_map(SELEM(old_sdata));
bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
- true, false);
+ true, &old_selem_free_list);
}
unlock:
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ bpf_selem_free_list(&old_selem_free_list, false);
if (alloc_selem) {
mem_uncharge(smap, owner, smap->elem_size);
bpf_selem_free(alloc_selem, smap, true);
@@ -732,6 +739,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
struct bpf_local_storage_map *storage_smap;
struct bpf_local_storage_elem *selem;
bool bpf_ma, free_storage = false;
+ HLIST_HEAD(free_selem_list);
struct hlist_node *n;
unsigned long flags;
@@ -760,10 +768,12 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
* of the loop will set the free_cgroup_storage to true.
*/
free_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, true, true);
+ local_storage, selem, true, &free_selem_list);
}
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ bpf_selem_free_list(&free_selem_list, true);
+
if (free_storage)
bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true);
}
@@ -808,8 +818,8 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
nbuckets = max_t(u32, 2, nbuckets);
smap->bucket_log = ilog2(nbuckets);
- smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
- nbuckets, GFP_USER | __GFP_NOWARN);
+ smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets,
+ sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN);
if (!smap->buckets) {
err = -ENOMEM;
goto free_smap;
@@ -823,8 +833,12 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
smap->elem_size = offsetof(struct bpf_local_storage_elem,
sdata.data[attr->value_size]);
- smap->bpf_ma = bpf_ma;
- if (bpf_ma) {
+ /* In PREEMPT_RT, kmalloc(GFP_ATOMIC) is still not safe in non
+ * preemptible context. Thus, enforce all storages to use
+ * bpf_mem_alloc when CONFIG_PREEMPT_RT is enabled.
+ */
+ smap->bpf_ma = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : bpf_ma;
+ if (smap->bpf_ma) {
err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false);
if (err)
goto free_smap;
@@ -880,15 +894,11 @@ void bpf_local_storage_map_free(struct bpf_map *map,
while ((selem = hlist_entry_safe(
rcu_dereference_raw(hlist_first_rcu(&b->list)),
struct bpf_local_storage_elem, map_node))) {
- if (busy_counter) {
- migrate_disable();
+ if (busy_counter)
this_cpu_inc(*busy_counter);
- }
bpf_selem_unlink(selem, true);
- if (busy_counter) {
+ if (busy_counter)
this_cpu_dec(*busy_counter);
- migrate_enable();
- }
cond_resched_rcu();
}
rcu_read_unlock();
@@ -909,6 +919,9 @@ void bpf_local_storage_map_free(struct bpf_map *map,
synchronize_rcu();
if (smap->bpf_ma) {
+ rcu_barrier_tasks_trace();
+ if (!rcu_trace_implies_rcu_gp())
+ rcu_barrier();
bpf_mem_alloc_destroy(&smap->selem_ma);
bpf_mem_alloc_destroy(&smap->storage_ma);
}
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index e8e910395bf6..967492b65185 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -11,7 +11,6 @@
#include <linux/lsm_hooks.h>
#include <linux/bpf_lsm.h>
#include <linux/kallsyms.h>
-#include <linux/bpf_verifier.h>
#include <net/bpf_sk_storage.h>
#include <linux/bpf_local_storage.h>
#include <linux/btf_ids.h>
@@ -36,6 +35,24 @@ BTF_SET_START(bpf_lsm_hooks)
#undef LSM_HOOK
BTF_SET_END(bpf_lsm_hooks)
+BTF_SET_START(bpf_lsm_disabled_hooks)
+BTF_ID(func, bpf_lsm_vm_enough_memory)
+BTF_ID(func, bpf_lsm_inode_need_killpriv)
+BTF_ID(func, bpf_lsm_inode_getsecurity)
+BTF_ID(func, bpf_lsm_inode_listsecurity)
+BTF_ID(func, bpf_lsm_inode_copy_up_xattr)
+BTF_ID(func, bpf_lsm_getselfattr)
+BTF_ID(func, bpf_lsm_getprocattr)
+BTF_ID(func, bpf_lsm_setprocattr)
+#ifdef CONFIG_KEYS
+BTF_ID(func, bpf_lsm_key_getsecurity)
+#endif
+#ifdef CONFIG_AUDIT
+BTF_ID(func, bpf_lsm_audit_rule_match)
+#endif
+BTF_ID(func, bpf_lsm_ismaclabel)
+BTF_SET_END(bpf_lsm_disabled_hooks)
+
/* List of LSM hooks that should operate on 'current' cgroup regardless
* of function signature.
*/
@@ -97,15 +114,24 @@ void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
const struct bpf_prog *prog)
{
+ u32 btf_id = prog->aux->attach_btf_id;
+ const char *func_name = prog->aux->attach_func_name;
+
if (!prog->gpl_compatible) {
bpf_log(vlog,
"LSM programs must have a GPL compatible license\n");
return -EINVAL;
}
- if (!btf_id_set_contains(&bpf_lsm_hooks, prog->aux->attach_btf_id)) {
+ if (btf_id_set_contains(&bpf_lsm_disabled_hooks, btf_id)) {
+ bpf_log(vlog, "attach_btf_id %u points to disabled hook %s\n",
+ btf_id, func_name);
+ return -EINVAL;
+ }
+
+ if (!btf_id_set_contains(&bpf_lsm_hooks, btf_id)) {
bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n",
- prog->aux->attach_btf_id, prog->aux->attach_func_name);
+ btf_id, func_name);
return -EINVAL;
}
@@ -260,9 +286,15 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
BTF_SET_START(sleepable_lsm_hooks)
BTF_ID(func, bpf_lsm_bpf)
BTF_ID(func, bpf_lsm_bpf_map)
-BTF_ID(func, bpf_lsm_bpf_map_alloc_security)
-BTF_ID(func, bpf_lsm_bpf_map_free_security)
+BTF_ID(func, bpf_lsm_bpf_map_create)
+BTF_ID(func, bpf_lsm_bpf_map_free)
BTF_ID(func, bpf_lsm_bpf_prog)
+BTF_ID(func, bpf_lsm_bpf_prog_load)
+BTF_ID(func, bpf_lsm_bpf_prog_free)
+BTF_ID(func, bpf_lsm_bpf_token_create)
+BTF_ID(func, bpf_lsm_bpf_token_free)
+BTF_ID(func, bpf_lsm_bpf_token_cmd)
+BTF_ID(func, bpf_lsm_bpf_token_capable)
BTF_ID(func, bpf_lsm_bprm_check_security)
BTF_ID(func, bpf_lsm_bprm_committed_creds)
BTF_ID(func, bpf_lsm_bprm_committing_creds)
@@ -274,12 +306,9 @@ BTF_ID(func, bpf_lsm_cred_prepare)
BTF_ID(func, bpf_lsm_file_ioctl)
BTF_ID(func, bpf_lsm_file_lock)
BTF_ID(func, bpf_lsm_file_open)
+BTF_ID(func, bpf_lsm_file_post_open)
BTF_ID(func, bpf_lsm_file_receive)
-#ifdef CONFIG_SECURITY_NETWORK
-BTF_ID(func, bpf_lsm_inet_conn_established)
-#endif /* CONFIG_SECURITY_NETWORK */
-
BTF_ID(func, bpf_lsm_inode_create)
BTF_ID(func, bpf_lsm_inode_free_security)
BTF_ID(func, bpf_lsm_inode_getattr)
@@ -310,10 +339,6 @@ BTF_ID(func, bpf_lsm_path_chmod)
BTF_ID(func, bpf_lsm_path_chown)
#endif /* CONFIG_SECURITY_PATH */
-#ifdef CONFIG_KEYS
-BTF_ID(func, bpf_lsm_key_free)
-#endif /* CONFIG_KEYS */
-
BTF_ID(func, bpf_lsm_mmap_file)
BTF_ID(func, bpf_lsm_netlink_send)
BTF_ID(func, bpf_lsm_path_notify)
@@ -330,6 +355,8 @@ BTF_ID(func, bpf_lsm_sb_umount)
BTF_ID(func, bpf_lsm_settime)
#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_inet_conn_established)
+
BTF_ID(func, bpf_lsm_socket_accept)
BTF_ID(func, bpf_lsm_socket_bind)
BTF_ID(func, bpf_lsm_socket_connect)
@@ -348,8 +375,6 @@ BTF_ID(func, bpf_lsm_socket_socketpair)
BTF_ID(func, bpf_lsm_syslog)
BTF_ID(func, bpf_lsm_task_alloc)
-BTF_ID(func, bpf_lsm_current_getsecid_subj)
-BTF_ID(func, bpf_lsm_task_getsecid_obj)
BTF_ID(func, bpf_lsm_task_prctl)
BTF_ID(func, bpf_lsm_task_setscheduler)
BTF_ID(func, bpf_lsm_task_to_inode)
@@ -357,9 +382,8 @@ BTF_ID(func, bpf_lsm_userns_create)
BTF_SET_END(sleepable_lsm_hooks)
BTF_SET_START(untrusted_lsm_hooks)
-BTF_ID(func, bpf_lsm_bpf_map_free_security)
-BTF_ID(func, bpf_lsm_bpf_prog_alloc_security)
-BTF_ID(func, bpf_lsm_bpf_prog_free_security)
+BTF_ID(func, bpf_lsm_bpf_map_free)
+BTF_ID(func, bpf_lsm_bpf_prog_free)
BTF_ID(func, bpf_lsm_file_alloc_security)
BTF_ID(func, bpf_lsm_file_free_security)
#ifdef CONFIG_SECURITY_NETWORK
@@ -386,3 +410,36 @@ const struct bpf_verifier_ops lsm_verifier_ops = {
.get_func_proto = bpf_lsm_func_proto,
.is_valid_access = btf_ctx_access,
};
+
+/* hooks return 0 or 1 */
+BTF_SET_START(bool_lsm_hooks)
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+BTF_ID(func, bpf_lsm_xfrm_state_pol_flow_match)
+#endif
+#ifdef CONFIG_AUDIT
+BTF_ID(func, bpf_lsm_audit_rule_known)
+#endif
+BTF_ID(func, bpf_lsm_inode_xattr_skipcap)
+BTF_SET_END(bool_lsm_hooks)
+
+int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
+ struct bpf_retval_range *retval_range)
+{
+ /* no return value range for void hooks */
+ if (!prog->aux->attach_func_proto->type)
+ return -EINVAL;
+
+ if (btf_id_set_contains(&bool_lsm_hooks, prog->aux->attach_btf_id)) {
+ retval_range->minval = 0;
+ retval_range->maxval = 1;
+ } else {
+ /* All other available LSM hooks, except task_prctl, return 0
+ * on success and negative error code on failure.
+ * To keep things simple, we only allow bpf progs to return 0
+ * or negative errno for task_prctl too.
+ */
+ retval_range->minval = -MAX_ERRNO;
+ retval_range->maxval = 0;
+ }
+ return 0;
+}
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 02068bd0e4d9..040fb1cd840b 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -12,27 +12,18 @@
#include <linux/mutex.h>
#include <linux/btf_ids.h>
#include <linux/rcupdate_wait.h>
-
-enum bpf_struct_ops_state {
- BPF_STRUCT_OPS_STATE_INIT,
- BPF_STRUCT_OPS_STATE_INUSE,
- BPF_STRUCT_OPS_STATE_TOBEFREE,
- BPF_STRUCT_OPS_STATE_READY,
-};
-
-#define BPF_STRUCT_OPS_COMMON_VALUE \
- refcount_t refcnt; \
- enum bpf_struct_ops_state state
+#include <linux/poll.h>
struct bpf_struct_ops_value {
- BPF_STRUCT_OPS_COMMON_VALUE;
+ struct bpf_struct_ops_common_value common;
char data[] ____cacheline_aligned_in_smp;
};
+#define MAX_TRAMP_IMAGE_PAGES 8
+
struct bpf_struct_ops_map {
struct bpf_map map;
- struct rcu_head rcu;
- const struct bpf_struct_ops *st_ops;
+ const struct bpf_struct_ops_desc *st_ops_desc;
/* protect map_update */
struct mutex lock;
/* link has all the bpf_links that is populated
@@ -40,12 +31,16 @@ struct bpf_struct_ops_map {
* (in kvalue.data).
*/
struct bpf_link **links;
- /* image is a page that has all the trampolines
+ /* ksyms for bpf trampolines */
+ struct bpf_ksym **ksyms;
+ u32 funcs_cnt;
+ u32 image_pages_cnt;
+ /* image_pages is an array of pages that has all the trampolines
* that stores the func args before calling the bpf_prog.
- * A PAGE_SIZE "image" is enough to store all trampoline for
- * "links[]".
*/
- void *image;
+ void *image_pages[MAX_TRAMP_IMAGE_PAGES];
+ /* The owner moduler's btf. */
+ struct btf *btf;
/* uvalue->data stores the kernel struct
* (e.g. tcp_congestion_ops) that is more useful
* to userspace than the kvalue. For example,
@@ -63,6 +58,7 @@ struct bpf_struct_ops_map {
struct bpf_struct_ops_link {
struct bpf_link link;
struct bpf_map __rcu *map;
+ wait_queue_head_t wait_hup;
};
static DEFINE_MUTEX(update_mutex);
@@ -70,35 +66,6 @@ static DEFINE_MUTEX(update_mutex);
#define VALUE_PREFIX "bpf_struct_ops_"
#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
-/* bpf_struct_ops_##_name (e.g. bpf_struct_ops_tcp_congestion_ops) is
- * the map's value exposed to the userspace and its btf-type-id is
- * stored at the map->btf_vmlinux_value_type_id.
- *
- */
-#define BPF_STRUCT_OPS_TYPE(_name) \
-extern struct bpf_struct_ops bpf_##_name; \
- \
-struct bpf_struct_ops_##_name { \
- BPF_STRUCT_OPS_COMMON_VALUE; \
- struct _name data ____cacheline_aligned_in_smp; \
-};
-#include "bpf_struct_ops_types.h"
-#undef BPF_STRUCT_OPS_TYPE
-
-enum {
-#define BPF_STRUCT_OPS_TYPE(_name) BPF_STRUCT_OPS_TYPE_##_name,
-#include "bpf_struct_ops_types.h"
-#undef BPF_STRUCT_OPS_TYPE
- __NR_BPF_STRUCT_OPS_TYPE,
-};
-
-static struct bpf_struct_ops * const bpf_struct_ops[] = {
-#define BPF_STRUCT_OPS_TYPE(_name) \
- [BPF_STRUCT_OPS_TYPE_##_name] = &bpf_##_name,
-#include "bpf_struct_ops_types.h"
-#undef BPF_STRUCT_OPS_TYPE
-};
-
const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = {
};
@@ -108,138 +75,376 @@ const struct bpf_prog_ops bpf_struct_ops_prog_ops = {
#endif
};
-static const struct btf_type *module_type;
+BTF_ID_LIST(st_ops_ids)
+BTF_ID(struct, module)
+BTF_ID(struct, bpf_struct_ops_common_value)
+
+enum {
+ IDX_MODULE_ID,
+ IDX_ST_OPS_COMMON_VALUE_ID,
+};
+
+extern struct btf *btf_vmlinux;
-void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
+static bool is_valid_value_type(struct btf *btf, s32 value_id,
+ const struct btf_type *type,
+ const char *value_name)
{
- s32 type_id, value_id, module_id;
+ const struct btf_type *common_value_type;
const struct btf_member *member;
- struct bpf_struct_ops *st_ops;
- const struct btf_type *t;
- char value_name[128];
- const char *mname;
- u32 i, j;
+ const struct btf_type *vt, *mt;
- /* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */
-#define BPF_STRUCT_OPS_TYPE(_name) BTF_TYPE_EMIT(struct bpf_struct_ops_##_name);
-#include "bpf_struct_ops_types.h"
-#undef BPF_STRUCT_OPS_TYPE
+ vt = btf_type_by_id(btf, value_id);
+ if (btf_vlen(vt) != 2) {
+ pr_warn("The number of %s's members should be 2, but we get %d\n",
+ value_name, btf_vlen(vt));
+ return false;
+ }
+ member = btf_type_member(vt);
+ mt = btf_type_by_id(btf, member->type);
+ common_value_type = btf_type_by_id(btf_vmlinux,
+ st_ops_ids[IDX_ST_OPS_COMMON_VALUE_ID]);
+ if (mt != common_value_type) {
+ pr_warn("The first member of %s should be bpf_struct_ops_common_value\n",
+ value_name);
+ return false;
+ }
+ member++;
+ mt = btf_type_by_id(btf, member->type);
+ if (mt != type) {
+ pr_warn("The second member of %s should be %s\n",
+ value_name, btf_name_by_offset(btf, type->name_off));
+ return false;
+ }
- module_id = btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT);
- if (module_id < 0) {
- pr_warn("Cannot find struct module in btf_vmlinux\n");
- return;
+ return true;
+}
+
+static void *bpf_struct_ops_image_alloc(void)
+{
+ void *image;
+ int err;
+
+ err = bpf_jit_charge_modmem(PAGE_SIZE);
+ if (err)
+ return ERR_PTR(err);
+ image = arch_alloc_bpf_trampoline(PAGE_SIZE);
+ if (!image) {
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
+ return ERR_PTR(-ENOMEM);
}
- module_type = btf_type_by_id(btf, module_id);
- for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
- st_ops = bpf_struct_ops[i];
+ return image;
+}
- if (strlen(st_ops->name) + VALUE_PREFIX_LEN >=
- sizeof(value_name)) {
- pr_warn("struct_ops name %s is too long\n",
- st_ops->name);
- continue;
- }
- sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name);
+void bpf_struct_ops_image_free(void *image)
+{
+ if (image) {
+ arch_free_bpf_trampoline(image, PAGE_SIZE);
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
+ }
+}
- value_id = btf_find_by_name_kind(btf, value_name,
- BTF_KIND_STRUCT);
- if (value_id < 0) {
- pr_warn("Cannot find struct %s in btf_vmlinux\n",
- value_name);
+#define MAYBE_NULL_SUFFIX "__nullable"
+#define MAX_STUB_NAME 128
+
+/* Return the type info of a stub function, if it exists.
+ *
+ * The name of a stub function is made up of the name of the struct_ops and
+ * the name of the function pointer member, separated by "__". For example,
+ * if the struct_ops type is named "foo_ops" and the function pointer
+ * member is named "bar", the stub function name would be "foo_ops__bar".
+ */
+static const struct btf_type *
+find_stub_func_proto(const struct btf *btf, const char *st_op_name,
+ const char *member_name)
+{
+ char stub_func_name[MAX_STUB_NAME];
+ const struct btf_type *func_type;
+ s32 btf_id;
+ int cp;
+
+ cp = snprintf(stub_func_name, MAX_STUB_NAME, "%s__%s",
+ st_op_name, member_name);
+ if (cp >= MAX_STUB_NAME) {
+ pr_warn("Stub function name too long\n");
+ return NULL;
+ }
+ btf_id = btf_find_by_name_kind(btf, stub_func_name, BTF_KIND_FUNC);
+ if (btf_id < 0)
+ return NULL;
+ func_type = btf_type_by_id(btf, btf_id);
+ if (!func_type)
+ return NULL;
+
+ return btf_type_by_id(btf, func_type->type); /* FUNC_PROTO */
+}
+
+/* Prepare argument info for every nullable argument of a member of a
+ * struct_ops type.
+ *
+ * Initialize a struct bpf_struct_ops_arg_info according to type info of
+ * the arguments of a stub function. (Check kCFI for more information about
+ * stub functions.)
+ *
+ * Each member in the struct_ops type has a struct bpf_struct_ops_arg_info
+ * to provide an array of struct bpf_ctx_arg_aux, which in turn provides
+ * the information that used by the verifier to check the arguments of the
+ * BPF struct_ops program assigned to the member. Here, we only care about
+ * the arguments that are marked as __nullable.
+ *
+ * The array of struct bpf_ctx_arg_aux is eventually assigned to
+ * prog->aux->ctx_arg_info of BPF struct_ops programs and passed to the
+ * verifier. (See check_struct_ops_btf_id())
+ *
+ * arg_info->info will be the list of struct bpf_ctx_arg_aux if success. If
+ * fails, it will be kept untouched.
+ */
+static int prepare_arg_info(struct btf *btf,
+ const char *st_ops_name,
+ const char *member_name,
+ const struct btf_type *func_proto,
+ struct bpf_struct_ops_arg_info *arg_info)
+{
+ const struct btf_type *stub_func_proto, *pointed_type;
+ const struct btf_param *stub_args, *args;
+ struct bpf_ctx_arg_aux *info, *info_buf;
+ u32 nargs, arg_no, info_cnt = 0;
+ u32 arg_btf_id;
+ int offset;
+
+ stub_func_proto = find_stub_func_proto(btf, st_ops_name, member_name);
+ if (!stub_func_proto)
+ return 0;
+
+ /* Check if the number of arguments of the stub function is the same
+ * as the number of arguments of the function pointer.
+ */
+ nargs = btf_type_vlen(func_proto);
+ if (nargs != btf_type_vlen(stub_func_proto)) {
+ pr_warn("the number of arguments of the stub function %s__%s does not match the number of arguments of the member %s of struct %s\n",
+ st_ops_name, member_name, member_name, st_ops_name);
+ return -EINVAL;
+ }
+
+ if (!nargs)
+ return 0;
+
+ args = btf_params(func_proto);
+ stub_args = btf_params(stub_func_proto);
+
+ info_buf = kcalloc(nargs, sizeof(*info_buf), GFP_KERNEL);
+ if (!info_buf)
+ return -ENOMEM;
+
+ /* Prepare info for every nullable argument */
+ info = info_buf;
+ for (arg_no = 0; arg_no < nargs; arg_no++) {
+ /* Skip arguments that is not suffixed with
+ * "__nullable".
+ */
+ if (!btf_param_match_suffix(btf, &stub_args[arg_no],
+ MAYBE_NULL_SUFFIX))
continue;
+
+ /* Should be a pointer to struct */
+ pointed_type = btf_type_resolve_ptr(btf,
+ args[arg_no].type,
+ &arg_btf_id);
+ if (!pointed_type ||
+ !btf_type_is_struct(pointed_type)) {
+ pr_warn("stub function %s__%s has %s tagging to an unsupported type\n",
+ st_ops_name, member_name, MAYBE_NULL_SUFFIX);
+ goto err_out;
}
- type_id = btf_find_by_name_kind(btf, st_ops->name,
- BTF_KIND_STRUCT);
- if (type_id < 0) {
- pr_warn("Cannot find struct %s in btf_vmlinux\n",
- st_ops->name);
- continue;
+ offset = btf_ctx_arg_offset(btf, func_proto, arg_no);
+ if (offset < 0) {
+ pr_warn("stub function %s__%s has an invalid trampoline ctx offset for arg#%u\n",
+ st_ops_name, member_name, arg_no);
+ goto err_out;
}
- t = btf_type_by_id(btf, type_id);
- if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) {
- pr_warn("Cannot support #%u members in struct %s\n",
- btf_type_vlen(t), st_ops->name);
- continue;
+
+ if (args[arg_no].type != stub_args[arg_no].type) {
+ pr_warn("arg#%u type in stub function %s__%s does not match with its original func_proto\n",
+ arg_no, st_ops_name, member_name);
+ goto err_out;
}
- for_each_member(j, t, member) {
- const struct btf_type *func_proto;
+ /* Fill the information of the new argument */
+ info->reg_type =
+ PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL;
+ info->btf_id = arg_btf_id;
+ info->btf = btf;
+ info->offset = offset;
- mname = btf_name_by_offset(btf, member->name_off);
- if (!*mname) {
- pr_warn("anon member in struct %s is not supported\n",
- st_ops->name);
- break;
- }
+ info++;
+ info_cnt++;
+ }
- if (__btf_member_bitfield_size(t, member)) {
- pr_warn("bit field member %s in struct %s is not supported\n",
- mname, st_ops->name);
- break;
- }
+ if (info_cnt) {
+ arg_info->info = info_buf;
+ arg_info->cnt = info_cnt;
+ } else {
+ kfree(info_buf);
+ }
- func_proto = btf_type_resolve_func_ptr(btf,
- member->type,
- NULL);
- if (func_proto &&
- btf_distill_func_proto(log, btf,
- func_proto, mname,
- &st_ops->func_models[j])) {
- pr_warn("Error in parsing func ptr %s in struct %s\n",
- mname, st_ops->name);
- break;
- }
- }
+ return 0;
- if (j == btf_type_vlen(t)) {
- if (st_ops->init(btf)) {
- pr_warn("Error in init bpf_struct_ops %s\n",
- st_ops->name);
- } else {
- st_ops->type_id = type_id;
- st_ops->type = t;
- st_ops->value_id = value_id;
- st_ops->value_type = btf_type_by_id(btf,
- value_id);
- }
- }
- }
+err_out:
+ kfree(info_buf);
+
+ return -EINVAL;
}
-extern struct btf *btf_vmlinux;
+/* Clean up the arg_info in a struct bpf_struct_ops_desc. */
+void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc)
+{
+ struct bpf_struct_ops_arg_info *arg_info;
+ int i;
+
+ arg_info = st_ops_desc->arg_info;
+ for (i = 0; i < btf_type_vlen(st_ops_desc->type); i++)
+ kfree(arg_info[i].info);
-static const struct bpf_struct_ops *
-bpf_struct_ops_find_value(u32 value_id)
+ kfree(arg_info);
+}
+
+static bool is_module_member(const struct btf *btf, u32 id)
{
- unsigned int i;
+ const struct btf_type *t;
- if (!value_id || !btf_vmlinux)
- return NULL;
+ t = btf_type_resolve_ptr(btf, id, NULL);
+ if (!t)
+ return false;
- for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
- if (bpf_struct_ops[i]->value_id == value_id)
- return bpf_struct_ops[i];
- }
+ if (!__btf_type_is_struct(t) && !btf_type_is_fwd(t))
+ return false;
- return NULL;
+ return !strcmp(btf_name_by_offset(btf, t->name_off), "module");
}
-const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id)
+int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
+ struct btf *btf,
+ struct bpf_verifier_log *log)
{
- unsigned int i;
+ struct bpf_struct_ops *st_ops = st_ops_desc->st_ops;
+ struct bpf_struct_ops_arg_info *arg_info;
+ const struct btf_member *member;
+ const struct btf_type *t;
+ s32 type_id, value_id;
+ char value_name[128];
+ const char *mname;
+ int i, err;
- if (!type_id || !btf_vmlinux)
- return NULL;
+ if (strlen(st_ops->name) + VALUE_PREFIX_LEN >=
+ sizeof(value_name)) {
+ pr_warn("struct_ops name %s is too long\n",
+ st_ops->name);
+ return -EINVAL;
+ }
+ sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name);
- for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
- if (bpf_struct_ops[i]->type_id == type_id)
- return bpf_struct_ops[i];
+ if (!st_ops->cfi_stubs) {
+ pr_warn("struct_ops for %s has no cfi_stubs\n", st_ops->name);
+ return -EINVAL;
}
- return NULL;
+ type_id = btf_find_by_name_kind(btf, st_ops->name,
+ BTF_KIND_STRUCT);
+ if (type_id < 0) {
+ pr_warn("Cannot find struct %s in %s\n",
+ st_ops->name, btf_get_name(btf));
+ return -EINVAL;
+ }
+ t = btf_type_by_id(btf, type_id);
+ if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) {
+ pr_warn("Cannot support #%u members in struct %s\n",
+ btf_type_vlen(t), st_ops->name);
+ return -EINVAL;
+ }
+
+ value_id = btf_find_by_name_kind(btf, value_name,
+ BTF_KIND_STRUCT);
+ if (value_id < 0) {
+ pr_warn("Cannot find struct %s in %s\n",
+ value_name, btf_get_name(btf));
+ return -EINVAL;
+ }
+ if (!is_valid_value_type(btf, value_id, t, value_name))
+ return -EINVAL;
+
+ arg_info = kcalloc(btf_type_vlen(t), sizeof(*arg_info),
+ GFP_KERNEL);
+ if (!arg_info)
+ return -ENOMEM;
+
+ st_ops_desc->arg_info = arg_info;
+ st_ops_desc->type = t;
+ st_ops_desc->type_id = type_id;
+ st_ops_desc->value_id = value_id;
+ st_ops_desc->value_type = btf_type_by_id(btf, value_id);
+
+ for_each_member(i, t, member) {
+ const struct btf_type *func_proto;
+
+ mname = btf_name_by_offset(btf, member->name_off);
+ if (!*mname) {
+ pr_warn("anon member in struct %s is not supported\n",
+ st_ops->name);
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+
+ if (__btf_member_bitfield_size(t, member)) {
+ pr_warn("bit field member %s in struct %s is not supported\n",
+ mname, st_ops->name);
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+
+ if (!st_ops_ids[IDX_MODULE_ID] && is_module_member(btf, member->type)) {
+ pr_warn("'struct module' btf id not found. Is CONFIG_MODULES enabled? bpf_struct_ops '%s' needs module support.\n",
+ st_ops->name);
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+
+ func_proto = btf_type_resolve_func_ptr(btf,
+ member->type,
+ NULL);
+ if (!func_proto)
+ continue;
+
+ if (btf_distill_func_proto(log, btf,
+ func_proto, mname,
+ &st_ops->func_models[i])) {
+ pr_warn("Error in parsing func ptr %s in struct %s\n",
+ mname, st_ops->name);
+ err = -EINVAL;
+ goto errout;
+ }
+
+ err = prepare_arg_info(btf, st_ops->name, mname,
+ func_proto,
+ arg_info + i);
+ if (err)
+ goto errout;
+ }
+
+ if (st_ops->init(btf)) {
+ pr_warn("Error in init bpf_struct_ops %s\n",
+ st_ops->name);
+ err = -EINVAL;
+ goto errout;
+ }
+
+ return 0;
+
+errout:
+ bpf_struct_ops_desc_release(st_ops_desc);
+
+ return err;
}
static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key,
@@ -265,7 +470,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
kvalue = &st_map->kvalue;
/* Pair with smp_store_release() during map_update */
- state = smp_load_acquire(&kvalue->state);
+ state = smp_load_acquire(&kvalue->common.state);
if (state == BPF_STRUCT_OPS_STATE_INIT) {
memset(value, 0, map->value_size);
return 0;
@@ -276,7 +481,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
*/
uvalue = value;
memcpy(uvalue, st_map->uvalue, map->value_size);
- uvalue->state = state;
+ uvalue->common.state = state;
/* This value offers the user space a general estimate of how
* many sockets are still utilizing this struct_ops for TCP
@@ -284,7 +489,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
* should sufficiently meet our present goals.
*/
refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
- refcount_set(&uvalue->refcnt, max_t(s64, refcnt, 0));
+ refcount_set(&uvalue->common.refcnt, max_t(s64, refcnt, 0));
return 0;
}
@@ -296,18 +501,26 @@ static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key)
static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
{
- const struct btf_type *t = st_map->st_ops->type;
u32 i;
- for (i = 0; i < btf_type_vlen(t); i++) {
- if (st_map->links[i]) {
- bpf_link_put(st_map->links[i]);
- st_map->links[i] = NULL;
- }
+ for (i = 0; i < st_map->funcs_cnt; i++) {
+ if (!st_map->links[i])
+ break;
+ bpf_link_put(st_map->links[i]);
+ st_map->links[i] = NULL;
}
}
-static int check_zero_holes(const struct btf_type *t, void *data)
+static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map)
+{
+ int i;
+
+ for (i = 0; i < st_map->image_pages_cnt; i++)
+ bpf_struct_ops_image_free(st_map->image_pages[i]);
+ st_map->image_pages_cnt = 0;
+}
+
+static int check_zero_holes(const struct btf *btf, const struct btf_type *t, void *data)
{
const struct btf_member *member;
u32 i, moff, msize, prev_mend = 0;
@@ -319,8 +532,8 @@ static int check_zero_holes(const struct btf_type *t, void *data)
memchr_inv(data + prev_mend, 0, moff - prev_mend))
return -EINVAL;
- mtype = btf_type_by_id(btf_vmlinux, member->type);
- mtype = btf_resolve_size(btf_vmlinux, mtype, &msize);
+ mtype = btf_type_by_id(btf, member->type);
+ mtype = btf_resolve_size(btf, mtype, &msize);
if (IS_ERR(mtype))
return PTR_ERR(mtype);
prev_mend = moff + msize;
@@ -352,9 +565,12 @@ const struct bpf_link_ops bpf_struct_ops_link_lops = {
int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
struct bpf_tramp_link *link,
const struct btf_func_model *model,
- void *stub_func, void *image, void *image_end)
+ void *stub_func,
+ void **_image, u32 *_image_off,
+ bool allow_alloc)
{
- u32 flags = BPF_TRAMP_F_INDIRECT;
+ u32 image_off = *_image_off, flags = BPF_TRAMP_F_INDIRECT;
+ void *image = *_image;
int size;
tlinks[BPF_TRAMP_FENTRY].links[0] = link;
@@ -364,27 +580,95 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
flags |= BPF_TRAMP_F_RET_FENTRY_RET;
size = arch_bpf_trampoline_size(model, flags, tlinks, NULL);
- if (size < 0)
- return size;
- if (size > (unsigned long)image_end - (unsigned long)image)
- return -E2BIG;
- return arch_prepare_bpf_trampoline(NULL, image, image_end,
+ if (size <= 0)
+ return size ? : -EFAULT;
+
+ /* Allocate image buffer if necessary */
+ if (!image || size > PAGE_SIZE - image_off) {
+ if (!allow_alloc)
+ return -E2BIG;
+
+ image = bpf_struct_ops_image_alloc();
+ if (IS_ERR(image))
+ return PTR_ERR(image);
+ image_off = 0;
+ }
+
+ size = arch_prepare_bpf_trampoline(NULL, image + image_off,
+ image + image_off + size,
model, flags, tlinks, stub_func);
+ if (size <= 0) {
+ if (image != *_image)
+ bpf_struct_ops_image_free(image);
+ return size ? : -EFAULT;
+ }
+
+ *_image = image;
+ *_image_off = image_off + size;
+ return 0;
+}
+
+static void bpf_struct_ops_ksym_init(const char *tname, const char *mname,
+ void *image, unsigned int size,
+ struct bpf_ksym *ksym)
+{
+ snprintf(ksym->name, KSYM_NAME_LEN, "bpf__%s_%s", tname, mname);
+ INIT_LIST_HEAD_RCU(&ksym->lnode);
+ bpf_image_ksym_init(image, size, ksym);
+}
+
+static void bpf_struct_ops_map_add_ksyms(struct bpf_struct_ops_map *st_map)
+{
+ u32 i;
+
+ for (i = 0; i < st_map->funcs_cnt; i++) {
+ if (!st_map->ksyms[i])
+ break;
+ bpf_image_ksym_add(st_map->ksyms[i]);
+ }
+}
+
+static void bpf_struct_ops_map_del_ksyms(struct bpf_struct_ops_map *st_map)
+{
+ u32 i;
+
+ for (i = 0; i < st_map->funcs_cnt; i++) {
+ if (!st_map->ksyms[i])
+ break;
+ bpf_image_ksym_del(st_map->ksyms[i]);
+ }
+}
+
+static void bpf_struct_ops_map_free_ksyms(struct bpf_struct_ops_map *st_map)
+{
+ u32 i;
+
+ for (i = 0; i < st_map->funcs_cnt; i++) {
+ if (!st_map->ksyms[i])
+ break;
+ kfree(st_map->ksyms[i]);
+ st_map->ksyms[i] = NULL;
+ }
}
static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 flags)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
- const struct bpf_struct_ops *st_ops = st_map->st_ops;
+ const struct bpf_struct_ops_desc *st_ops_desc = st_map->st_ops_desc;
+ const struct bpf_struct_ops *st_ops = st_ops_desc->st_ops;
struct bpf_struct_ops_value *uvalue, *kvalue;
+ const struct btf_type *module_type;
const struct btf_member *member;
- const struct btf_type *t = st_ops->type;
+ const struct btf_type *t = st_ops_desc->type;
struct bpf_tramp_links *tlinks;
void *udata, *kdata;
int prog_fd, err;
- void *image, *image_end;
- u32 i;
+ u32 i, trampoline_start, image_off = 0;
+ void *cur_image = NULL, *image = NULL;
+ struct bpf_link **plink;
+ struct bpf_ksym **pksym;
+ const char *tname, *mname;
if (flags)
return -EINVAL;
@@ -392,16 +676,16 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
if (*(u32 *)key != 0)
return -E2BIG;
- err = check_zero_holes(st_ops->value_type, value);
+ err = check_zero_holes(st_map->btf, st_ops_desc->value_type, value);
if (err)
return err;
uvalue = value;
- err = check_zero_holes(t, uvalue->data);
+ err = check_zero_holes(st_map->btf, t, uvalue->data);
if (err)
return err;
- if (uvalue->state || refcount_read(&uvalue->refcnt))
+ if (uvalue->common.state || refcount_read(&uvalue->common.refcnt))
return -EINVAL;
tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL);
@@ -413,7 +697,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
mutex_lock(&st_map->lock);
- if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) {
+ if (kvalue->common.state != BPF_STRUCT_OPS_STATE_INIT) {
err = -EBUSY;
goto unlock;
}
@@ -422,17 +706,21 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
udata = &uvalue->data;
kdata = &kvalue->data;
- image = st_map->image;
- image_end = st_map->image + PAGE_SIZE;
+ plink = st_map->links;
+ pksym = st_map->ksyms;
+ tname = btf_name_by_offset(st_map->btf, t->name_off);
+ module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]);
for_each_member(i, t, member) {
const struct btf_type *mtype, *ptype;
struct bpf_prog *prog;
struct bpf_tramp_link *link;
+ struct bpf_ksym *ksym;
u32 moff;
moff = __btf_member_bit_offset(t, member) / 8;
- ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
+ mname = btf_name_by_offset(st_map->btf, member->name_off);
+ ptype = btf_type_resolve_ptr(st_map->btf, member->type, NULL);
if (ptype == module_type) {
if (*(void **)(udata + moff))
goto reset_unlock;
@@ -457,8 +745,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
if (!ptype || !btf_type_is_func_proto(ptype)) {
u32 msize;
- mtype = btf_type_by_id(btf_vmlinux, member->type);
- mtype = btf_resolve_size(btf_vmlinux, mtype, &msize);
+ mtype = btf_type_by_id(st_map->btf, member->type);
+ mtype = btf_resolve_size(st_map->btf, mtype, &msize);
if (IS_ERR(mtype)) {
err = PTR_ERR(mtype);
goto reset_unlock;
@@ -484,7 +772,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
}
if (prog->type != BPF_PROG_TYPE_STRUCT_OPS ||
- prog->aux->attach_btf_id != st_ops->type_id ||
+ prog->aux->attach_btf_id != st_ops_desc->type_id ||
prog->expected_attach_type != i) {
bpf_prog_put(prog);
err = -EINVAL;
@@ -499,40 +787,65 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
}
bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS,
&bpf_struct_ops_link_lops, prog);
- st_map->links[i] = &link->link;
+ *plink++ = &link->link;
+ ksym = kzalloc(sizeof(*ksym), GFP_USER);
+ if (!ksym) {
+ err = -ENOMEM;
+ goto reset_unlock;
+ }
+ *pksym++ = ksym;
+
+ trampoline_start = image_off;
err = bpf_struct_ops_prepare_trampoline(tlinks, link,
- &st_ops->func_models[i],
- *(void **)(st_ops->cfi_stubs + moff),
- image, image_end);
- if (err < 0)
+ &st_ops->func_models[i],
+ *(void **)(st_ops->cfi_stubs + moff),
+ &image, &image_off,
+ st_map->image_pages_cnt < MAX_TRAMP_IMAGE_PAGES);
+ if (err)
goto reset_unlock;
- *(void **)(kdata + moff) = image + cfi_get_offset();
- image += err;
+ if (cur_image != image) {
+ st_map->image_pages[st_map->image_pages_cnt++] = image;
+ cur_image = image;
+ trampoline_start = 0;
+ }
+
+ *(void **)(kdata + moff) = image + trampoline_start + cfi_get_offset();
/* put prog_id to udata */
*(unsigned long *)(udata + moff) = prog->aux->id;
+
+ /* init ksym for this trampoline */
+ bpf_struct_ops_ksym_init(tname, mname,
+ image + trampoline_start,
+ image_off - trampoline_start,
+ ksym);
+ }
+
+ if (st_ops->validate) {
+ err = st_ops->validate(kdata);
+ if (err)
+ goto reset_unlock;
+ }
+ for (i = 0; i < st_map->image_pages_cnt; i++) {
+ err = arch_protect_bpf_trampoline(st_map->image_pages[i],
+ PAGE_SIZE);
+ if (err)
+ goto reset_unlock;
}
if (st_map->map.map_flags & BPF_F_LINK) {
err = 0;
- if (st_ops->validate) {
- err = st_ops->validate(kdata);
- if (err)
- goto reset_unlock;
- }
- arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
/* Let bpf_link handle registration & unregistration.
*
* Pair with smp_load_acquire() during lookup_elem().
*/
- smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY);
+ smp_store_release(&kvalue->common.state, BPF_STRUCT_OPS_STATE_READY);
goto unlock;
}
- arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
- err = st_ops->reg(kdata);
+ err = st_ops->reg(kdata, NULL);
if (likely(!err)) {
/* This refcnt increment on the map here after
* 'st_ops->reg()' is secure since the state of the
@@ -545,7 +858,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
* It ensures the above udata updates (e.g. prog->aux->id)
* can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
*/
- smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE);
+ smp_store_release(&kvalue->common.state, BPF_STRUCT_OPS_STATE_INUSE);
goto unlock;
}
@@ -554,15 +867,18 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
* there was a race in registering the struct_ops (under the same name) to
* a sub-system through different struct_ops's maps.
*/
- arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE);
reset_unlock:
+ bpf_struct_ops_map_free_ksyms(st_map);
+ bpf_struct_ops_map_free_image(st_map);
bpf_struct_ops_map_put_progs(st_map);
memset(uvalue, 0, map->value_size);
memset(kvalue, 0, map->value_size);
unlock:
kfree(tlinks);
mutex_unlock(&st_map->lock);
+ if (!err)
+ bpf_struct_ops_map_add_ksyms(st_map);
return err;
}
@@ -575,12 +891,12 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
if (st_map->map.map_flags & BPF_F_LINK)
return -EOPNOTSUPP;
- prev_state = cmpxchg(&st_map->kvalue.state,
+ prev_state = cmpxchg(&st_map->kvalue.common.state,
BPF_STRUCT_OPS_STATE_INUSE,
BPF_STRUCT_OPS_STATE_TOBEFREE);
switch (prev_state) {
case BPF_STRUCT_OPS_STATE_INUSE:
- st_map->st_ops->unreg(&st_map->kvalue.data);
+ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, NULL);
bpf_map_put(map);
return 0;
case BPF_STRUCT_OPS_STATE_TOBEFREE:
@@ -597,6 +913,7 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
struct seq_file *m)
{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
void *value;
int err;
@@ -606,9 +923,10 @@ static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
if (!err) {
- btf_type_seq_show(btf_vmlinux, map->btf_vmlinux_value_type_id,
+ btf_type_seq_show(st_map->btf,
+ map->btf_vmlinux_value_type_id,
value, m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
}
kfree(value);
@@ -620,17 +938,28 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map)
if (st_map->links)
bpf_struct_ops_map_put_progs(st_map);
+ if (st_map->ksyms)
+ bpf_struct_ops_map_free_ksyms(st_map);
bpf_map_area_free(st_map->links);
- if (st_map->image) {
- arch_free_bpf_trampoline(st_map->image, PAGE_SIZE);
- bpf_jit_uncharge_modmem(PAGE_SIZE);
- }
+ bpf_map_area_free(st_map->ksyms);
+ bpf_struct_ops_map_free_image(st_map);
bpf_map_area_free(st_map->uvalue);
bpf_map_area_free(st_map);
}
static void bpf_struct_ops_map_free(struct bpf_map *map)
{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+ /* st_ops->owner was acquired during map_alloc to implicitly holds
+ * the btf's refcnt. The acquire was only done when btf_is_module()
+ * st_map->btf cannot be NULL here.
+ */
+ if (btf_is_module(st_map->btf))
+ module_put(st_map->st_ops_desc->st_ops->owner);
+
+ bpf_struct_ops_map_del_ksyms(st_map);
+
/* The struct_ops's function may switch to another struct_ops.
*
* For example, bpf_tcp_cc_x->init() may switch to
@@ -654,29 +983,74 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
{
if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
- (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
+ (attr->map_flags & ~(BPF_F_LINK | BPF_F_VTYPE_BTF_OBJ_FD)) ||
+ !attr->btf_vmlinux_value_type_id)
return -EINVAL;
return 0;
}
+static u32 count_func_ptrs(const struct btf *btf, const struct btf_type *t)
+{
+ int i;
+ u32 count;
+ const struct btf_member *member;
+
+ count = 0;
+ for_each_member(i, t, member)
+ if (btf_type_resolve_func_ptr(btf, member->type, NULL))
+ count++;
+ return count;
+}
+
static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
{
- const struct bpf_struct_ops *st_ops;
+ const struct bpf_struct_ops_desc *st_ops_desc;
size_t st_map_size;
struct bpf_struct_ops_map *st_map;
const struct btf_type *t, *vt;
+ struct module *mod = NULL;
struct bpf_map *map;
+ struct btf *btf;
int ret;
- st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
- if (!st_ops)
- return ERR_PTR(-ENOTSUPP);
+ if (attr->map_flags & BPF_F_VTYPE_BTF_OBJ_FD) {
+ /* The map holds btf for its whole life time. */
+ btf = btf_get_by_fd(attr->value_type_btf_obj_fd);
+ if (IS_ERR(btf))
+ return ERR_CAST(btf);
+ if (!btf_is_module(btf)) {
+ btf_put(btf);
+ return ERR_PTR(-EINVAL);
+ }
- vt = st_ops->value_type;
- if (attr->value_size != vt->size)
- return ERR_PTR(-EINVAL);
+ mod = btf_try_get_module(btf);
+ /* mod holds a refcnt to btf. We don't need an extra refcnt
+ * here.
+ */
+ btf_put(btf);
+ if (!mod)
+ return ERR_PTR(-EINVAL);
+ } else {
+ btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(btf))
+ return ERR_CAST(btf);
+ if (!btf)
+ return ERR_PTR(-ENOTSUPP);
+ }
- t = st_ops->type;
+ st_ops_desc = bpf_struct_ops_find_value(btf, attr->btf_vmlinux_value_type_id);
+ if (!st_ops_desc) {
+ ret = -ENOTSUPP;
+ goto errout;
+ }
+
+ vt = st_ops_desc->value_type;
+ if (attr->value_size != vt->size) {
+ ret = -EINVAL;
+ goto errout;
+ }
+
+ t = st_ops_desc->type;
st_map_size = sizeof(*st_map) +
/* kvalue stores the
@@ -685,54 +1059,54 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
(vt->size - sizeof(struct bpf_struct_ops_value));
st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE);
- if (!st_map)
- return ERR_PTR(-ENOMEM);
+ if (!st_map) {
+ ret = -ENOMEM;
+ goto errout;
+ }
- st_map->st_ops = st_ops;
+ st_map->st_ops_desc = st_ops_desc;
map = &st_map->map;
- ret = bpf_jit_charge_modmem(PAGE_SIZE);
- if (ret) {
- __bpf_struct_ops_map_free(map);
- return ERR_PTR(ret);
- }
-
- st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE);
- if (!st_map->image) {
- /* __bpf_struct_ops_map_free() uses st_map->image as flag
- * for "charged or not". In this case, we need to unchange
- * here.
- */
- bpf_jit_uncharge_modmem(PAGE_SIZE);
- __bpf_struct_ops_map_free(map);
- return ERR_PTR(-ENOMEM);
- }
st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
+ st_map->funcs_cnt = count_func_ptrs(btf, t);
st_map->links =
- bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_links *),
+ bpf_map_area_alloc(st_map->funcs_cnt * sizeof(struct bpf_link *),
NUMA_NO_NODE);
- if (!st_map->uvalue || !st_map->links) {
- __bpf_struct_ops_map_free(map);
- return ERR_PTR(-ENOMEM);
+
+ st_map->ksyms =
+ bpf_map_area_alloc(st_map->funcs_cnt * sizeof(struct bpf_ksym *),
+ NUMA_NO_NODE);
+ if (!st_map->uvalue || !st_map->links || !st_map->ksyms) {
+ ret = -ENOMEM;
+ goto errout_free;
}
+ st_map->btf = btf;
mutex_init(&st_map->lock);
bpf_map_init_from_attr(map, attr);
return map;
+
+errout_free:
+ __bpf_struct_ops_map_free(map);
+errout:
+ module_put(mod);
+
+ return ERR_PTR(ret);
}
static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
- const struct bpf_struct_ops *st_ops = st_map->st_ops;
- const struct btf_type *vt = st_ops->value_type;
+ const struct bpf_struct_ops_desc *st_ops_desc = st_map->st_ops_desc;
+ const struct btf_type *vt = st_ops_desc->value_type;
u64 usage;
usage = sizeof(*st_map) +
vt->size - sizeof(struct bpf_struct_ops_value);
usage += vt->size;
- usage += btf_type_vlen(vt) * sizeof(struct bpf_links *);
+ usage += st_map->funcs_cnt * sizeof(struct bpf_link *);
+ usage += st_map->funcs_cnt * sizeof(struct bpf_ksym *);
usage += PAGE_SIZE;
return usage;
}
@@ -778,6 +1152,13 @@ void bpf_struct_ops_put(const void *kdata)
bpf_map_put(&st_map->map);
}
+int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff)
+{
+ void *func_ptr = *(void **)(st_ops->cfi_stubs + moff);
+
+ return func_ptr ? 0 : -ENOTSUPP;
+}
+
static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
@@ -785,7 +1166,7 @@ static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
return map->map_type == BPF_MAP_TYPE_STRUCT_OPS &&
map->map_flags & BPF_F_LINK &&
/* Pair with smp_store_release() during map_update */
- smp_load_acquire(&st_map->kvalue.state) == BPF_STRUCT_OPS_STATE_READY;
+ smp_load_acquire(&st_map->kvalue.common.state) == BPF_STRUCT_OPS_STATE_READY;
}
static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
@@ -797,10 +1178,7 @@ static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
st_map = (struct bpf_struct_ops_map *)
rcu_dereference_protected(st_link->map, true);
if (st_map) {
- /* st_link->map can be NULL if
- * bpf_struct_ops_link_create() fails to register.
- */
- st_map->st_ops->unreg(&st_map->kvalue.data);
+ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link);
bpf_map_put(&st_map->map);
}
kfree(st_link);
@@ -815,7 +1193,8 @@ static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
st_link = container_of(link, struct bpf_struct_ops_link, link);
rcu_read_lock();
map = rcu_dereference(st_link->map);
- seq_printf(seq, "map_id:\t%d\n", map->id);
+ if (map)
+ seq_printf(seq, "map_id:\t%d\n", map->id);
rcu_read_unlock();
}
@@ -828,7 +1207,8 @@ static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
st_link = container_of(link, struct bpf_struct_ops_link, link);
rcu_read_lock();
map = rcu_dereference(st_link->map);
- info->struct_ops.map_id = map->id;
+ if (map)
+ info->struct_ops.map_id = map->id;
rcu_read_unlock();
return 0;
}
@@ -847,12 +1227,16 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
if (!bpf_struct_ops_valid_to_reg(new_map))
return -EINVAL;
- if (!st_map->st_ops->update)
+ if (!st_map->st_ops_desc->st_ops->update)
return -EOPNOTSUPP;
mutex_lock(&update_mutex);
old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
+ if (!old_map) {
+ err = -ENOLINK;
+ goto err_out;
+ }
if (expected_old_map && old_map != expected_old_map) {
err = -EPERM;
goto err_out;
@@ -860,12 +1244,12 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
/* The new and old struct_ops must be the same type. */
- if (st_map->st_ops != old_st_map->st_ops) {
+ if (st_map->st_ops_desc != old_st_map->st_ops_desc) {
err = -EINVAL;
goto err_out;
}
- err = st_map->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data);
+ err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data, link);
if (err)
goto err_out;
@@ -879,11 +1263,53 @@ err_out:
return err;
}
+static int bpf_struct_ops_map_link_detach(struct bpf_link *link)
+{
+ struct bpf_struct_ops_link *st_link = container_of(link, struct bpf_struct_ops_link, link);
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *map;
+
+ mutex_lock(&update_mutex);
+
+ map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
+ if (!map) {
+ mutex_unlock(&update_mutex);
+ return 0;
+ }
+ st_map = container_of(map, struct bpf_struct_ops_map, map);
+
+ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link);
+
+ RCU_INIT_POINTER(st_link->map, NULL);
+ /* Pair with bpf_map_get() in bpf_struct_ops_link_create() or
+ * bpf_map_inc() in bpf_struct_ops_map_link_update().
+ */
+ bpf_map_put(&st_map->map);
+
+ mutex_unlock(&update_mutex);
+
+ wake_up_interruptible_poll(&st_link->wait_hup, EPOLLHUP);
+
+ return 0;
+}
+
+static __poll_t bpf_struct_ops_map_link_poll(struct file *file,
+ struct poll_table_struct *pts)
+{
+ struct bpf_struct_ops_link *st_link = file->private_data;
+
+ poll_wait(file, &st_link->wait_hup, pts);
+
+ return rcu_access_pointer(st_link->map) ? 0 : EPOLLHUP;
+}
+
static const struct bpf_link_ops bpf_struct_ops_map_lops = {
.dealloc = bpf_struct_ops_map_link_dealloc,
+ .detach = bpf_struct_ops_map_link_detach,
.show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
.fill_link_info = bpf_struct_ops_map_link_fill_link_info,
.update_map = bpf_struct_ops_map_link_update,
+ .poll = bpf_struct_ops_map_link_poll,
};
int bpf_struct_ops_link_create(union bpf_attr *attr)
@@ -916,13 +1342,21 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
if (err)
goto err_out;
- err = st_map->st_ops->reg(st_map->kvalue.data);
+ init_waitqueue_head(&link->wait_hup);
+
+ /* Hold the update_mutex such that the subsystem cannot
+ * do link->ops->detach() before the link is fully initialized.
+ */
+ mutex_lock(&update_mutex);
+ err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data, &link->link);
if (err) {
+ mutex_unlock(&update_mutex);
bpf_link_cleanup(&link_primer);
link = NULL;
goto err_out;
}
RCU_INIT_POINTER(link->map, map);
+ mutex_unlock(&update_mutex);
return bpf_link_settle(&link_primer);
@@ -931,3 +1365,10 @@ err_out:
kfree(link);
return err;
}
+
+void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+ info->btf_vmlinux_id = btf_obj_id(st_map->btf);
+}
diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h
deleted file mode 100644
index 5678a9ddf817..000000000000
--- a/kernel/bpf/bpf_struct_ops_types.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* internal file - do not include directly */
-
-#ifdef CONFIG_BPF_JIT
-#ifdef CONFIG_NET
-BPF_STRUCT_OPS_TYPE(bpf_dummy_ops)
-#endif
-#ifdef CONFIG_INET
-#include <net/tcp.h>
-BPF_STRUCT_OPS_TYPE(tcp_congestion_ops)
-#endif
-#endif
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index adf6dfe0ba68..1109475953c0 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -16,7 +16,6 @@
#include <linux/filter.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
-#include <linux/fdtable.h>
#include <linux/rcupdate_trace.h>
DEFINE_BPF_STORAGE_CACHE(task_cache);
@@ -25,22 +24,20 @@ static DEFINE_PER_CPU(int, bpf_task_storage_busy);
static void bpf_task_storage_lock(void)
{
- migrate_disable();
+ cant_migrate();
this_cpu_inc(bpf_task_storage_busy);
}
static void bpf_task_storage_unlock(void)
{
this_cpu_dec(bpf_task_storage_busy);
- migrate_enable();
}
static bool bpf_task_storage_trylock(void)
{
- migrate_disable();
+ cant_migrate();
if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
this_cpu_dec(bpf_task_storage_busy);
- migrate_enable();
return false;
}
return true;
@@ -73,18 +70,19 @@ void bpf_task_storage_free(struct task_struct *task)
{
struct bpf_local_storage *local_storage;
+ migrate_disable();
rcu_read_lock();
local_storage = rcu_dereference(task->bpf_storage);
- if (!local_storage) {
- rcu_read_unlock();
- return;
- }
+ if (!local_storage)
+ goto out;
bpf_task_storage_lock();
bpf_local_storage_destroy(local_storage);
bpf_task_storage_unlock();
+out:
rcu_read_unlock();
+ migrate_enable();
}
static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
@@ -129,6 +127,9 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
struct pid *pid;
int fd, err;
+ if ((map_flags & BPF_F_LOCK) && btf_record_has_field(map->record, BPF_UPTR))
+ return -EOPNOTSUPP;
+
fd = *(int *)key;
pid = pidfd_get_pid(fd, &f_flags);
if (IS_ERR(pid))
@@ -147,7 +148,7 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
bpf_task_storage_lock();
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value, map_flags,
- GFP_ATOMIC);
+ true, GFP_ATOMIC);
bpf_task_storage_unlock();
err = PTR_ERR_OR_ZERO(sdata);
@@ -219,7 +220,7 @@ static void *__bpf_task_storage_get(struct bpf_map *map,
(flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) {
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST, gfp_flags);
+ BPF_NOEXIST, false, gfp_flags);
return IS_ERR(sdata) ? NULL : sdata->data;
}
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 596471189176..c3223e0db2f5 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -19,6 +19,7 @@
#include <linux/bpf_verifier.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>
+#include <linux/bpf.h>
#include <linux/bpf_lsm.h>
#include <linux/skmsg.h>
#include <linux/perf_event.h>
@@ -211,12 +212,13 @@ enum btf_kfunc_hook {
BTF_KFUNC_HOOK_TRACING,
BTF_KFUNC_HOOK_SYSCALL,
BTF_KFUNC_HOOK_FMODRET,
- BTF_KFUNC_HOOK_CGROUP_SKB,
+ BTF_KFUNC_HOOK_CGROUP,
BTF_KFUNC_HOOK_SCHED_ACT,
BTF_KFUNC_HOOK_SK_SKB,
BTF_KFUNC_HOOK_SOCKET_FILTER,
BTF_KFUNC_HOOK_LWT,
BTF_KFUNC_HOOK_NETFILTER,
+ BTF_KFUNC_HOOK_KPROBE,
BTF_KFUNC_HOOK_MAX,
};
@@ -241,6 +243,12 @@ struct btf_id_dtor_kfunc_tab {
struct btf_id_dtor_kfunc dtors[];
};
+struct btf_struct_ops_tab {
+ u32 cnt;
+ u32 capacity;
+ struct bpf_struct_ops_desc ops[];
+};
+
struct btf {
void *data;
struct btf_type **types;
@@ -258,6 +266,7 @@ struct btf {
struct btf_kfunc_set_tab *kfunc_set_tab;
struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab;
struct btf_struct_metas *struct_meta_tab;
+ struct btf_struct_ops_tab *struct_ops_tab;
/* split BTF support */
struct btf *base_btf;
@@ -265,6 +274,7 @@ struct btf {
u32 start_str_off; /* first string offset (0 for base BTF) */
char name[MODULE_NAME_LEN];
bool kernel_btf;
+ __u32 *base_id_map; /* map from distilled base BTF -> vmlinux BTF ids */
};
enum verifier_phase {
@@ -405,7 +415,7 @@ const char *btf_type_str(const struct btf_type *t)
struct btf_show {
u64 flags;
void *target; /* target of show operation (seq file, buffer) */
- void (*showfn)(struct btf_show *show, const char *fmt, va_list args);
+ __printf(2, 0) void (*showfn)(struct btf_show *show, const char *fmt, va_list args);
const struct btf *btf;
/* below are used during iteration */
struct {
@@ -488,11 +498,6 @@ bool btf_type_is_void(const struct btf_type *t)
return t == &btf_void;
}
-static bool btf_type_is_fwd(const struct btf_type *t)
-{
- return BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
-}
-
static bool btf_type_is_datasec(const struct btf_type *t)
{
return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
@@ -521,6 +526,11 @@ static bool btf_type_is_decl_tag_target(const struct btf_type *t)
btf_type_is_var(t) || btf_type_is_typedef(t);
}
+bool btf_is_vmlinux(const struct btf *btf)
+{
+ return btf->kernel_btf && !btf->base_btf;
+}
+
u32 btf_nr_types(const struct btf *btf)
{
u32 total = 0;
@@ -763,7 +773,7 @@ static bool __btf_name_char_ok(char c, bool first)
return true;
}
-static const char *btf_str_by_offset(const struct btf *btf, u32 offset)
+const char *btf_str_by_offset(const struct btf *btf, u32 offset)
{
while (offset < btf->start_str_off)
btf = btf->base_btf;
@@ -775,7 +785,7 @@ static const char *btf_str_by_offset(const struct btf *btf, u32 offset)
return NULL;
}
-static bool __btf_name_valid(const struct btf *btf, u32 offset)
+static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
{
/* offset must be valid */
const char *src = btf_str_by_offset(btf, offset);
@@ -796,14 +806,25 @@ static bool __btf_name_valid(const struct btf *btf, u32 offset)
return !*src;
}
-static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
-{
- return __btf_name_valid(btf, offset);
-}
-
+/* Allow any printable character in DATASEC names */
static bool btf_name_valid_section(const struct btf *btf, u32 offset)
{
- return __btf_name_valid(btf, offset);
+ /* offset must be valid */
+ const char *src = btf_str_by_offset(btf, offset);
+ const char *src_limit;
+
+ if (!*src)
+ return false;
+
+ /* set a limit on identifier length */
+ src_limit = src + KSYM_NAME_LEN;
+ while (*src && src < src_limit) {
+ if (!isprint(*src))
+ return false;
+ src++;
+ }
+
+ return !*src;
}
static const char *__btf_name_by_offset(const struct btf *btf, u32 offset)
@@ -1647,14 +1668,8 @@ static void btf_free_kfunc_set_tab(struct btf *btf)
if (!tab)
return;
- /* For module BTF, we directly assign the sets being registered, so
- * there is nothing to free except kfunc_set_tab.
- */
- if (btf_is_module(btf))
- goto free_tab;
for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++)
kfree(tab->sets[hook]);
-free_tab:
kfree(tab);
btf->kfunc_set_tab = NULL;
}
@@ -1688,15 +1703,36 @@ static void btf_free_struct_meta_tab(struct btf *btf)
btf->struct_meta_tab = NULL;
}
+static void btf_free_struct_ops_tab(struct btf *btf)
+{
+ struct btf_struct_ops_tab *tab = btf->struct_ops_tab;
+ u32 i;
+
+ if (!tab)
+ return;
+
+ for (i = 0; i < tab->cnt; i++)
+ bpf_struct_ops_desc_release(&tab->ops[i]);
+
+ kfree(tab);
+ btf->struct_ops_tab = NULL;
+}
+
static void btf_free(struct btf *btf)
{
btf_free_struct_meta_tab(btf);
btf_free_dtor_kfunc_tab(btf);
btf_free_kfunc_set_tab(btf);
+ btf_free_struct_ops_tab(btf);
kvfree(btf->types);
kvfree(btf->resolved_sizes);
kvfree(btf->resolved_ids);
- kvfree(btf->data);
+ /* vmlinux does not allocate btf->data, it simply points it at
+ * __start_BTF.
+ */
+ if (!btf_is_vmlinux(btf))
+ kvfree(btf->data);
+ kvfree(btf->base_id_map);
kfree(btf);
}
@@ -1707,6 +1743,11 @@ static void btf_free_rcu(struct rcu_head *rcu)
btf_free(btf);
}
+const char *btf_get_name(const struct btf *btf)
+{
+ return btf->name;
+}
+
void btf_get(struct btf *btf)
{
refcount_inc(&btf->refcnt);
@@ -1720,6 +1761,23 @@ void btf_put(struct btf *btf)
}
}
+struct btf *btf_base_btf(const struct btf *btf)
+{
+ return btf->base_btf;
+}
+
+const struct btf_header *btf_header(const struct btf *btf)
+{
+ return &btf->hdr;
+}
+
+void btf_set_base_btf(struct btf *btf, const struct btf *base_btf)
+{
+ btf->base_btf = (struct btf *)base_btf;
+ btf->start_id = btf_nr_types(base_btf);
+ btf->start_str_off = base_btf->hdr.str_len;
+}
+
static int env_resolve_init(struct btf_verifier_env *env)
{
struct btf *btf = env->btf;
@@ -2745,7 +2803,7 @@ static void btf_ref_type_log(struct btf_verifier_env *env,
btf_verifier_log(env, "type_id=%u", t->type);
}
-static struct btf_kind_operations modifier_ops = {
+static const struct btf_kind_operations modifier_ops = {
.check_meta = btf_ref_type_check_meta,
.resolve = btf_modifier_resolve,
.check_member = btf_modifier_check_member,
@@ -2754,7 +2812,7 @@ static struct btf_kind_operations modifier_ops = {
.show = btf_modifier_show,
};
-static struct btf_kind_operations ptr_ops = {
+static const struct btf_kind_operations ptr_ops = {
.check_meta = btf_ref_type_check_meta,
.resolve = btf_ptr_resolve,
.check_member = btf_ptr_check_member,
@@ -2795,7 +2853,7 @@ static void btf_fwd_type_log(struct btf_verifier_env *env,
btf_verifier_log(env, "%s", btf_type_kflag(t) ? "union" : "struct");
}
-static struct btf_kind_operations fwd_ops = {
+static const struct btf_kind_operations fwd_ops = {
.check_meta = btf_fwd_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_df_check_member,
@@ -3046,7 +3104,7 @@ static void btf_array_show(const struct btf *btf, const struct btf_type *t,
__btf_array_show(btf, t, type_id, data, bits_offset, show);
}
-static struct btf_kind_operations array_ops = {
+static const struct btf_kind_operations array_ops = {
.check_meta = btf_array_check_meta,
.resolve = btf_array_resolve,
.check_member = btf_array_check_member,
@@ -3271,7 +3329,7 @@ static int btf_find_struct(const struct btf *btf, const struct btf_type *t,
}
static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
- u32 off, int sz, struct btf_field_info *info)
+ u32 off, int sz, struct btf_field_info *info, u32 field_mask)
{
enum btf_field_type type;
u32 res_id;
@@ -3295,9 +3353,14 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
type = BPF_KPTR_REF;
else if (!strcmp("percpu_kptr", __btf_name_by_offset(btf, t->name_off)))
type = BPF_KPTR_PERCPU;
+ else if (!strcmp("uptr", __btf_name_by_offset(btf, t->name_off)))
+ type = BPF_UPTR;
else
return -EINVAL;
+ if (!(type & field_mask))
+ return BTF_FIELD_IGNORE;
+
/* Get the base type */
t = btf_type_skip_modifiers(btf, t->type, &res_id);
/* Only pointer to struct is allowed */
@@ -3310,30 +3373,48 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
return BTF_FIELD_FOUND;
}
-const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
- int comp_idx, const char *tag_key)
+int btf_find_next_decl_tag(const struct btf *btf, const struct btf_type *pt,
+ int comp_idx, const char *tag_key, int last_id)
{
- const char *value = NULL;
- int i;
+ int len = strlen(tag_key);
+ int i, n;
- for (i = 1; i < btf_nr_types(btf); i++) {
+ for (i = last_id + 1, n = btf_nr_types(btf); i < n; i++) {
const struct btf_type *t = btf_type_by_id(btf, i);
- int len = strlen(tag_key);
if (!btf_type_is_decl_tag(t))
continue;
- if (pt != btf_type_by_id(btf, t->type) ||
- btf_type_decl_tag(t)->component_idx != comp_idx)
+ if (pt != btf_type_by_id(btf, t->type))
+ continue;
+ if (btf_type_decl_tag(t)->component_idx != comp_idx)
continue;
if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len))
continue;
- /* Prevent duplicate entries for same type */
- if (value)
- return ERR_PTR(-EEXIST);
- value = __btf_name_by_offset(btf, t->name_off) + len;
+ return i;
}
- if (!value)
- return ERR_PTR(-ENOENT);
+ return -ENOENT;
+}
+
+const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
+ int comp_idx, const char *tag_key)
+{
+ const char *value = NULL;
+ const struct btf_type *t;
+ int len, id;
+
+ id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, 0);
+ if (id < 0)
+ return ERR_PTR(id);
+
+ t = btf_type_by_id(btf, id);
+ len = strlen(tag_key);
+ value = __btf_name_by_offset(btf, t->name_off) + len;
+
+ /* Prevent duplicate entries for same type */
+ id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, id);
+ if (id >= 0)
+ return ERR_PTR(-EEXIST);
+
return value;
}
@@ -3380,10 +3461,12 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt,
goto end; \
}
-static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
+static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_type,
+ u32 field_mask, u32 *seen_mask,
int *align, int *sz)
{
int type = 0;
+ const char *name = __btf_name_by_offset(btf, var_type->name_off);
if (field_mask & BPF_SPIN_LOCK) {
if (!strcmp(name, "bpf_spin_lock")) {
@@ -3403,6 +3486,15 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
goto end;
}
}
+ if (field_mask & BPF_WORKQUEUE) {
+ if (!strcmp(name, "bpf_wq")) {
+ if (*seen_mask & BPF_WORKQUEUE)
+ return -E2BIG;
+ *seen_mask |= BPF_WORKQUEUE;
+ type = BPF_WORKQUEUE;
+ goto end;
+ }
+ }
field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head");
field_mask_test_name(BPF_LIST_NODE, "bpf_list_node");
field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root");
@@ -3410,7 +3502,7 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
field_mask_test_name(BPF_REFCOUNT, "bpf_refcount");
/* Only return BPF_KPTR when all other types with matchable names fail */
- if (field_mask & BPF_KPTR) {
+ if (field_mask & (BPF_KPTR | BPF_UPTR) && !__btf_type_is_struct(var_type)) {
type = BPF_KPTR_REF;
goto end;
}
@@ -3423,138 +3515,240 @@ end:
#undef field_mask_test_name
+/* Repeat a number of fields for a specified number of times.
+ *
+ * Copy the fields starting from the first field and repeat them for
+ * repeat_cnt times. The fields are repeated by adding the offset of each
+ * field with
+ * (i + 1) * elem_size
+ * where i is the repeat index and elem_size is the size of an element.
+ */
+static int btf_repeat_fields(struct btf_field_info *info, int info_cnt,
+ u32 field_cnt, u32 repeat_cnt, u32 elem_size)
+{
+ u32 i, j;
+ u32 cur;
+
+ /* Ensure not repeating fields that should not be repeated. */
+ for (i = 0; i < field_cnt; i++) {
+ switch (info[i].type) {
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
+ case BPF_LIST_HEAD:
+ case BPF_RB_ROOT:
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ /* The type of struct size or variable size is u32,
+ * so the multiplication will not overflow.
+ */
+ if (field_cnt * (repeat_cnt + 1) > info_cnt)
+ return -E2BIG;
+
+ cur = field_cnt;
+ for (i = 0; i < repeat_cnt; i++) {
+ memcpy(&info[cur], &info[0], field_cnt * sizeof(info[0]));
+ for (j = 0; j < field_cnt; j++)
+ info[cur++].off += (i + 1) * elem_size;
+ }
+
+ return 0;
+}
+
static int btf_find_struct_field(const struct btf *btf,
const struct btf_type *t, u32 field_mask,
- struct btf_field_info *info, int info_cnt)
+ struct btf_field_info *info, int info_cnt,
+ u32 level);
+
+/* Find special fields in the struct type of a field.
+ *
+ * This function is used to find fields of special types that is not a
+ * global variable or a direct field of a struct type. It also handles the
+ * repetition if it is the element type of an array.
+ */
+static int btf_find_nested_struct(const struct btf *btf, const struct btf_type *t,
+ u32 off, u32 nelems,
+ u32 field_mask, struct btf_field_info *info,
+ int info_cnt, u32 level)
{
- int ret, idx = 0, align, sz, field_type;
- const struct btf_member *member;
+ int ret, err, i;
+
+ level++;
+ if (level >= MAX_RESOLVE_DEPTH)
+ return -E2BIG;
+
+ ret = btf_find_struct_field(btf, t, field_mask, info, info_cnt, level);
+
+ if (ret <= 0)
+ return ret;
+
+ /* Shift the offsets of the nested struct fields to the offsets
+ * related to the container.
+ */
+ for (i = 0; i < ret; i++)
+ info[i].off += off;
+
+ if (nelems > 1) {
+ err = btf_repeat_fields(info, info_cnt, ret, nelems - 1, t->size);
+ if (err == 0)
+ ret *= nelems;
+ else
+ ret = err;
+ }
+
+ return ret;
+}
+
+static int btf_find_field_one(const struct btf *btf,
+ const struct btf_type *var,
+ const struct btf_type *var_type,
+ int var_idx,
+ u32 off, u32 expected_size,
+ u32 field_mask, u32 *seen_mask,
+ struct btf_field_info *info, int info_cnt,
+ u32 level)
+{
+ int ret, align, sz, field_type;
struct btf_field_info tmp;
+ const struct btf_array *array;
+ u32 i, nelems = 1;
+
+ /* Walk into array types to find the element type and the number of
+ * elements in the (flattened) array.
+ */
+ for (i = 0; i < MAX_RESOLVE_DEPTH && btf_type_is_array(var_type); i++) {
+ array = btf_array(var_type);
+ nelems *= array->nelems;
+ var_type = btf_type_by_id(btf, array->type);
+ }
+ if (i == MAX_RESOLVE_DEPTH)
+ return -E2BIG;
+ if (nelems == 0)
+ return 0;
+
+ field_type = btf_get_field_type(btf, var_type,
+ field_mask, seen_mask, &align, &sz);
+ /* Look into variables of struct types */
+ if (!field_type && __btf_type_is_struct(var_type)) {
+ sz = var_type->size;
+ if (expected_size && expected_size != sz * nelems)
+ return 0;
+ ret = btf_find_nested_struct(btf, var_type, off, nelems, field_mask,
+ &info[0], info_cnt, level);
+ return ret;
+ }
+
+ if (field_type == 0)
+ return 0;
+ if (field_type < 0)
+ return field_type;
+
+ if (expected_size && expected_size != sz * nelems)
+ return 0;
+ if (off % align)
+ return 0;
+
+ switch (field_type) {
+ case BPF_SPIN_LOCK:
+ case BPF_TIMER:
+ case BPF_WORKQUEUE:
+ case BPF_LIST_NODE:
+ case BPF_RB_NODE:
+ case BPF_REFCOUNT:
+ ret = btf_find_struct(btf, var_type, off, sz, field_type,
+ info_cnt ? &info[0] : &tmp);
+ if (ret < 0)
+ return ret;
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
+ ret = btf_find_kptr(btf, var_type, off, sz,
+ info_cnt ? &info[0] : &tmp, field_mask);
+ if (ret < 0)
+ return ret;
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_RB_ROOT:
+ ret = btf_find_graph_root(btf, var, var_type,
+ var_idx, off, sz,
+ info_cnt ? &info[0] : &tmp,
+ field_type);
+ if (ret < 0)
+ return ret;
+ break;
+ default:
+ return -EFAULT;
+ }
+
+ if (ret == BTF_FIELD_IGNORE)
+ return 0;
+ if (!info_cnt)
+ return -E2BIG;
+ if (nelems > 1) {
+ ret = btf_repeat_fields(info, info_cnt, 1, nelems - 1, sz);
+ if (ret < 0)
+ return ret;
+ }
+ return nelems;
+}
+
+static int btf_find_struct_field(const struct btf *btf,
+ const struct btf_type *t, u32 field_mask,
+ struct btf_field_info *info, int info_cnt,
+ u32 level)
+{
+ int ret, idx = 0;
+ const struct btf_member *member;
u32 i, off, seen_mask = 0;
for_each_member(i, t, member) {
const struct btf_type *member_type = btf_type_by_id(btf,
member->type);
- field_type = btf_get_field_type(__btf_name_by_offset(btf, member_type->name_off),
- field_mask, &seen_mask, &align, &sz);
- if (field_type == 0)
- continue;
- if (field_type < 0)
- return field_type;
-
off = __btf_member_bit_offset(t, member);
if (off % 8)
/* valid C code cannot generate such BTF */
return -EINVAL;
off /= 8;
- if (off % align)
- continue;
-
- switch (field_type) {
- case BPF_SPIN_LOCK:
- case BPF_TIMER:
- case BPF_LIST_NODE:
- case BPF_RB_NODE:
- case BPF_REFCOUNT:
- ret = btf_find_struct(btf, member_type, off, sz, field_type,
- idx < info_cnt ? &info[idx] : &tmp);
- if (ret < 0)
- return ret;
- break;
- case BPF_KPTR_UNREF:
- case BPF_KPTR_REF:
- case BPF_KPTR_PERCPU:
- ret = btf_find_kptr(btf, member_type, off, sz,
- idx < info_cnt ? &info[idx] : &tmp);
- if (ret < 0)
- return ret;
- break;
- case BPF_LIST_HEAD:
- case BPF_RB_ROOT:
- ret = btf_find_graph_root(btf, t, member_type,
- i, off, sz,
- idx < info_cnt ? &info[idx] : &tmp,
- field_type);
- if (ret < 0)
- return ret;
- break;
- default:
- return -EFAULT;
- }
- if (ret == BTF_FIELD_IGNORE)
- continue;
- if (idx >= info_cnt)
- return -E2BIG;
- ++idx;
+ ret = btf_find_field_one(btf, t, member_type, i,
+ off, 0,
+ field_mask, &seen_mask,
+ &info[idx], info_cnt - idx, level);
+ if (ret < 0)
+ return ret;
+ idx += ret;
}
return idx;
}
static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
u32 field_mask, struct btf_field_info *info,
- int info_cnt)
+ int info_cnt, u32 level)
{
- int ret, idx = 0, align, sz, field_type;
+ int ret, idx = 0;
const struct btf_var_secinfo *vsi;
- struct btf_field_info tmp;
u32 i, off, seen_mask = 0;
for_each_vsi(i, t, vsi) {
const struct btf_type *var = btf_type_by_id(btf, vsi->type);
const struct btf_type *var_type = btf_type_by_id(btf, var->type);
- field_type = btf_get_field_type(__btf_name_by_offset(btf, var_type->name_off),
- field_mask, &seen_mask, &align, &sz);
- if (field_type == 0)
- continue;
- if (field_type < 0)
- return field_type;
-
off = vsi->offset;
- if (vsi->size != sz)
- continue;
- if (off % align)
- continue;
-
- switch (field_type) {
- case BPF_SPIN_LOCK:
- case BPF_TIMER:
- case BPF_LIST_NODE:
- case BPF_RB_NODE:
- case BPF_REFCOUNT:
- ret = btf_find_struct(btf, var_type, off, sz, field_type,
- idx < info_cnt ? &info[idx] : &tmp);
- if (ret < 0)
- return ret;
- break;
- case BPF_KPTR_UNREF:
- case BPF_KPTR_REF:
- case BPF_KPTR_PERCPU:
- ret = btf_find_kptr(btf, var_type, off, sz,
- idx < info_cnt ? &info[idx] : &tmp);
- if (ret < 0)
- return ret;
- break;
- case BPF_LIST_HEAD:
- case BPF_RB_ROOT:
- ret = btf_find_graph_root(btf, var, var_type,
- -1, off, sz,
- idx < info_cnt ? &info[idx] : &tmp,
- field_type);
- if (ret < 0)
- return ret;
- break;
- default:
- return -EFAULT;
- }
-
- if (ret == BTF_FIELD_IGNORE)
- continue;
- if (idx >= info_cnt)
- return -E2BIG;
- ++idx;
+ ret = btf_find_field_one(btf, var, var_type, -1, off, vsi->size,
+ field_mask, &seen_mask,
+ &info[idx], info_cnt - idx,
+ level);
+ if (ret < 0)
+ return ret;
+ idx += ret;
}
return idx;
}
@@ -3564,12 +3758,13 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t,
int info_cnt)
{
if (__btf_type_is_struct(t))
- return btf_find_struct_field(btf, t, field_mask, info, info_cnt);
+ return btf_find_struct_field(btf, t, field_mask, info, info_cnt, 0);
else if (btf_type_is_datasec(t))
- return btf_find_datasec_var(btf, t, field_mask, info, info_cnt);
+ return btf_find_datasec_var(btf, t, field_mask, info, info_cnt, 0);
return -EINVAL;
}
+/* Callers have to ensure the life cycle of btf if it is program BTF */
static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
struct btf_field_info *info)
{
@@ -3598,7 +3793,6 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
field->kptr.dtor = NULL;
id = info->kptr.type_id;
kptr_btf = (struct btf *)btf;
- btf_get(kptr_btf);
goto found_dtor;
}
if (id < 0)
@@ -3755,6 +3949,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
rec->spin_lock_off = -EINVAL;
rec->timer_off = -EINVAL;
+ rec->wq_off = -EINVAL;
rec->refcount_off = -EINVAL;
for (i = 0; i < cnt; i++) {
field_type_size = btf_field_type_size(info_arr[i].type);
@@ -3785,6 +3980,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
/* Cache offset for faster lookup at runtime */
rec->timer_off = rec->fields[i].offset;
break;
+ case BPF_WORKQUEUE:
+ WARN_ON_ONCE(rec->wq_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->wq_off = rec->fields[i].offset;
+ break;
case BPF_REFCOUNT:
WARN_ON_ONCE(rec->refcount_off >= 0);
/* Cache offset for faster lookup at runtime */
@@ -3793,6 +3993,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]);
if (ret < 0)
goto end;
@@ -3852,12 +4053,28 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
* Hence we only need to ensure that bpf_{list_head,rb_root} ownership
* does not form cycles.
*/
- if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_GRAPH_ROOT))
+ if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & (BPF_GRAPH_ROOT | BPF_UPTR)))
return 0;
for (i = 0; i < rec->cnt; i++) {
struct btf_struct_meta *meta;
+ const struct btf_type *t;
u32 btf_id;
+ if (rec->fields[i].type == BPF_UPTR) {
+ /* The uptr only supports pinning one page and cannot
+ * point to a kernel struct
+ */
+ if (btf_is_kernel(rec->fields[i].kptr.btf))
+ return -EINVAL;
+ t = btf_type_by_id(rec->fields[i].kptr.btf,
+ rec->fields[i].kptr.btf_id);
+ if (!t->size)
+ return -EINVAL;
+ if (t->size > PAGE_SIZE)
+ return -E2BIG;
+ continue;
+ }
+
if (!(rec->fields[i].type & BPF_GRAPH_ROOT))
continue;
btf_id = rec->fields[i].graph_root.value_btf_id;
@@ -3993,7 +4210,7 @@ static void btf_struct_show(const struct btf *btf, const struct btf_type *t,
__btf_struct_show(btf, t, type_id, data, bits_offset, show);
}
-static struct btf_kind_operations struct_ops = {
+static const struct btf_kind_operations struct_ops = {
.check_meta = btf_struct_check_meta,
.resolve = btf_struct_resolve,
.check_member = btf_struct_check_member,
@@ -4161,7 +4378,7 @@ static void btf_enum_show(const struct btf *btf, const struct btf_type *t,
btf_show_end_type(show);
}
-static struct btf_kind_operations enum_ops = {
+static const struct btf_kind_operations enum_ops = {
.check_meta = btf_enum_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_enum_check_member,
@@ -4264,7 +4481,7 @@ static void btf_enum64_show(const struct btf *btf, const struct btf_type *t,
btf_show_end_type(show);
}
-static struct btf_kind_operations enum64_ops = {
+static const struct btf_kind_operations enum64_ops = {
.check_meta = btf_enum64_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_enum_check_member,
@@ -4342,7 +4559,7 @@ done:
btf_verifier_log(env, ")");
}
-static struct btf_kind_operations func_proto_ops = {
+static const struct btf_kind_operations func_proto_ops = {
.check_meta = btf_func_proto_check_meta,
.resolve = btf_df_resolve,
/*
@@ -4400,7 +4617,7 @@ static int btf_func_resolve(struct btf_verifier_env *env,
return 0;
}
-static struct btf_kind_operations func_ops = {
+static const struct btf_kind_operations func_ops = {
.check_meta = btf_func_check_meta,
.resolve = btf_func_resolve,
.check_member = btf_df_check_member,
@@ -4434,7 +4651,7 @@ static s32 btf_var_check_meta(struct btf_verifier_env *env,
}
if (!t->name_off ||
- !__btf_name_valid(env->btf, t->name_off)) {
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
@@ -5322,36 +5539,72 @@ static const char *alloc_obj_fields[] = {
static struct btf_struct_metas *
btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
{
- union {
- struct btf_id_set set;
- struct {
- u32 _cnt;
- u32 _ids[ARRAY_SIZE(alloc_obj_fields)];
- } _arr;
- } aof;
struct btf_struct_metas *tab = NULL;
+ struct btf_id_set *aof;
int i, n, id, ret;
BUILD_BUG_ON(offsetof(struct btf_id_set, cnt) != 0);
BUILD_BUG_ON(sizeof(struct btf_id_set) != sizeof(u32));
- memset(&aof, 0, sizeof(aof));
+ aof = kmalloc(sizeof(*aof), GFP_KERNEL | __GFP_NOWARN);
+ if (!aof)
+ return ERR_PTR(-ENOMEM);
+ aof->cnt = 0;
+
for (i = 0; i < ARRAY_SIZE(alloc_obj_fields); i++) {
/* Try to find whether this special type exists in user BTF, and
* if so remember its ID so we can easily find it among members
* of structs that we iterate in the next loop.
*/
+ struct btf_id_set *new_aof;
+
id = btf_find_by_name_kind(btf, alloc_obj_fields[i], BTF_KIND_STRUCT);
if (id < 0)
continue;
- aof.set.ids[aof.set.cnt++] = id;
+
+ new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!new_aof) {
+ ret = -ENOMEM;
+ goto free_aof;
+ }
+ aof = new_aof;
+ aof->ids[aof->cnt++] = id;
}
- if (!aof.set.cnt)
+ n = btf_nr_types(btf);
+ for (i = 1; i < n; i++) {
+ /* Try to find if there are kptrs in user BTF and remember their ID */
+ struct btf_id_set *new_aof;
+ struct btf_field_info tmp;
+ const struct btf_type *t;
+
+ t = btf_type_by_id(btf, i);
+ if (!t) {
+ ret = -EINVAL;
+ goto free_aof;
+ }
+
+ ret = btf_find_kptr(btf, t, 0, 0, &tmp, BPF_KPTR);
+ if (ret != BTF_FIELD_FOUND)
+ continue;
+
+ new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!new_aof) {
+ ret = -ENOMEM;
+ goto free_aof;
+ }
+ aof = new_aof;
+ aof->ids[aof->cnt++] = i;
+ }
+
+ if (!aof->cnt) {
+ kfree(aof);
return NULL;
- sort(&aof.set.ids, aof.set.cnt, sizeof(aof.set.ids[0]), btf_id_cmp_func, NULL);
+ }
+ sort(&aof->ids, aof->cnt, sizeof(aof->ids[0]), btf_id_cmp_func, NULL);
- n = btf_nr_types(btf);
for (i = 1; i < n; i++) {
struct btf_struct_metas *new_tab;
const struct btf_member *member;
@@ -5361,17 +5614,13 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
int j, tab_cnt;
t = btf_type_by_id(btf, i);
- if (!t) {
- ret = -EINVAL;
- goto free;
- }
if (!__btf_type_is_struct(t))
continue;
cond_resched();
for_each_member(j, t, member) {
- if (btf_id_set_contains(&aof.set, member->type))
+ if (btf_id_set_contains(aof, member->type))
goto parse;
}
continue;
@@ -5390,7 +5639,8 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
type = &tab->types[tab->cnt];
type->btf_id = i;
record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
- BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT, t->size);
+ BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT |
+ BPF_KPTR, t->size);
/* The record cannot be unset, treat it as an error if so */
if (IS_ERR_OR_NULL(record)) {
ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
@@ -5399,9 +5649,12 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
type->record = record;
tab->cnt++;
}
+ kfree(aof);
return tab;
free:
btf_struct_metas_free(tab);
+free_aof:
+ kfree(aof);
return ERR_PTR(ret);
}
@@ -5581,8 +5834,8 @@ errout_free:
return ERR_PTR(err);
}
-extern char __weak __start_BTF[];
-extern char __weak __stop_BTF[];
+extern char __start_BTF[];
+extern char __stop_BTF[];
extern struct btf *btf_vmlinux;
#define BPF_MAP_TYPE(_id, _ops)
@@ -5647,15 +5900,38 @@ static int find_kern_ctx_type_id(enum bpf_prog_type prog_type)
return ctx_type->type;
}
-const struct btf_type *
-btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
- const struct btf_type *t, enum bpf_prog_type prog_type,
- int arg)
+bool btf_is_projection_of(const char *pname, const char *tname)
+{
+ if (strcmp(pname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0)
+ return true;
+ if (strcmp(pname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0)
+ return true;
+ return false;
+}
+
+bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
+ const struct btf_type *t, enum bpf_prog_type prog_type,
+ int arg)
{
const struct btf_type *ctx_type;
const char *tname, *ctx_tname;
t = btf_type_by_id(btf, t->type);
+
+ /* KPROBE programs allow bpf_user_pt_regs_t typedef, which we need to
+ * check before we skip all the typedef below.
+ */
+ if (prog_type == BPF_PROG_TYPE_KPROBE) {
+ while (btf_type_is_modifier(t) && !btf_type_is_typedef(t))
+ t = btf_type_by_id(btf, t->type);
+
+ if (btf_type_is_typedef(t)) {
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (tname && strcmp(tname, "bpf_user_pt_regs_t") == 0)
+ return true;
+ }
+ }
+
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
if (!btf_type_is_struct(t)) {
@@ -5664,27 +5940,30 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
* is not supported yet.
* BPF_PROG_TYPE_RAW_TRACEPOINT is fine.
*/
- return NULL;
+ return false;
}
tname = btf_name_by_offset(btf, t->name_off);
if (!tname) {
bpf_log(log, "arg#%d struct doesn't have a name\n", arg);
- return NULL;
+ return false;
}
ctx_type = find_canonical_prog_ctx_type(prog_type);
if (!ctx_type) {
bpf_log(log, "btf_vmlinux is malformed\n");
/* should not happen */
- return NULL;
+ return false;
}
again:
ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off);
if (!ctx_tname) {
/* should not happen */
bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n");
- return NULL;
+ return false;
}
+ /* program types without named context types work only with arg:ctx tag */
+ if (ctx_tname[0] == '\0')
+ return false;
/* only compare that prog's ctx type name is the same as
* kernel expects. No need to compare field by field.
* It's ok for bpf prog to do:
@@ -5692,21 +5971,19 @@ again:
* int socket_filter_bpf_prog(struct __sk_buff *skb)
* { // no fields of skb are ever used }
*/
- if (strcmp(ctx_tname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0)
- return ctx_type;
- if (strcmp(ctx_tname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0)
- return ctx_type;
+ if (btf_is_projection_of(ctx_tname, tname))
+ return true;
if (strcmp(ctx_tname, tname)) {
/* bpf_user_pt_regs_t is a typedef, so resolve it to
* underlying struct and check name again
*/
if (!btf_type_is_modifier(ctx_type))
- return NULL;
+ return false;
while (btf_type_is_modifier(ctx_type))
ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type);
goto again;
}
- return ctx_type;
+ return true;
}
/* forward declarations for arch-specific underlying types of
@@ -5858,7 +6135,7 @@ static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
enum bpf_prog_type prog_type,
int arg)
{
- if (!btf_get_prog_ctx_type(log, btf, t, prog_type, arg))
+ if (!btf_is_prog_ctx_type(log, btf, t, prog_type, arg))
return -ENOENT;
return find_kern_ctx_type_id(prog_type);
}
@@ -5886,19 +6163,14 @@ int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_ty
BTF_ID_LIST(bpf_ctx_convert_btf_id)
BTF_ID(struct, bpf_ctx_convert)
-struct btf *btf_parse_vmlinux(void)
+static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name,
+ void *data, unsigned int data_size)
{
- struct btf_verifier_env *env = NULL;
- struct bpf_verifier_log *log;
struct btf *btf = NULL;
int err;
- env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
- if (!env)
- return ERR_PTR(-ENOMEM);
-
- log = &env->log;
- log->level = BPF_LOG_KERNEL;
+ if (!IS_ENABLED(CONFIG_DEBUG_INFO_BTF))
+ return ERR_PTR(-ENOENT);
btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
if (!btf) {
@@ -5907,10 +6179,10 @@ struct btf *btf_parse_vmlinux(void)
}
env->btf = btf;
- btf->data = __start_BTF;
- btf->data_size = __stop_BTF - __start_BTF;
+ btf->data = data;
+ btf->data_size = data_size;
btf->kernel_btf = true;
- snprintf(btf->name, sizeof(btf->name), "vmlinux");
+ snprintf(btf->name, sizeof(btf->name), "%s", name);
err = btf_parse_hdr(env);
if (err)
@@ -5930,22 +6202,11 @@ struct btf *btf_parse_vmlinux(void)
if (err)
goto errout;
- /* btf_parse_vmlinux() runs under bpf_verifier_lock */
- bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
-
- bpf_struct_ops_init(btf, log);
-
refcount_set(&btf->refcnt, 1);
- err = btf_alloc_id(btf);
- if (err)
- goto errout;
-
- btf_verifier_env_free(env);
return btf;
errout:
- btf_verifier_env_free(env);
if (btf) {
kvfree(btf->types);
kfree(btf);
@@ -5953,19 +6214,61 @@ errout:
return ERR_PTR(err);
}
+struct btf *btf_parse_vmlinux(void)
+{
+ struct btf_verifier_env *env = NULL;
+ struct bpf_verifier_log *log;
+ struct btf *btf;
+ int err;
+
+ env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
+ if (!env)
+ return ERR_PTR(-ENOMEM);
+
+ log = &env->log;
+ log->level = BPF_LOG_KERNEL;
+ btf = btf_parse_base(env, "vmlinux", __start_BTF, __stop_BTF - __start_BTF);
+ if (IS_ERR(btf))
+ goto err_out;
+
+ /* btf_parse_vmlinux() runs under bpf_verifier_lock */
+ bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
+ err = btf_alloc_id(btf);
+ if (err) {
+ btf_free(btf);
+ btf = ERR_PTR(err);
+ }
+err_out:
+ btf_verifier_env_free(env);
+ return btf;
+}
+
+/* If .BTF_ids section was created with distilled base BTF, both base and
+ * split BTF ids will need to be mapped to actual base/split ids for
+ * BTF now that it has been relocated.
+ */
+static __u32 btf_relocate_id(const struct btf *btf, __u32 id)
+{
+ if (!btf->base_btf || !btf->base_id_map)
+ return id;
+ return btf->base_id_map[id];
+}
+
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
-static struct btf *btf_parse_module(const char *module_name, const void *data, unsigned int data_size)
+static struct btf *btf_parse_module(const char *module_name, const void *data,
+ unsigned int data_size, void *base_data,
+ unsigned int base_data_size)
{
+ struct btf *btf = NULL, *vmlinux_btf, *base_btf = NULL;
struct btf_verifier_env *env = NULL;
struct bpf_verifier_log *log;
- struct btf *btf = NULL, *base_btf;
- int err;
+ int err = 0;
- base_btf = bpf_get_btf_vmlinux();
- if (IS_ERR(base_btf))
- return base_btf;
- if (!base_btf)
+ vmlinux_btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(vmlinux_btf))
+ return vmlinux_btf;
+ if (!vmlinux_btf)
return ERR_PTR(-EINVAL);
env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
@@ -5975,6 +6278,16 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u
log = &env->log;
log->level = BPF_LOG_KERNEL;
+ if (base_data) {
+ base_btf = btf_parse_base(env, ".BTF.base", base_data, base_data_size);
+ if (IS_ERR(base_btf)) {
+ err = PTR_ERR(base_btf);
+ goto errout;
+ }
+ } else {
+ base_btf = vmlinux_btf;
+ }
+
btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
if (!btf) {
err = -ENOMEM;
@@ -5988,12 +6301,11 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u
btf->kernel_btf = true;
snprintf(btf->name, sizeof(btf->name), "%s", module_name);
- btf->data = kvmalloc(data_size, GFP_KERNEL | __GFP_NOWARN);
+ btf->data = kvmemdup(data, data_size, GFP_KERNEL | __GFP_NOWARN);
if (!btf->data) {
err = -ENOMEM;
goto errout;
}
- memcpy(btf->data, data, data_size);
btf->data_size = data_size;
err = btf_parse_hdr(env);
@@ -6014,12 +6326,22 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u
if (err)
goto errout;
+ if (base_btf != vmlinux_btf) {
+ err = btf_relocate(btf, vmlinux_btf, &btf->base_id_map);
+ if (err)
+ goto errout;
+ btf_free(base_btf);
+ base_btf = vmlinux_btf;
+ }
+
btf_verifier_env_free(env);
refcount_set(&btf->refcnt, 1);
return btf;
errout:
btf_verifier_env_free(env);
+ if (!IS_ERR(base_btf) && base_btf != vmlinux_btf)
+ btf_free(base_btf);
if (btf) {
kvfree(btf->data);
kvfree(btf->types);
@@ -6092,6 +6414,123 @@ static bool prog_args_trusted(const struct bpf_prog *prog)
}
}
+int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto,
+ u32 arg_no)
+{
+ const struct btf_param *args;
+ const struct btf_type *t;
+ int off = 0, i;
+ u32 sz;
+
+ args = btf_params(func_proto);
+ for (i = 0; i < arg_no; i++) {
+ t = btf_type_by_id(btf, args[i].type);
+ t = btf_resolve_size(btf, t, &sz);
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+ off += roundup(sz, 8);
+ }
+
+ return off;
+}
+
+struct bpf_raw_tp_null_args {
+ const char *func;
+ u64 mask;
+};
+
+static const struct bpf_raw_tp_null_args raw_tp_null_args[] = {
+ /* sched */
+ { "sched_pi_setprio", 0x10 },
+ /* ... from sched_numa_pair_template event class */
+ { "sched_stick_numa", 0x100 },
+ { "sched_swap_numa", 0x100 },
+ /* afs */
+ { "afs_make_fs_call", 0x10 },
+ { "afs_make_fs_calli", 0x10 },
+ { "afs_make_fs_call1", 0x10 },
+ { "afs_make_fs_call2", 0x10 },
+ { "afs_protocol_error", 0x1 },
+ { "afs_flock_ev", 0x10 },
+ /* cachefiles */
+ { "cachefiles_lookup", 0x1 | 0x200 },
+ { "cachefiles_unlink", 0x1 },
+ { "cachefiles_rename", 0x1 },
+ { "cachefiles_prep_read", 0x1 },
+ { "cachefiles_mark_active", 0x1 },
+ { "cachefiles_mark_failed", 0x1 },
+ { "cachefiles_mark_inactive", 0x1 },
+ { "cachefiles_vfs_error", 0x1 },
+ { "cachefiles_io_error", 0x1 },
+ { "cachefiles_ondemand_open", 0x1 },
+ { "cachefiles_ondemand_copen", 0x1 },
+ { "cachefiles_ondemand_close", 0x1 },
+ { "cachefiles_ondemand_read", 0x1 },
+ { "cachefiles_ondemand_cread", 0x1 },
+ { "cachefiles_ondemand_fd_write", 0x1 },
+ { "cachefiles_ondemand_fd_release", 0x1 },
+ /* ext4, from ext4__mballoc event class */
+ { "ext4_mballoc_discard", 0x10 },
+ { "ext4_mballoc_free", 0x10 },
+ /* fib */
+ { "fib_table_lookup", 0x100 },
+ /* filelock */
+ /* ... from filelock_lock event class */
+ { "posix_lock_inode", 0x10 },
+ { "fcntl_setlk", 0x10 },
+ { "locks_remove_posix", 0x10 },
+ { "flock_lock_inode", 0x10 },
+ /* ... from filelock_lease event class */
+ { "break_lease_noblock", 0x10 },
+ { "break_lease_block", 0x10 },
+ { "break_lease_unblock", 0x10 },
+ { "generic_delete_lease", 0x10 },
+ { "time_out_leases", 0x10 },
+ /* host1x */
+ { "host1x_cdma_push_gather", 0x10000 },
+ /* huge_memory */
+ { "mm_khugepaged_scan_pmd", 0x10 },
+ { "mm_collapse_huge_page_isolate", 0x1 },
+ { "mm_khugepaged_scan_file", 0x10 },
+ { "mm_khugepaged_collapse_file", 0x10 },
+ /* kmem */
+ { "mm_page_alloc", 0x1 },
+ { "mm_page_pcpu_drain", 0x1 },
+ /* .. from mm_page event class */
+ { "mm_page_alloc_zone_locked", 0x1 },
+ /* netfs */
+ { "netfs_failure", 0x10 },
+ /* power */
+ { "device_pm_callback_start", 0x10 },
+ /* qdisc */
+ { "qdisc_dequeue", 0x1000 },
+ /* rxrpc */
+ { "rxrpc_recvdata", 0x1 },
+ { "rxrpc_resend", 0x10 },
+ /* skb */
+ {"kfree_skb", 0x1000},
+ /* sunrpc */
+ { "xs_stream_read_data", 0x1 },
+ /* ... from xprt_cong_event event class */
+ { "xprt_reserve_cong", 0x10 },
+ { "xprt_release_cong", 0x10 },
+ { "xprt_get_cong", 0x10 },
+ { "xprt_put_cong", 0x10 },
+ /* tcp */
+ { "tcp_send_reset", 0x11 },
+ /* tegra_apb_dma */
+ { "tegra_dma_tx_status", 0x100 },
+ /* timer_migration */
+ { "tmigr_update_events", 0x1 },
+ /* writeback, from writeback_folio_template event class */
+ { "writeback_dirty_folio", 0x10 },
+ { "folio_wait_writeback", 0x10 },
+ /* rdma */
+ { "mr_integ_alloc", 0x2000 },
+ /* bpf_testmod */
+ { "bpf_testmod_test_read", 0x0 },
+};
+
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
@@ -6102,6 +6541,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const char *tname = prog->aux->attach_func_name;
struct bpf_verifier_log *log = info->log;
const struct btf_param *args;
+ bool ptr_err_raw_tp = false;
const char *tag_value;
u32 nr_args, arg;
int i, ret;
@@ -6131,8 +6571,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
if (arg == nr_args) {
switch (prog->expected_attach_type) {
- case BPF_LSM_CGROUP:
case BPF_LSM_MAC:
+ /* mark we are accessing the return value */
+ info->is_retval = true;
+ fallthrough;
+ case BPF_LSM_CGROUP:
case BPF_TRACE_FEXIT:
/* When LSM programs are attached to void LSM hooks
* they use FEXIT trampolines and when attached to
@@ -6193,6 +6636,12 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
return false;
}
+ if (size != sizeof(u64)) {
+ bpf_log(log, "func '%s' size %d must be 8\n",
+ tname, size);
+ return false;
+ }
+
/* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */
for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
@@ -6228,7 +6677,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
}
info->reg_type = ctx_arg_info->reg_type;
- info->btf = btf_vmlinux;
+ info->btf = ctx_arg_info->btf ? : btf_vmlinux;
info->btf_id = ctx_arg_info->btf_id;
return true;
}
@@ -6238,6 +6687,42 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
if (prog_args_trusted(prog))
info->reg_type |= PTR_TRUSTED;
+ if (btf_param_match_suffix(btf, &args[arg], "__nullable"))
+ info->reg_type |= PTR_MAYBE_NULL;
+
+ if (prog->expected_attach_type == BPF_TRACE_RAW_TP) {
+ struct btf *btf = prog->aux->attach_btf;
+ const struct btf_type *t;
+ const char *tname;
+
+ /* BTF lookups cannot fail, return false on error */
+ t = btf_type_by_id(btf, prog->aux->attach_btf_id);
+ if (!t)
+ return false;
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (!tname)
+ return false;
+ /* Checked by bpf_check_attach_target */
+ tname += sizeof("btf_trace_") - 1;
+ for (i = 0; i < ARRAY_SIZE(raw_tp_null_args); i++) {
+ /* Is this a func with potential NULL args? */
+ if (strcmp(tname, raw_tp_null_args[i].func))
+ continue;
+ if (raw_tp_null_args[i].mask & (0x1 << (arg * 4)))
+ info->reg_type |= PTR_MAYBE_NULL;
+ /* Is the current arg IS_ERR? */
+ if (raw_tp_null_args[i].mask & (0x2 << (arg * 4)))
+ ptr_err_raw_tp = true;
+ break;
+ }
+ /* If we don't know NULL-ness specification and the tracepoint
+ * is coming from a loadable module, be conservative and mark
+ * argument as PTR_MAYBE_NULL.
+ */
+ if (i == ARRAY_SIZE(raw_tp_null_args) && btf_is_module(btf))
+ info->reg_type |= PTR_MAYBE_NULL;
+ }
+
if (tgt_prog) {
enum bpf_prog_type tgt_type;
@@ -6282,8 +6767,18 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n",
tname, arg, info->btf_id, btf_type_str(t),
__btf_name_by_offset(btf, t->name_off));
+
+ /* Perform all checks on the validity of type for this argument, but if
+ * we know it can be IS_ERR at runtime, scrub pointer type and mark as
+ * scalar.
+ */
+ if (ptr_err_raw_tp) {
+ bpf_log(log, "marking pointer arg%d as scalar as it may encode error", arg);
+ info->reg_type = SCALAR_VALUE;
+ }
return true;
}
+EXPORT_SYMBOL_GPL(btf_ctx_access);
enum bpf_struct_walk_result {
/* < 0 error */
@@ -6575,7 +7070,7 @@ int btf_struct_access(struct bpf_verifier_log *log,
for (i = 0; i < rec->cnt; i++) {
struct btf_field *field = &rec->fields[i];
u32 offset = field->offset;
- if (off < offset + btf_field_type_size(field->type) && offset < off + size) {
+ if (off < offset + field->size && offset < off + size) {
bpf_log(log,
"direct access to %s is disallowed\n",
btf_field_type_name(field->type));
@@ -6946,6 +7441,81 @@ static bool btf_is_dynptr_ptr(const struct btf *btf, const struct btf_type *t)
return false;
}
+struct bpf_cand_cache {
+ const char *name;
+ u32 name_len;
+ u16 kind;
+ u16 cnt;
+ struct {
+ const struct btf *btf;
+ u32 id;
+ } cands[];
+};
+
+static DEFINE_MUTEX(cand_cache_mutex);
+
+static struct bpf_cand_cache *
+bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id);
+
+static int btf_get_ptr_to_btf_id(struct bpf_verifier_log *log, int arg_idx,
+ const struct btf *btf, const struct btf_type *t)
+{
+ struct bpf_cand_cache *cc;
+ struct bpf_core_ctx ctx = {
+ .btf = btf,
+ .log = log,
+ };
+ u32 kern_type_id, type_id;
+ int err = 0;
+
+ /* skip PTR and modifiers */
+ type_id = t->type;
+ t = btf_type_by_id(btf, t->type);
+ while (btf_type_is_modifier(t)) {
+ type_id = t->type;
+ t = btf_type_by_id(btf, t->type);
+ }
+
+ mutex_lock(&cand_cache_mutex);
+ cc = bpf_core_find_cands(&ctx, type_id);
+ if (IS_ERR(cc)) {
+ err = PTR_ERR(cc);
+ bpf_log(log, "arg#%d reference type('%s %s') candidate matching error: %d\n",
+ arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off),
+ err);
+ goto cand_cache_unlock;
+ }
+ if (cc->cnt != 1) {
+ bpf_log(log, "arg#%d reference type('%s %s') %s\n",
+ arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off),
+ cc->cnt == 0 ? "has no matches" : "is ambiguous");
+ err = cc->cnt == 0 ? -ENOENT : -ESRCH;
+ goto cand_cache_unlock;
+ }
+ if (btf_is_module(cc->cands[0].btf)) {
+ bpf_log(log, "arg#%d reference type('%s %s') points to kernel module type (unsupported)\n",
+ arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off));
+ err = -EOPNOTSUPP;
+ goto cand_cache_unlock;
+ }
+ kern_type_id = cc->cands[0].id;
+
+cand_cache_unlock:
+ mutex_unlock(&cand_cache_mutex);
+ if (err)
+ return err;
+
+ return kern_type_id;
+}
+
+enum btf_arg_tag {
+ ARG_TAG_CTX = BIT_ULL(0),
+ ARG_TAG_NONNULL = BIT_ULL(1),
+ ARG_TAG_TRUSTED = BIT_ULL(2),
+ ARG_TAG_NULLABLE = BIT_ULL(3),
+ ARG_TAG_ARENA = BIT_ULL(4),
+};
+
/* Process BTF of a function to produce high-level expectation of function
* arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information
* is cached in subprog info for reuse.
@@ -7009,6 +7579,8 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
args = (const struct btf_param *)(t + 1);
nargs = btf_type_vlen(t);
if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+ if (!is_global)
+ return -EINVAL;
bpf_log(log, "Global function %s() with %d > %d args. Buggy compiler.\n",
tname, nargs, MAX_BPF_FUNC_REG_ARGS);
return -EINVAL;
@@ -7018,6 +7590,8 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
if (!btf_type_is_int(t) && !btf_is_any_enum(t)) {
+ if (!is_global)
+ return -EINVAL;
bpf_log(log,
"Global function %s() doesn't return scalar. Only those are supported.\n",
tname);
@@ -7027,92 +7601,134 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
* Only PTR_TO_CTX and SCALAR are supported atm.
*/
for (i = 0; i < nargs; i++) {
- bool is_nonnull = false;
- const char *tag;
+ u32 tags = 0;
+ int id = 0;
- t = btf_type_by_id(btf, args[i].type);
-
- tag = btf_find_decl_tag_value(btf, fn_t, i, "arg:");
- if (IS_ERR(tag) && PTR_ERR(tag) == -ENOENT) {
- tag = NULL;
- } else if (IS_ERR(tag)) {
- bpf_log(log, "arg#%d type's tag fetching failure: %ld\n", i, PTR_ERR(tag));
- return PTR_ERR(tag);
- }
/* 'arg:<tag>' decl_tag takes precedence over derivation of
* register type from BTF type itself
*/
- if (tag) {
+ while ((id = btf_find_next_decl_tag(btf, fn_t, i, "arg:", id)) > 0) {
+ const struct btf_type *tag_t = btf_type_by_id(btf, id);
+ const char *tag = __btf_name_by_offset(btf, tag_t->name_off) + 4;
+
/* disallow arg tags in static subprogs */
if (!is_global) {
bpf_log(log, "arg#%d type tag is not supported in static functions\n", i);
return -EOPNOTSUPP;
}
+
if (strcmp(tag, "ctx") == 0) {
- sub->args[i].arg_type = ARG_PTR_TO_CTX;
- continue;
+ tags |= ARG_TAG_CTX;
+ } else if (strcmp(tag, "trusted") == 0) {
+ tags |= ARG_TAG_TRUSTED;
+ } else if (strcmp(tag, "nonnull") == 0) {
+ tags |= ARG_TAG_NONNULL;
+ } else if (strcmp(tag, "nullable") == 0) {
+ tags |= ARG_TAG_NULLABLE;
+ } else if (strcmp(tag, "arena") == 0) {
+ tags |= ARG_TAG_ARENA;
+ } else {
+ bpf_log(log, "arg#%d has unsupported set of tags\n", i);
+ return -EOPNOTSUPP;
}
- if (strcmp(tag, "nonnull") == 0)
- is_nonnull = true;
+ }
+ if (id != -ENOENT) {
+ bpf_log(log, "arg#%d type tag fetching failure: %d\n", i, id);
+ return id;
}
+ t = btf_type_by_id(btf, args[i].type);
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (btf_type_is_int(t) || btf_is_any_enum(t)) {
- sub->args[i].arg_type = ARG_ANYTHING;
- continue;
- }
- if (btf_type_is_ptr(t) && btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
+ if (!btf_type_is_ptr(t))
+ goto skip_pointer;
+
+ if ((tags & ARG_TAG_CTX) || btf_is_prog_ctx_type(log, btf, t, prog_type, i)) {
+ if (tags & ~ARG_TAG_CTX) {
+ bpf_log(log, "arg#%d has invalid combination of tags\n", i);
+ return -EINVAL;
+ }
+ if ((tags & ARG_TAG_CTX) &&
+ btf_validate_prog_ctx_type(log, btf, t, i, prog_type,
+ prog->expected_attach_type))
+ return -EINVAL;
sub->args[i].arg_type = ARG_PTR_TO_CTX;
continue;
}
- if (btf_type_is_ptr(t) && btf_is_dynptr_ptr(btf, t)) {
+ if (btf_is_dynptr_ptr(btf, t)) {
+ if (tags) {
+ bpf_log(log, "arg#%d has invalid combination of tags\n", i);
+ return -EINVAL;
+ }
sub->args[i].arg_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY;
continue;
}
- if (is_global && btf_type_is_ptr(t)) {
+ if (tags & ARG_TAG_TRUSTED) {
+ int kern_type_id;
+
+ if (tags & ARG_TAG_NONNULL) {
+ bpf_log(log, "arg#%d has invalid combination of tags\n", i);
+ return -EINVAL;
+ }
+
+ kern_type_id = btf_get_ptr_to_btf_id(log, i, btf, t);
+ if (kern_type_id < 0)
+ return kern_type_id;
+
+ sub->args[i].arg_type = ARG_PTR_TO_BTF_ID | PTR_TRUSTED;
+ if (tags & ARG_TAG_NULLABLE)
+ sub->args[i].arg_type |= PTR_MAYBE_NULL;
+ sub->args[i].btf_id = kern_type_id;
+ continue;
+ }
+ if (tags & ARG_TAG_ARENA) {
+ if (tags & ~ARG_TAG_ARENA) {
+ bpf_log(log, "arg#%d arena cannot be combined with any other tags\n", i);
+ return -EINVAL;
+ }
+ sub->args[i].arg_type = ARG_PTR_TO_ARENA;
+ continue;
+ }
+ if (is_global) { /* generic user data pointer */
u32 mem_size;
+ if (tags & ARG_TAG_NULLABLE) {
+ bpf_log(log, "arg#%d has invalid combination of tags\n", i);
+ return -EINVAL;
+ }
+
t = btf_type_skip_modifiers(btf, t->type, NULL);
ref_t = btf_resolve_size(btf, t, &mem_size);
if (IS_ERR(ref_t)) {
- bpf_log(log,
- "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
- i, btf_type_str(t), btf_name_by_offset(btf, t->name_off),
+ bpf_log(log, "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
+ i, btf_type_str(t), btf_name_by_offset(btf, t->name_off),
PTR_ERR(ref_t));
return -EINVAL;
}
- sub->args[i].arg_type = is_nonnull ? ARG_PTR_TO_MEM : ARG_PTR_TO_MEM_OR_NULL;
+ sub->args[i].arg_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL;
+ if (tags & ARG_TAG_NONNULL)
+ sub->args[i].arg_type &= ~PTR_MAYBE_NULL;
sub->args[i].mem_size = mem_size;
continue;
}
- if (is_nonnull) {
- bpf_log(log, "arg#%d marked as non-null, but is not a pointer type\n", i);
+
+skip_pointer:
+ if (tags) {
+ bpf_log(log, "arg#%d has pointer tag, but is not a pointer type\n", i);
return -EINVAL;
}
+ if (btf_type_is_int(t) || btf_is_any_enum(t)) {
+ sub->args[i].arg_type = ARG_ANYTHING;
+ continue;
+ }
+ if (!is_global)
+ return -EINVAL;
bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n",
i, btf_type_str(t), tname);
return -EINVAL;
}
- for (i = 0; i < nargs; i++) {
- const char *tag;
-
- if (sub->args[i].arg_type != ARG_PTR_TO_CTX)
- continue;
-
- /* check if arg has "arg:ctx" tag */
- t = btf_type_by_id(btf, args[i].type);
- tag = btf_find_decl_tag_value(btf, fn_t, i, "arg:");
- if (IS_ERR_OR_NULL(tag) || strcmp(tag, "ctx") != 0)
- continue;
-
- if (btf_validate_prog_ctx_type(log, btf, t, i, prog_type,
- prog->expected_attach_type))
- return -EINVAL;
- }
-
sub->arg_cnt = nargs;
sub->args_cached = true;
@@ -7131,8 +7747,8 @@ static void btf_type_show(const struct btf *btf, u32 type_id, void *obj,
btf_type_ops(t)->show(btf, t, type_id, obj, 0, show);
}
-static void btf_seq_show(struct btf_show *show, const char *fmt,
- va_list args)
+__printf(2, 0) static void btf_seq_show(struct btf_show *show, const char *fmt,
+ va_list args)
{
seq_vprintf((struct seq_file *)show->target, fmt, args);
}
@@ -7165,8 +7781,8 @@ struct btf_show_snprintf {
int len; /* length we would have written */
};
-static void btf_snprintf_show(struct btf_show *show, const char *fmt,
- va_list args)
+__printf(2, 0) static void btf_snprintf_show(struct btf_show *show, const char *fmt,
+ va_list args)
{
struct btf_show_snprintf *ssnprintf = (struct btf_show_snprintf *)show;
int len;
@@ -7266,21 +7882,11 @@ int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
struct btf *btf_get_by_fd(int fd)
{
struct btf *btf;
- struct fd f;
-
- f = fdget(fd);
-
- if (!f.file)
- return ERR_PTR(-EBADF);
+ CLASS(fd, f)(fd);
- if (f.file->f_op != &btf_fops) {
- fdput(f);
- return ERR_PTR(-EINVAL);
- }
-
- btf = f.file->private_data;
- refcount_inc(&btf->refcnt);
- fdput(f);
+ btf = __btf_get_by_fd(f);
+ if (!IS_ERR(btf))
+ refcount_inc(&btf->refcnt);
return btf;
}
@@ -7397,17 +8003,6 @@ struct btf_module {
static LIST_HEAD(btf_modules);
static DEFINE_MUTEX(btf_module_mutex);
-static ssize_t
-btf_module_read(struct file *file, struct kobject *kobj,
- struct bin_attribute *bin_attr,
- char *buf, loff_t off, size_t len)
-{
- const struct btf *btf = bin_attr->private;
-
- memcpy(buf, btf->data + off, len);
- return len;
-}
-
static void purge_cand_cache(struct btf *btf);
static int btf_module_notify(struct notifier_block *nb, unsigned long op,
@@ -7430,7 +8025,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
err = -ENOMEM;
goto out;
}
- btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size);
+ btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size,
+ mod->btf_base_data, mod->btf_base_data_size);
if (IS_ERR(btf)) {
kfree(btf_mod);
if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) {
@@ -7467,8 +8063,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
attr->attr.name = btf->name;
attr->attr.mode = 0444;
attr->size = btf->data_size;
- attr->private = btf;
- attr->read = btf_module_read;
+ attr->private = btf->data;
+ attr->read_new = sysfs_bin_attr_simple_read;
err = sysfs_create_bin_file(btf_kobj, attr);
if (err) {
@@ -7589,6 +8185,17 @@ static struct btf *btf_get_module_btf(const struct module *module)
return btf;
}
+static int check_btf_kconfigs(const struct module *module, const char *feature)
+{
+ if (!module && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
+ pr_err("missing vmlinux BTF, cannot register %s\n", feature);
+ return -ENOENT;
+ }
+ if (module && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
+ pr_warn("missing module BTF, cannot register %s\n", feature);
+ return 0;
+}
+
BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags)
{
struct btf *btf = NULL;
@@ -7630,15 +8237,44 @@ BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)
BTF_TRACING_TYPE_xxx
#undef BTF_TRACING_TYPE
+/* Validate well-formedness of iter argument type.
+ * On success, return positive BTF ID of iter state's STRUCT type.
+ * On error, negative error is returned.
+ */
+int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx)
+{
+ const struct btf_param *arg;
+ const struct btf_type *t;
+ const char *name;
+ int btf_id;
+
+ if (btf_type_vlen(func) <= arg_idx)
+ return -EINVAL;
+
+ arg = &btf_params(func)[arg_idx];
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!t || !btf_type_is_ptr(t))
+ return -EINVAL;
+ t = btf_type_skip_modifiers(btf, t->type, &btf_id);
+ if (!t || !__btf_type_is_struct(t))
+ return -EINVAL;
+
+ name = btf_name_by_offset(btf, t->name_off);
+ if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1))
+ return -EINVAL;
+
+ return btf_id;
+}
+
static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name,
const struct btf_type *func, u32 func_flags)
{
u32 flags = func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
- const char *name, *sfx, *iter_name;
- const struct btf_param *arg;
+ const char *sfx, *iter_name;
const struct btf_type *t;
char exp_name[128];
u32 nr_args;
+ int btf_id;
/* exactly one of KF_ITER_{NEW,NEXT,DESTROY} can be set */
if (!flags || (flags & (flags - 1)))
@@ -7649,28 +8285,21 @@ static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name,
if (nr_args < 1)
return -EINVAL;
- arg = &btf_params(func)[0];
- t = btf_type_skip_modifiers(btf, arg->type, NULL);
- if (!t || !btf_type_is_ptr(t))
- return -EINVAL;
- t = btf_type_skip_modifiers(btf, t->type, NULL);
- if (!t || !__btf_type_is_struct(t))
- return -EINVAL;
-
- name = btf_name_by_offset(btf, t->name_off);
- if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1))
- return -EINVAL;
+ btf_id = btf_check_iter_arg(btf, func, 0);
+ if (btf_id < 0)
+ return btf_id;
/* sizeof(struct bpf_iter_<type>) should be a multiple of 8 to
* fit nicely in stack slots
*/
+ t = btf_type_by_id(btf, btf_id);
if (t->size == 0 || (t->size % 8))
return -EINVAL;
/* validate bpf_iter_<type>_{new,next,destroy}(struct bpf_iter_<type> *)
* naming pattern
*/
- iter_name = name + sizeof(ITER_PREFIX) - 1;
+ iter_name = btf_name_by_offset(btf, t->name_off) + sizeof(ITER_PREFIX) - 1;
if (flags & KF_ITER_NEW)
sfx = "new";
else if (flags & KF_ITER_NEXT)
@@ -7743,7 +8372,7 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
bool add_filter = !!kset->filter;
struct btf_kfunc_set_tab *tab;
struct btf_id_set8 *set;
- u32 set_cnt;
+ u32 set_cnt, i;
int ret;
if (hook >= BTF_KFUNC_HOOK_MAX) {
@@ -7789,21 +8418,15 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
goto end;
}
- /* We don't need to allocate, concatenate, and sort module sets, because
- * only one is allowed per hook. Hence, we can directly assign the
- * pointer and return.
- */
- if (!vmlinux_set) {
- tab->sets[hook] = add_set;
- goto do_add_filter;
- }
-
/* In case of vmlinux sets, there may be more than one set being
* registered per hook. To create a unified set, we allocate a new set
* and concatenate all individual sets being registered. While each set
* is individually sorted, they may become unsorted when concatenated,
* hence re-sorting the final set again is required to make binary
* searching the set using btf_id_set8_contains function work.
+ *
+ * For module sets, we need to allocate as we may need to relocate
+ * BTF ids.
*/
set_cnt = set ? set->cnt : 0;
@@ -7833,11 +8456,14 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
/* Concatenate the two sets */
memcpy(set->pairs + set->cnt, add_set->pairs, add_set->cnt * sizeof(set->pairs[0]));
+ /* Now that the set is copied, update with relocated BTF ids */
+ for (i = set->cnt; i < set->cnt + add_set->cnt; i++)
+ set->pairs[i].id = btf_relocate_id(btf, set->pairs[i].id);
+
set->cnt += add_set->cnt;
sort(set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func, NULL);
-do_add_filter:
if (add_filter) {
hook_filter = &tab->hook_filters[hook];
hook_filter->filters[hook_filter->nr_filters++] = kset->filter;
@@ -7888,13 +8514,19 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
case BPF_PROG_TYPE_STRUCT_OPS:
return BTF_KFUNC_HOOK_STRUCT_OPS;
case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_LSM:
return BTF_KFUNC_HOOK_TRACING;
case BPF_PROG_TYPE_SYSCALL:
return BTF_KFUNC_HOOK_SYSCALL;
case BPF_PROG_TYPE_CGROUP_SKB:
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
- return BTF_KFUNC_HOOK_CGROUP_SKB;
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ return BTF_KFUNC_HOOK_CGROUP;
case BPF_PROG_TYPE_SCHED_ACT:
return BTF_KFUNC_HOOK_SCHED_ACT;
case BPF_PROG_TYPE_SK_SKB:
@@ -7908,6 +8540,8 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
return BTF_KFUNC_HOOK_LWT;
case BPF_PROG_TYPE_NETFILTER:
return BTF_KFUNC_HOOK_NETFILTER;
+ case BPF_PROG_TYPE_KPROBE:
+ return BTF_KFUNC_HOOK_KPROBE;
default:
return BTF_KFUNC_HOOK_MAX;
}
@@ -7949,20 +8583,13 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
int ret, i;
btf = btf_get_module_btf(kset->owner);
- if (!btf) {
- if (!kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
- pr_err("missing vmlinux BTF, cannot register kfuncs\n");
- return -ENOENT;
- }
- if (kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
- pr_warn("missing module BTF, cannot register kfuncs\n");
- return 0;
- }
+ if (!btf)
+ return check_btf_kconfigs(kset->owner, "kfunc");
if (IS_ERR(btf))
return PTR_ERR(btf);
for (i = 0; i < kset->set->cnt; i++) {
- ret = btf_check_kfunc_protos(btf, kset->set->pairs[i].id,
+ ret = btf_check_kfunc_protos(btf, btf_relocate_id(btf, kset->set->pairs[i].id),
kset->set->pairs[i].flags);
if (ret)
goto err_out;
@@ -7981,6 +8608,14 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
{
enum btf_kfunc_hook hook;
+ /* All kfuncs need to be tagged as such in BTF.
+ * WARN() for initcall registrations that do not check errors.
+ */
+ if (!(kset->set->flags & BTF_SET8_KFUNCS)) {
+ WARN_ON(!kset->owner);
+ return -EINVAL;
+ }
+
hook = bpf_prog_type_to_kfunc_hook(prog_type);
return __register_btf_kfunc_id_set(hook, kset);
}
@@ -8018,7 +8653,7 @@ static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc
u32 nr_args, i;
for (i = 0; i < cnt; i++) {
- dtor_btf_id = dtors[i].kfunc_btf_id;
+ dtor_btf_id = btf_relocate_id(btf, dtors[i].kfunc_btf_id);
dtor_func = btf_type_by_id(btf, dtor_btf_id);
if (!dtor_func || !btf_type_is_func(dtor_func))
@@ -8053,21 +8688,12 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c
{
struct btf_id_dtor_kfunc_tab *tab;
struct btf *btf;
- u32 tab_cnt;
+ u32 tab_cnt, i;
int ret;
btf = btf_get_module_btf(owner);
- if (!btf) {
- if (!owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
- pr_err("missing vmlinux BTF, cannot register dtor kfuncs\n");
- return -ENOENT;
- }
- if (owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) {
- pr_err("missing module BTF, cannot register dtor kfuncs\n");
- return -ENOENT;
- }
- return 0;
- }
+ if (!btf)
+ return check_btf_kconfigs(owner, "dtor kfuncs");
if (IS_ERR(btf))
return PTR_ERR(btf);
@@ -8113,6 +8739,13 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c
btf->dtor_kfunc_tab = tab;
memcpy(tab->dtors + tab->cnt, dtors, add_cnt * sizeof(tab->dtors[0]));
+
+ /* remap BTF ids based on BTF relocation (if any) */
+ for (i = tab_cnt; i < tab_cnt + add_cnt; i++) {
+ tab->dtors[i].btf_id = btf_relocate_id(btf, tab->dtors[i].btf_id);
+ tab->dtors[i].kfunc_btf_id = btf_relocate_id(btf, tab->dtors[i].kfunc_btf_id);
+ }
+
tab->cnt += add_cnt;
sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL);
@@ -8182,17 +8815,6 @@ size_t bpf_core_essential_name_len(const char *name)
return n;
}
-struct bpf_cand_cache {
- const char *name;
- u32 name_len;
- u16 kind;
- u16 cnt;
- struct {
- const struct btf *btf;
- u32 id;
- } cands[];
-};
-
static void bpf_free_cands(struct bpf_cand_cache *cands)
{
if (!cands->cnt)
@@ -8213,8 +8835,6 @@ static struct bpf_cand_cache *vmlinux_cand_cache[VMLINUX_CAND_CACHE_SIZE];
#define MODULE_CAND_CACHE_SIZE 31
static struct bpf_cand_cache *module_cand_cache[MODULE_CAND_CACHE_SIZE];
-static DEFINE_MUTEX(cand_cache_mutex);
-
static void __print_cand_cache(struct bpf_verifier_log *log,
struct bpf_cand_cache **cache,
int cache_size)
@@ -8482,6 +9102,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
struct bpf_core_cand_list cands = {};
struct bpf_core_relo_res targ_res;
struct bpf_core_spec *specs;
+ const struct btf_type *type;
int err;
/* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5"
@@ -8491,6 +9112,14 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
if (!specs)
return -ENOMEM;
+ type = btf_type_by_id(ctx->btf, relo->type_id);
+ if (!type) {
+ bpf_log(ctx->log, "relo #%u: bad type id %u\n",
+ relo_idx, relo->type_id);
+ kfree(specs);
+ return -EINVAL;
+ }
+
if (need_cands) {
struct bpf_cand_cache *cc;
int i;
@@ -8645,3 +9274,141 @@ bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log,
return !strncmp(reg_name, arg_name, cmp_len);
}
+
+#ifdef CONFIG_BPF_JIT
+static int
+btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops,
+ struct bpf_verifier_log *log)
+{
+ struct btf_struct_ops_tab *tab, *new_tab;
+ int i, err;
+
+ tab = btf->struct_ops_tab;
+ if (!tab) {
+ tab = kzalloc(offsetof(struct btf_struct_ops_tab, ops[4]),
+ GFP_KERNEL);
+ if (!tab)
+ return -ENOMEM;
+ tab->capacity = 4;
+ btf->struct_ops_tab = tab;
+ }
+
+ for (i = 0; i < tab->cnt; i++)
+ if (tab->ops[i].st_ops == st_ops)
+ return -EEXIST;
+
+ if (tab->cnt == tab->capacity) {
+ new_tab = krealloc(tab,
+ offsetof(struct btf_struct_ops_tab,
+ ops[tab->capacity * 2]),
+ GFP_KERNEL);
+ if (!new_tab)
+ return -ENOMEM;
+ tab = new_tab;
+ tab->capacity *= 2;
+ btf->struct_ops_tab = tab;
+ }
+
+ tab->ops[btf->struct_ops_tab->cnt].st_ops = st_ops;
+
+ err = bpf_struct_ops_desc_init(&tab->ops[btf->struct_ops_tab->cnt], btf, log);
+ if (err)
+ return err;
+
+ btf->struct_ops_tab->cnt++;
+
+ return 0;
+}
+
+const struct bpf_struct_ops_desc *
+bpf_struct_ops_find_value(struct btf *btf, u32 value_id)
+{
+ const struct bpf_struct_ops_desc *st_ops_list;
+ unsigned int i;
+ u32 cnt;
+
+ if (!value_id)
+ return NULL;
+ if (!btf->struct_ops_tab)
+ return NULL;
+
+ cnt = btf->struct_ops_tab->cnt;
+ st_ops_list = btf->struct_ops_tab->ops;
+ for (i = 0; i < cnt; i++) {
+ if (st_ops_list[i].value_id == value_id)
+ return &st_ops_list[i];
+ }
+
+ return NULL;
+}
+
+const struct bpf_struct_ops_desc *
+bpf_struct_ops_find(struct btf *btf, u32 type_id)
+{
+ const struct bpf_struct_ops_desc *st_ops_list;
+ unsigned int i;
+ u32 cnt;
+
+ if (!type_id)
+ return NULL;
+ if (!btf->struct_ops_tab)
+ return NULL;
+
+ cnt = btf->struct_ops_tab->cnt;
+ st_ops_list = btf->struct_ops_tab->ops;
+ for (i = 0; i < cnt; i++) {
+ if (st_ops_list[i].type_id == type_id)
+ return &st_ops_list[i];
+ }
+
+ return NULL;
+}
+
+int __register_bpf_struct_ops(struct bpf_struct_ops *st_ops)
+{
+ struct bpf_verifier_log *log;
+ struct btf *btf;
+ int err = 0;
+
+ btf = btf_get_module_btf(st_ops->owner);
+ if (!btf)
+ return check_btf_kconfigs(st_ops->owner, "struct_ops");
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+
+ log = kzalloc(sizeof(*log), GFP_KERNEL | __GFP_NOWARN);
+ if (!log) {
+ err = -ENOMEM;
+ goto errout;
+ }
+
+ log->level = BPF_LOG_KERNEL;
+
+ err = btf_add_struct_ops(btf, st_ops, log);
+
+errout:
+ kfree(log);
+ btf_put(btf);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(__register_bpf_struct_ops);
+#endif
+
+bool btf_param_match_suffix(const struct btf *btf,
+ const struct btf_param *arg,
+ const char *suffix)
+{
+ int suffix_len = strlen(suffix), len;
+ const char *param_name;
+
+ /* In the future, this can be ported to use BTF tagging */
+ param_name = btf_name_by_offset(btf, arg->name_off);
+ if (str_is_empty(param_name))
+ return false;
+ len = strlen(param_name);
+ if (len <= suffix_len)
+ return false;
+ param_name += len - suffix_len;
+ return !strncmp(param_name, suffix, suffix_len);
+}
diff --git a/kernel/bpf/btf_iter.c b/kernel/bpf/btf_iter.c
new file mode 100644
index 000000000000..0e2c66a52df9
--- /dev/null
+++ b/kernel/bpf/btf_iter.c
@@ -0,0 +1,2 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+#include "../../tools/lib/bpf/btf_iter.c"
diff --git a/kernel/bpf/btf_relocate.c b/kernel/bpf/btf_relocate.c
new file mode 100644
index 000000000000..c12ccbf66507
--- /dev/null
+++ b/kernel/bpf/btf_relocate.c
@@ -0,0 +1,2 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+#include "../../tools/lib/bpf/btf_relocate.c"
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 491d20038cbe..46e5db65dbc8 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -24,6 +24,23 @@
DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
+/*
+ * cgroup bpf destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions. Use a separate workqueue so that cgroup bpf
+ * destruction work items don't end up filling up max_active of system_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_bpf_destroy_wq;
+
+static int __init cgroup_bpf_wq_init(void)
+{
+ cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1);
+ if (!cgroup_bpf_destroy_wq)
+ panic("Failed to alloc workqueue for cgroup bpf destroy.\n");
+ return 0;
+}
+core_initcall(cgroup_bpf_wq_init);
+
/* __always_inline is necessary to prevent indirect call through run_prog
* function pointer.
*/
@@ -334,7 +351,7 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref)
struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
- queue_work(system_wq, &cgrp->bpf.release_work);
+ queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work);
}
/* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through
@@ -1358,15 +1375,12 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
struct sk_buff *skb,
enum cgroup_bpf_attach_type atype)
{
- unsigned int offset = skb->data - skb_network_header(skb);
+ unsigned int offset = -skb_network_offset(skb);
struct sock *save_sk;
void *saved_data_end;
struct cgroup *cgrp;
int ret;
- if (!sk || !sk_fullsock(sk))
- return 0;
-
if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
return 0;
@@ -1630,7 +1644,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_base_func_proto(func_id, prog);
}
}
@@ -1694,7 +1708,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
* returned value != 1 during execution. In all other cases 0 is returned.
*/
int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
- struct ctl_table *table, int write,
+ const struct ctl_table *table, int write,
char **buf, size_t *pcount, loff_t *ppos,
enum cgroup_bpf_attach_type atype)
{
@@ -2191,7 +2205,7 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_base_func_proto(func_id, prog);
}
}
@@ -2348,7 +2362,7 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_base_func_proto(func_id, prog);
}
}
@@ -2578,14 +2592,14 @@ cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
switch (func_id) {
case BPF_FUNC_get_current_uid_gid:
return &bpf_get_current_uid_gid_proto;
- case BPF_FUNC_get_current_pid_tgid:
- return &bpf_get_current_pid_tgid_proto;
case BPF_FUNC_get_current_comm:
return &bpf_get_current_comm_proto;
#ifdef CONFIG_CGROUP_NET_CLASSID
case BPF_FUNC_get_cgroup_classid:
return &bpf_get_cgroup_classid_curr_proto;
#endif
+ case BPF_FUNC_current_task_under_cgroup:
+ return &bpf_current_task_under_cgroup_proto;
default:
return NULL;
}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ea6843be2616..da729cbbaeb9 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -21,11 +21,11 @@
#include <linux/filter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
-#include <linux/random.h>
-#include <linux/moduleloader.h>
+#include <linux/prandom.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/objtool.h>
+#include <linux/overflow.h>
#include <linux/rbtree_latch.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
@@ -37,9 +37,10 @@
#include <linux/nospec.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/memcontrol.h>
+#include <linux/execmem.h>
#include <asm/barrier.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
/* Registers */
#define BPF_R0 regs[BPF_REG_0]
@@ -88,13 +89,18 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
return NULL;
}
+/* tell bpf programs that include vmlinux.h kernel's PAGE_SIZE */
+enum page_size_enum {
+ __PAGE_SIZE = PAGE_SIZE
+};
+
struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
{
gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
struct bpf_prog_aux *aux;
struct bpf_prog *fp;
- size = round_up(size, PAGE_SIZE);
+ size = round_up(size, __PAGE_SIZE);
fp = __vmalloc(size, gfp_flags);
if (fp == NULL)
return NULL;
@@ -125,6 +131,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode);
#endif
mutex_init(&fp->aux->used_maps_mutex);
+ mutex_init(&fp->aux->ext_mutex);
mutex_init(&fp->aux->dst_mutex);
return fp;
@@ -532,6 +539,8 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt)
{
+ int err;
+
/* Branch offsets can't overflow when program is shrinking, no need
* to call bpf_adj_branches(..., true) here
*/
@@ -539,7 +548,9 @@ int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt)
sizeof(struct bpf_insn) * (prog->len - off - cnt));
prog->len -= cnt;
- return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false));
+ err = bpf_adj_branches(prog, off, off + cnt, off, false);
+ WARN_ON_ONCE(err);
+ return err;
}
static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
@@ -682,7 +693,7 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
if (!bpf_prog_kallsyms_candidate(fp) ||
- !bpf_capable())
+ !bpf_token_capable(fp->aux->token, CAP_BPF))
return;
bpf_prog_ksym_set_addr(fp);
@@ -730,11 +741,11 @@ static struct bpf_ksym *bpf_ksym_find(unsigned long addr)
return n ? container_of(n, struct bpf_ksym, tnode) : NULL;
}
-const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
+int __bpf_address_lookup(unsigned long addr, unsigned long *size,
unsigned long *off, char *sym)
{
struct bpf_ksym *ksym;
- char *ret = NULL;
+ int ret = 0;
rcu_read_lock();
ksym = bpf_ksym_find(addr);
@@ -742,9 +753,8 @@ const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
unsigned long symbol_start = ksym->start;
unsigned long symbol_end = ksym->end;
- strncpy(sym, ksym->name, KSYM_NAME_LEN);
+ ret = strscpy(sym, ksym->name, KSYM_NAME_LEN);
- ret = sym;
if (size)
*size = symbol_end - symbol_start;
if (off)
@@ -808,7 +818,7 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
if (it++ != symnum)
continue;
- strncpy(sym, ksym->name, KSYM_NAME_LEN);
+ strscpy(sym, ksym->name, KSYM_NAME_LEN);
*value = ksym->start;
*type = BPF_SYM_ELF_TYPE;
@@ -844,7 +854,7 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
return -EINVAL;
}
- tab = krealloc(tab, size * sizeof(*poke), GFP_KERNEL);
+ tab = krealloc_array(tab, size, sizeof(*poke), GFP_KERNEL);
if (!tab)
return -ENOMEM;
@@ -888,7 +898,12 @@ static LIST_HEAD(pack_list);
* CONFIG_MMU=n. Use PAGE_SIZE in these cases.
*/
#ifdef PMD_SIZE
-#define BPF_PROG_PACK_SIZE (PMD_SIZE * num_possible_nodes())
+/* PMD_SIZE is really big for some archs. It doesn't make sense to
+ * reserve too much memory in one allocation. Hardcode BPF_PROG_PACK_SIZE to
+ * 2MiB * num_possible_nodes(). On most architectures PMD_SIZE will be
+ * greater than or equal to 2MB.
+ */
+#define BPF_PROG_PACK_SIZE (SZ_2M * num_possible_nodes())
#else
#define BPF_PROG_PACK_SIZE PAGE_SIZE
#endif
@@ -898,23 +913,30 @@ static LIST_HEAD(pack_list);
static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
struct bpf_prog_pack *pack;
+ int err;
pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)),
GFP_KERNEL);
if (!pack)
return NULL;
pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE);
- if (!pack->ptr) {
- kfree(pack);
- return NULL;
- }
+ if (!pack->ptr)
+ goto out;
bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE);
bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE);
- list_add_tail(&pack->list, &pack_list);
set_vm_flush_reset_perms(pack->ptr);
- set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
+ err = set_memory_rox((unsigned long)pack->ptr,
+ BPF_PROG_PACK_SIZE / PAGE_SIZE);
+ if (err)
+ goto out;
+ list_add_tail(&pack->list, &pack_list);
return pack;
+
+out:
+ bpf_jit_free_exec(pack->ptr);
+ kfree(pack);
+ return NULL;
}
void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
@@ -929,9 +951,16 @@ void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
size = round_up(size, PAGE_SIZE);
ptr = bpf_jit_alloc_exec(size);
if (ptr) {
+ int err;
+
bpf_fill_ill_insns(ptr, size);
set_vm_flush_reset_perms(ptr);
- set_memory_rox((unsigned long)ptr, size / PAGE_SIZE);
+ err = set_memory_rox((unsigned long)ptr,
+ size / PAGE_SIZE);
+ if (err) {
+ bpf_jit_free_exec(ptr);
+ ptr = NULL;
+ }
}
goto out;
}
@@ -1040,12 +1069,12 @@ void bpf_jit_uncharge_modmem(u32 size)
void *__weak bpf_jit_alloc_exec(unsigned long size)
{
- return module_alloc(size);
+ return execmem_alloc(EXECMEM_BPF, size);
}
void __weak bpf_jit_free_exec(void *addr)
{
- module_memfree(addr);
+ execmem_free(addr);
}
struct bpf_binary_header *
@@ -1149,8 +1178,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
}
/* Copy JITed text from rw_header to its final location, the ro_header. */
-int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
- struct bpf_binary_header *ro_header,
+int bpf_jit_binary_pack_finalize(struct bpf_binary_header *ro_header,
struct bpf_binary_header *rw_header)
{
void *ptr;
@@ -1675,6 +1703,7 @@ bool bpf_opcode_in_insntable(u8 code)
[BPF_LD | BPF_IND | BPF_B] = true,
[BPF_LD | BPF_IND | BPF_H] = true,
[BPF_LD | BPF_IND | BPF_W] = true,
+ [BPF_JMP | BPF_JCOND] = true,
};
#undef BPF_INSN_3_TBL
#undef BPF_INSN_2_TBL
@@ -2193,6 +2222,7 @@ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn
u64 stack[stack_size / sizeof(u64)]; \
u64 regs[MAX_BPF_EXT_REG] = {}; \
\
+ kmsan_unpoison_memory(stack, sizeof(stack)); \
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
ARG1 = (u64) (unsigned long) ctx; \
return ___bpf_prog_run(regs, insn); \
@@ -2206,6 +2236,7 @@ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \
u64 stack[stack_size / sizeof(u64)]; \
u64 regs[MAX_BPF_EXT_REG]; \
\
+ kmsan_unpoison_memory(stack, sizeof(stack)); \
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
BPF_R1 = r1; \
BPF_R2 = r2; \
@@ -2276,6 +2307,7 @@ bool bpf_prog_map_compatible(struct bpf_map *map,
{
enum bpf_prog_type prog_type = resolve_prog_type(fp);
bool ret;
+ struct bpf_prog_aux *aux = fp->aux;
if (fp->kprobe_override)
return false;
@@ -2285,7 +2317,7 @@ bool bpf_prog_map_compatible(struct bpf_map *map,
* in the case of devmap and cpumap). Until device checks
* are implemented, prohibit adding dev-bound programs to program maps.
*/
- if (bpf_prog_is_dev_bound(fp->aux))
+ if (bpf_prog_is_dev_bound(aux))
return false;
spin_lock(&map->owner.lock);
@@ -2295,12 +2327,26 @@ bool bpf_prog_map_compatible(struct bpf_map *map,
*/
map->owner.type = prog_type;
map->owner.jited = fp->jited;
- map->owner.xdp_has_frags = fp->aux->xdp_has_frags;
+ map->owner.xdp_has_frags = aux->xdp_has_frags;
+ map->owner.attach_func_proto = aux->attach_func_proto;
ret = true;
} else {
ret = map->owner.type == prog_type &&
map->owner.jited == fp->jited &&
- map->owner.xdp_has_frags == fp->aux->xdp_has_frags;
+ map->owner.xdp_has_frags == aux->xdp_has_frags;
+ if (ret &&
+ map->owner.attach_func_proto != aux->attach_func_proto) {
+ switch (prog_type) {
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_LSM:
+ case BPF_PROG_TYPE_EXT:
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ ret = false;
+ break;
+ default:
+ break;
+ }
+ }
}
spin_unlock(&map->owner.lock);
@@ -2392,7 +2438,9 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
}
finalize:
- bpf_prog_lock_ro(fp);
+ *err = bpf_prog_lock_ro(fp);
+ if (*err)
+ return fp;
/* The tail call compatibility check can only be done at
* this late stage as we need to determine, if we deal
@@ -2426,13 +2474,14 @@ EXPORT_SYMBOL(bpf_empty_prog_array);
struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
{
+ struct bpf_prog_array *p;
+
if (prog_cnt)
- return kzalloc(sizeof(struct bpf_prog_array) +
- sizeof(struct bpf_prog_array_item) *
- (prog_cnt + 1),
- flags);
+ p = kzalloc(struct_size(p, items, prog_cnt + 1), flags);
+ else
+ p = &bpf_empty_prog_array.hdr;
- return &bpf_empty_prog_array.hdr;
+ return p;
}
void bpf_prog_array_free(struct bpf_prog_array *progs)
@@ -2695,7 +2744,7 @@ void __bpf_free_used_maps(struct bpf_prog_aux *aux,
bool sleepable;
u32 i;
- sleepable = aux->sleepable;
+ sleepable = aux->prog->sleepable;
for (i = 0; i < len; i++) {
map = used_maps[i];
if (map->ops->map_poke_untrack)
@@ -2712,8 +2761,7 @@ static void bpf_free_used_maps(struct bpf_prog_aux *aux)
kfree(aux->used_maps);
}
-void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
- struct btf_mod_pair *used_btfs, u32 len)
+void __bpf_free_used_btfs(struct btf_mod_pair *used_btfs, u32 len)
{
#ifdef CONFIG_BPF_SYSCALL
struct btf_mod_pair *btf_mod;
@@ -2730,7 +2778,7 @@ void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
static void bpf_free_used_btfs(struct bpf_prog_aux *aux)
{
- __bpf_free_used_btfs(aux, aux->used_btfs, aux->used_btf_cnt);
+ __bpf_free_used_btfs(aux->used_btfs, aux->used_btf_cnt);
kfree(aux->used_btfs);
}
@@ -2779,12 +2827,13 @@ void bpf_prog_free(struct bpf_prog *fp)
if (aux->dst_prog)
bpf_prog_put(aux->dst_prog);
+ bpf_token_put(aux->token);
INIT_WORK(&aux->work, bpf_prog_free_deferred);
schedule_work(&aux->work);
}
EXPORT_SYMBOL_GPL(bpf_prog_free);
-/* RNG for unpriviledged user space with separated state from prandom_u32(). */
+/* RNG for unprivileged user space with separated state from prandom_u32(). */
static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state);
void bpf_user_rnd_init_once(void)
@@ -2891,7 +2940,7 @@ void __weak bpf_jit_compile(struct bpf_prog *prog)
{
}
-bool __weak bpf_helper_changes_pkt_data(void *func)
+bool __weak bpf_helper_changes_pkt_data(enum bpf_func_id func_id)
{
return false;
}
@@ -2909,12 +2958,28 @@ bool __weak bpf_jit_needs_zext(void)
return false;
}
+/* Return true if the JIT inlines the call to the helper corresponding to
+ * the imm.
+ *
+ * The verifier will not patch the insn->imm for the call to the helper if
+ * this returns true.
+ */
+bool __weak bpf_jit_inlines_helper_call(s32 imm)
+{
+ return false;
+}
+
/* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */
bool __weak bpf_jit_supports_subprog_tailcalls(void)
{
return false;
}
+bool __weak bpf_jit_supports_percpu_insn(void)
+{
+ return false;
+}
+
bool __weak bpf_jit_supports_kfunc_call(void)
{
return false;
@@ -2925,6 +2990,35 @@ bool __weak bpf_jit_supports_far_kfunc_call(void)
return false;
}
+bool __weak bpf_jit_supports_arena(void)
+{
+ return false;
+}
+
+bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena)
+{
+ return false;
+}
+
+u64 __weak bpf_arch_uaddress_limit(void)
+{
+#if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE)
+ return TASK_SIZE;
+#else
+ return 0;
+#endif
+}
+
+/* Return TRUE if the JIT backend satisfies the following two conditions:
+ * 1) JIT backend supports atomic_xchg() on pointer-sized words.
+ * 2) Under the specific arch, the implementation of xchg() is the same
+ * as atomic_xchg() on pointer-sized words.
+ */
+bool __weak bpf_jit_supports_ptr_xchg(void)
+{
+ return false;
+}
+
/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
* skb_copy_bits(), so provide a weak definition of it for NET-less config.
*/
@@ -2955,10 +3049,26 @@ bool __weak bpf_jit_supports_exceptions(void)
return false;
}
+bool __weak bpf_jit_supports_private_stack(void)
+{
+ return false;
+}
+
void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
{
}
+/* for configs without MMU or 32-bit */
+__weak const struct bpf_map_ops arena_map_ops;
+__weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
+{
+ return 0;
+}
+__weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
+{
+ return 0;
+}
+
#ifdef CONFIG_BPF_SYSCALL
static int __init bpf_global_ma_init(void)
{
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index ef82ffc90cbe..774accbd4a22 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -24,6 +24,7 @@
#include <linux/filter.h>
#include <linux/ptr_ring.h>
#include <net/xdp.h>
+#include <net/hotdata.h>
#include <linux/sched.h>
#include <linux/workqueue.h>
@@ -78,8 +79,6 @@ struct bpf_cpu_map {
struct bpf_cpu_map_entry __rcu **cpu_map;
};
-static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list);
-
static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
u32 value_size = attr->value_size;
@@ -191,7 +190,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
int err;
rxq.dev = xdpf->dev_rx;
- rxq.mem = xdpf->mem;
+ rxq.mem.type = xdpf->mem_type;
/* TODO: report queue_index to xdp_rxq_info */
xdp_convert_frame_to_buff(xdpf, &xdp);
@@ -239,12 +238,14 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
int xdp_n, struct xdp_cpumap_stats *stats,
struct list_head *list)
{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
int nframes;
if (!rcpu->prog)
return xdp_n;
rcu_read_lock_bh();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats);
@@ -254,6 +255,7 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
if (unlikely(!list_empty(list)))
cpu_map_bpf_prog_run_skb(rcpu, list, stats);
+ bpf_net_ctx_clear(bpf_net_ctx);
rcu_read_unlock_bh(); /* resched point, may call do_softirq() */
return nframes;
@@ -262,6 +264,7 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
static int cpu_map_kthread_run(void *data)
{
struct bpf_cpu_map_entry *rcpu = data;
+ unsigned long last_qs = jiffies;
complete(&rcpu->kthread_running);
set_current_state(TASK_INTERRUPTIBLE);
@@ -287,10 +290,12 @@ static int cpu_map_kthread_run(void *data)
if (__ptr_ring_empty(rcpu->queue)) {
schedule();
sched = 1;
+ last_qs = jiffies;
} else {
__set_current_state(TASK_RUNNING);
}
} else {
+ rcu_softirq_qs_periodic(last_qs);
sched = cond_resched();
}
@@ -326,7 +331,8 @@ static int cpu_map_kthread_run(void *data)
/* Support running another XDP prog on this CPU */
nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list);
if (nframes) {
- m = kmem_cache_alloc_bulk(skbuff_cache, gfp, nframes, skbs);
+ m = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
+ gfp, nframes, skbs);
if (unlikely(m == 0)) {
for (i = 0; i < nframes; i++)
skbs[i] = NULL; /* effect: xdp_return_frame */
@@ -348,12 +354,14 @@ static int cpu_map_kthread_run(void *data)
list_add_tail(&skb->list, &list);
}
- netif_receive_skb_list(&list);
- /* Feedback loop via tracepoint */
+ /* Feedback loop via tracepoint.
+ * NB: keep before recv to allow measuring enqueue/dequeue latency.
+ */
trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops,
sched, &stats);
+ netif_receive_skb_list(&list);
local_bh_enable(); /* resched point, may call do_softirq() */
}
__set_current_state(TASK_RUNNING);
@@ -701,7 +709,6 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
*/
static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
- struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
@@ -718,8 +725,11 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
*/
bq->q[bq->count++] = xdpf;
- if (!bq->flush_node.prev)
+ if (!bq->flush_node.prev) {
+ struct list_head *flush_list = bpf_net_ctx_get_cpu_map_flush_list();
+
list_add(&bq->flush_node, flush_list);
+ }
}
int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
@@ -751,9 +761,8 @@ trace:
return ret;
}
-void __cpu_map_flush(void)
+void __cpu_map_flush(struct list_head *flush_list)
{
- struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
struct xdp_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
@@ -763,24 +772,3 @@ void __cpu_map_flush(void)
wake_up_process(bq->obj->kthread);
}
}
-
-#ifdef CONFIG_DEBUG_NET
-bool cpu_map_check_flush(void)
-{
- if (list_empty(this_cpu_ptr(&cpu_map_flush_list)))
- return false;
- __cpu_map_flush();
- return true;
-}
-#endif
-
-static int __init cpu_map_init(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu));
- return 0;
-}
-
-subsys_initcall(cpu_map_init);
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index 2e73533a3811..cfa1c18e3a48 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -91,9 +91,7 @@ __bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask)
if (!refcount_dec_and_test(&cpumask->usage))
return;
- migrate_disable();
bpf_mem_cache_free_rcu(&bpf_cpumask_ma, cpumask);
- migrate_enable();
}
__bpf_kfunc void bpf_cpumask_release_dtor(void *cpumask)
@@ -424,7 +422,7 @@ __bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask)
__bpf_kfunc_end_defs();
-BTF_SET8_START(cpumask_kfunc_btf_ids)
+BTF_KFUNCS_START(cpumask_kfunc_btf_ids)
BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
@@ -450,7 +448,7 @@ BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU)
-BTF_SET8_END(cpumask_kfunc_btf_ids)
+BTF_KFUNCS_END(cpumask_kfunc_btf_ids)
static const struct btf_kfunc_id_set cpumask_kfunc_set = {
.owner = THIS_MODULE,
@@ -474,6 +472,7 @@ static int __init cpumask_kfunc_init(void)
ret = bpf_mem_alloc_init(&bpf_cpumask_ma, sizeof(struct bpf_cpumask), false);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &cpumask_kfunc_set);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &cpumask_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &cpumask_kfunc_set);
return ret ?: register_btf_id_dtor_kfuncs(cpumask_dtors,
ARRAY_SIZE(cpumask_dtors),
THIS_MODULE);
diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c
new file mode 100644
index 000000000000..94854cd9c4cc
--- /dev/null
+++ b/kernel/bpf/crypto.c
@@ -0,0 +1,393 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Meta, Inc */
+#include <linux/bpf.h>
+#include <linux/bpf_crypto.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/filter.h>
+#include <linux/scatterlist.h>
+#include <linux/skbuff.h>
+#include <crypto/skcipher.h>
+
+struct bpf_crypto_type_list {
+ const struct bpf_crypto_type *type;
+ struct list_head list;
+};
+
+/* BPF crypto initialization parameters struct */
+/**
+ * struct bpf_crypto_params - BPF crypto initialization parameters structure
+ * @type: The string of crypto operation type.
+ * @reserved: Reserved member, will be reused for more options in future
+ * Values:
+ * 0
+ * @algo: The string of algorithm to initialize.
+ * @key: The cipher key used to init crypto algorithm.
+ * @key_len: The length of cipher key.
+ * @authsize: The length of authentication tag used by algorithm.
+ */
+struct bpf_crypto_params {
+ char type[14];
+ u8 reserved[2];
+ char algo[128];
+ u8 key[256];
+ u32 key_len;
+ u32 authsize;
+};
+
+static LIST_HEAD(bpf_crypto_types);
+static DECLARE_RWSEM(bpf_crypto_types_sem);
+
+/**
+ * struct bpf_crypto_ctx - refcounted BPF crypto context structure
+ * @type: The pointer to bpf crypto type
+ * @tfm: The pointer to instance of crypto API struct.
+ * @siv_len: Size of IV and state storage for cipher
+ * @rcu: The RCU head used to free the crypto context with RCU safety.
+ * @usage: Object reference counter. When the refcount goes to 0, the
+ * memory is released back to the BPF allocator, which provides
+ * RCU safety.
+ */
+struct bpf_crypto_ctx {
+ const struct bpf_crypto_type *type;
+ void *tfm;
+ u32 siv_len;
+ struct rcu_head rcu;
+ refcount_t usage;
+};
+
+int bpf_crypto_register_type(const struct bpf_crypto_type *type)
+{
+ struct bpf_crypto_type_list *node;
+ int err = -EEXIST;
+
+ down_write(&bpf_crypto_types_sem);
+ list_for_each_entry(node, &bpf_crypto_types, list) {
+ if (!strcmp(node->type->name, type->name))
+ goto unlock;
+ }
+
+ node = kmalloc(sizeof(*node), GFP_KERNEL);
+ err = -ENOMEM;
+ if (!node)
+ goto unlock;
+
+ node->type = type;
+ list_add(&node->list, &bpf_crypto_types);
+ err = 0;
+
+unlock:
+ up_write(&bpf_crypto_types_sem);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(bpf_crypto_register_type);
+
+int bpf_crypto_unregister_type(const struct bpf_crypto_type *type)
+{
+ struct bpf_crypto_type_list *node;
+ int err = -ENOENT;
+
+ down_write(&bpf_crypto_types_sem);
+ list_for_each_entry(node, &bpf_crypto_types, list) {
+ if (strcmp(node->type->name, type->name))
+ continue;
+
+ list_del(&node->list);
+ kfree(node);
+ err = 0;
+ break;
+ }
+ up_write(&bpf_crypto_types_sem);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(bpf_crypto_unregister_type);
+
+static const struct bpf_crypto_type *bpf_crypto_get_type(const char *name)
+{
+ const struct bpf_crypto_type *type = ERR_PTR(-ENOENT);
+ struct bpf_crypto_type_list *node;
+
+ down_read(&bpf_crypto_types_sem);
+ list_for_each_entry(node, &bpf_crypto_types, list) {
+ if (strcmp(node->type->name, name))
+ continue;
+
+ if (try_module_get(node->type->owner))
+ type = node->type;
+ break;
+ }
+ up_read(&bpf_crypto_types_sem);
+
+ return type;
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_crypto_ctx_create() - Create a mutable BPF crypto context.
+ *
+ * Allocates a crypto context that can be used, acquired, and released by
+ * a BPF program. The crypto context returned by this function must either
+ * be embedded in a map as a kptr, or freed with bpf_crypto_ctx_release().
+ * As crypto API functions use GFP_KERNEL allocations, this function can
+ * only be used in sleepable BPF programs.
+ *
+ * bpf_crypto_ctx_create() allocates memory for crypto context.
+ * It may return NULL if no memory is available.
+ * @params: pointer to struct bpf_crypto_params which contains all the
+ * details needed to initialise crypto context.
+ * @params__sz: size of steuct bpf_crypto_params usef by bpf program
+ * @err: integer to store error code when NULL is returned.
+ */
+__bpf_kfunc struct bpf_crypto_ctx *
+bpf_crypto_ctx_create(const struct bpf_crypto_params *params, u32 params__sz,
+ int *err)
+{
+ const struct bpf_crypto_type *type;
+ struct bpf_crypto_ctx *ctx;
+
+ if (!params || params->reserved[0] || params->reserved[1] ||
+ params__sz != sizeof(struct bpf_crypto_params)) {
+ *err = -EINVAL;
+ return NULL;
+ }
+
+ type = bpf_crypto_get_type(params->type);
+ if (IS_ERR(type)) {
+ *err = PTR_ERR(type);
+ return NULL;
+ }
+
+ if (!type->has_algo(params->algo)) {
+ *err = -EOPNOTSUPP;
+ goto err_module_put;
+ }
+
+ if (!!params->authsize ^ !!type->setauthsize) {
+ *err = -EOPNOTSUPP;
+ goto err_module_put;
+ }
+
+ if (!params->key_len || params->key_len > sizeof(params->key)) {
+ *err = -EINVAL;
+ goto err_module_put;
+ }
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx) {
+ *err = -ENOMEM;
+ goto err_module_put;
+ }
+
+ ctx->type = type;
+ ctx->tfm = type->alloc_tfm(params->algo);
+ if (IS_ERR(ctx->tfm)) {
+ *err = PTR_ERR(ctx->tfm);
+ goto err_free_ctx;
+ }
+
+ if (params->authsize) {
+ *err = type->setauthsize(ctx->tfm, params->authsize);
+ if (*err)
+ goto err_free_tfm;
+ }
+
+ *err = type->setkey(ctx->tfm, params->key, params->key_len);
+ if (*err)
+ goto err_free_tfm;
+
+ if (type->get_flags(ctx->tfm) & CRYPTO_TFM_NEED_KEY) {
+ *err = -EINVAL;
+ goto err_free_tfm;
+ }
+
+ ctx->siv_len = type->ivsize(ctx->tfm) + type->statesize(ctx->tfm);
+
+ refcount_set(&ctx->usage, 1);
+
+ return ctx;
+
+err_free_tfm:
+ type->free_tfm(ctx->tfm);
+err_free_ctx:
+ kfree(ctx);
+err_module_put:
+ module_put(type->owner);
+
+ return NULL;
+}
+
+static void crypto_free_cb(struct rcu_head *head)
+{
+ struct bpf_crypto_ctx *ctx;
+
+ ctx = container_of(head, struct bpf_crypto_ctx, rcu);
+ ctx->type->free_tfm(ctx->tfm);
+ module_put(ctx->type->owner);
+ kfree(ctx);
+}
+
+/**
+ * bpf_crypto_ctx_acquire() - Acquire a reference to a BPF crypto context.
+ * @ctx: The BPF crypto context being acquired. The ctx must be a trusted
+ * pointer.
+ *
+ * Acquires a reference to a BPF crypto context. The context returned by this function
+ * must either be embedded in a map as a kptr, or freed with
+ * bpf_crypto_ctx_release().
+ */
+__bpf_kfunc struct bpf_crypto_ctx *
+bpf_crypto_ctx_acquire(struct bpf_crypto_ctx *ctx)
+{
+ if (!refcount_inc_not_zero(&ctx->usage))
+ return NULL;
+ return ctx;
+}
+
+/**
+ * bpf_crypto_ctx_release() - Release a previously acquired BPF crypto context.
+ * @ctx: The crypto context being released.
+ *
+ * Releases a previously acquired reference to a BPF crypto context. When the final
+ * reference of the BPF crypto context has been released, its memory
+ * will be released.
+ */
+__bpf_kfunc void bpf_crypto_ctx_release(struct bpf_crypto_ctx *ctx)
+{
+ if (refcount_dec_and_test(&ctx->usage))
+ call_rcu(&ctx->rcu, crypto_free_cb);
+}
+
+static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx,
+ const struct bpf_dynptr_kern *src,
+ const struct bpf_dynptr_kern *dst,
+ const struct bpf_dynptr_kern *siv,
+ bool decrypt)
+{
+ u32 src_len, dst_len, siv_len;
+ const u8 *psrc;
+ u8 *pdst, *piv;
+ int err;
+
+ if (__bpf_dynptr_is_rdonly(dst))
+ return -EINVAL;
+
+ siv_len = siv ? __bpf_dynptr_size(siv) : 0;
+ src_len = __bpf_dynptr_size(src);
+ dst_len = __bpf_dynptr_size(dst);
+ if (!src_len || !dst_len)
+ return -EINVAL;
+
+ if (siv_len != ctx->siv_len)
+ return -EINVAL;
+
+ psrc = __bpf_dynptr_data(src, src_len);
+ if (!psrc)
+ return -EINVAL;
+ pdst = __bpf_dynptr_data_rw(dst, dst_len);
+ if (!pdst)
+ return -EINVAL;
+
+ piv = siv_len ? __bpf_dynptr_data_rw(siv, siv_len) : NULL;
+ if (siv_len && !piv)
+ return -EINVAL;
+
+ err = decrypt ? ctx->type->decrypt(ctx->tfm, psrc, pdst, src_len, piv)
+ : ctx->type->encrypt(ctx->tfm, psrc, pdst, src_len, piv);
+
+ return err;
+}
+
+/**
+ * bpf_crypto_decrypt() - Decrypt buffer using configured context and IV provided.
+ * @ctx: The crypto context being used. The ctx must be a trusted pointer.
+ * @src: bpf_dynptr to the encrypted data. Must be a trusted pointer.
+ * @dst: bpf_dynptr to the buffer where to store the result. Must be a trusted pointer.
+ * @siv__nullable: bpf_dynptr to IV data and state data to be used by decryptor. May be NULL.
+ *
+ * Decrypts provided buffer using IV data and the crypto context. Crypto context must be configured.
+ */
+__bpf_kfunc int bpf_crypto_decrypt(struct bpf_crypto_ctx *ctx,
+ const struct bpf_dynptr *src,
+ const struct bpf_dynptr *dst,
+ const struct bpf_dynptr *siv__nullable)
+{
+ const struct bpf_dynptr_kern *src_kern = (struct bpf_dynptr_kern *)src;
+ const struct bpf_dynptr_kern *dst_kern = (struct bpf_dynptr_kern *)dst;
+ const struct bpf_dynptr_kern *siv_kern = (struct bpf_dynptr_kern *)siv__nullable;
+
+ return bpf_crypto_crypt(ctx, src_kern, dst_kern, siv_kern, true);
+}
+
+/**
+ * bpf_crypto_encrypt() - Encrypt buffer using configured context and IV provided.
+ * @ctx: The crypto context being used. The ctx must be a trusted pointer.
+ * @src: bpf_dynptr to the plain data. Must be a trusted pointer.
+ * @dst: bpf_dynptr to the buffer where to store the result. Must be a trusted pointer.
+ * @siv__nullable: bpf_dynptr to IV data and state data to be used by decryptor. May be NULL.
+ *
+ * Encrypts provided buffer using IV data and the crypto context. Crypto context must be configured.
+ */
+__bpf_kfunc int bpf_crypto_encrypt(struct bpf_crypto_ctx *ctx,
+ const struct bpf_dynptr *src,
+ const struct bpf_dynptr *dst,
+ const struct bpf_dynptr *siv__nullable)
+{
+ const struct bpf_dynptr_kern *src_kern = (struct bpf_dynptr_kern *)src;
+ const struct bpf_dynptr_kern *dst_kern = (struct bpf_dynptr_kern *)dst;
+ const struct bpf_dynptr_kern *siv_kern = (struct bpf_dynptr_kern *)siv__nullable;
+
+ return bpf_crypto_crypt(ctx, src_kern, dst_kern, siv_kern, false);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(crypt_init_kfunc_btf_ids)
+BTF_ID_FLAGS(func, bpf_crypto_ctx_create, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_crypto_ctx_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_crypto_ctx_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+BTF_KFUNCS_END(crypt_init_kfunc_btf_ids)
+
+static const struct btf_kfunc_id_set crypt_init_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &crypt_init_kfunc_btf_ids,
+};
+
+BTF_KFUNCS_START(crypt_kfunc_btf_ids)
+BTF_ID_FLAGS(func, bpf_crypto_decrypt, KF_RCU)
+BTF_ID_FLAGS(func, bpf_crypto_encrypt, KF_RCU)
+BTF_KFUNCS_END(crypt_kfunc_btf_ids)
+
+static const struct btf_kfunc_id_set crypt_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &crypt_kfunc_btf_ids,
+};
+
+BTF_ID_LIST(bpf_crypto_dtor_ids)
+BTF_ID(struct, bpf_crypto_ctx)
+BTF_ID(func, bpf_crypto_ctx_release)
+
+static int __init crypto_kfunc_init(void)
+{
+ int ret;
+ const struct btf_id_dtor_kfunc bpf_crypto_dtors[] = {
+ {
+ .btf_id = bpf_crypto_dtor_ids[0],
+ .kfunc_btf_id = bpf_crypto_dtor_ids[1]
+ },
+ };
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &crypt_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &crypt_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &crypt_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
+ &crypt_init_kfunc_set);
+ return ret ?: register_btf_id_dtor_kfuncs(bpf_crypto_dtors,
+ ARRAY_SIZE(bpf_crypto_dtors),
+ THIS_MODULE);
+}
+
+late_initcall(crypto_kfunc_init);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index a936c704d4e7..482d284a1553 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -83,7 +83,6 @@ struct bpf_dtab {
u32 n_buckets;
};
-static DEFINE_PER_CPU(struct list_head, dev_flush_list);
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);
@@ -107,7 +106,7 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)];
}
-static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
+static int dev_map_alloc_check(union bpf_attr *attr)
{
u32 valsize = attr->value_size;
@@ -121,22 +120,28 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
attr->map_flags & ~DEV_CREATE_FLAG_MASK)
return -EINVAL;
+ if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+ /* Hash table size must be power of 2; roundup_pow_of_two()
+ * can overflow into UB on 32-bit arches
+ */
+ if (attr->max_entries > 1UL << 31)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
+{
/* Lookup returns a pointer straight to dev->ifindex, so make sure the
* verifier prevents writes from the BPF side
*/
attr->map_flags |= BPF_F_RDONLY_PROG;
-
-
bpf_map_init_from_attr(&dtab->map, attr);
if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+ /* Hash table size must be power of 2 */
dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
-
- if (!dtab->n_buckets) /* Overflow check */
- return -EINVAL;
- }
-
- if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets,
dtab->map.numa_node);
if (!dtab->dev_index_head)
@@ -179,7 +184,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
static void dev_map_free(struct bpf_map *map)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- int i;
+ u32 i;
/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
* so the programs (can be more than one that used this map) were
@@ -195,7 +200,14 @@ static void dev_map_free(struct bpf_map *map)
list_del_rcu(&dtab->list);
spin_unlock(&dev_map_lock);
- bpf_clear_redirect_map(map);
+ /* bpf_redirect_info->map is assigned in __bpf_xdp_redirect_map()
+ * during NAPI callback and cleared after the XDP redirect. There is no
+ * explicit RCU read section which protects bpf_redirect_info->map but
+ * local_bh_disable() also marks the beginning an RCU section. This
+ * makes the complete softirq callback RCU protected. Thus after
+ * following synchronize_rcu() there no bpf_redirect_info->map == map
+ * assignment.
+ */
synchronize_rcu();
/* Make sure prior __dev_map_entry_free() have completed. */
@@ -321,9 +333,11 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
struct xdp_frame **frames, int n,
- struct net_device *dev)
+ struct net_device *tx_dev,
+ struct net_device *rx_dev)
{
- struct xdp_txq_info txq = { .dev = dev };
+ struct xdp_txq_info txq = { .dev = tx_dev };
+ struct xdp_rxq_info rxq = { .dev = rx_dev };
struct xdp_buff xdp;
int i, nframes = 0;
@@ -334,6 +348,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
xdp_convert_frame_to_buff(xdpf, &xdp);
xdp.txq = &txq;
+ xdp.rxq = &rxq;
act = bpf_prog_run_xdp(xdp_prog, &xdp);
switch (act) {
@@ -348,7 +363,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
fallthrough;
case XDP_ABORTED:
- trace_xdp_exception(dev, xdp_prog, act);
+ trace_xdp_exception(tx_dev, xdp_prog, act);
fallthrough;
case XDP_DROP:
xdp_return_frame_rx_napi(xdpf);
@@ -376,7 +391,7 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
}
if (bq->xdp_prog) {
- to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev);
+ to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx);
if (!to_send)
goto out;
}
@@ -405,9 +420,8 @@ out:
* driver before returning from its napi->poll() routine. See the comment above
* xdp_do_flush() in filter.c.
*/
-void __dev_flush(void)
+void __dev_flush(struct list_head *flush_list)
{
- struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct xdp_dev_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
@@ -418,16 +432,6 @@ void __dev_flush(void)
}
}
-#ifdef CONFIG_DEBUG_NET
-bool dev_check_flush(void)
-{
- if (list_empty(this_cpu_ptr(&dev_flush_list)))
- return false;
- __dev_flush();
- return true;
-}
-#endif
-
/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
* by local_bh_disable() (from XDP calls inside NAPI). The
* rcu_read_lock_bh_held() below makes lockdep accept both.
@@ -452,7 +456,6 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx, struct bpf_prog *xdp_prog)
{
- struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
@@ -466,6 +469,8 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
* are only ever modified together.
*/
if (!bq->dev_rx) {
+ struct list_head *flush_list = bpf_net_ctx_get_dev_flush_list();
+
bq->dev_rx = dev_rx;
bq->xdp_prog = xdp_prog;
list_add(&bq->flush_node, flush_list);
@@ -673,7 +678,7 @@ int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
}
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
int err;
@@ -696,7 +701,7 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
struct sk_buff *skb,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
struct sk_buff *nskb;
int err;
@@ -715,8 +720,8 @@ static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
}
int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
- struct bpf_prog *xdp_prog, struct bpf_map *map,
- bool exclude_ingress)
+ const struct bpf_prog *xdp_prog,
+ struct bpf_map *map, bool exclude_ingress)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dst, *last_dst = NULL;
@@ -759,9 +764,6 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
for (i = 0; i < dtab->n_buckets; i++) {
head = dev_map_index_hash(dtab, i);
hlist_for_each_entry_safe(dst, next, head, index_hlist) {
- if (!dst)
- continue;
-
if (is_ifindex_excluded(excluded_devices, num_excluded,
dst->dev->ifindex))
continue;
@@ -819,7 +821,7 @@ static long dev_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *old_dev;
- int k = *(u32 *)key;
+ u32 k = *(u32 *)key;
if (k >= map->max_entries)
return -EINVAL;
@@ -836,7 +838,7 @@ static long dev_map_hash_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *old_dev;
- int k = *(u32 *)key;
+ u32 k = *(u32 *)key;
unsigned long flags;
int ret = -ENOENT;
@@ -1042,6 +1044,7 @@ static u64 dev_map_mem_usage(const struct bpf_map *map)
BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab)
const struct bpf_map_ops dev_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = dev_map_alloc_check,
.map_alloc = dev_map_alloc,
.map_free = dev_map_free,
.map_get_next_key = dev_map_get_next_key,
@@ -1056,6 +1059,7 @@ const struct bpf_map_ops dev_map_ops = {
const struct bpf_map_ops dev_map_hash_ops = {
.map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = dev_map_alloc_check,
.map_alloc = dev_map_alloc,
.map_free = dev_map_free,
.map_get_next_key = dev_map_hash_get_next_key,
@@ -1155,15 +1159,11 @@ static struct notifier_block dev_map_notifier = {
static int __init dev_map_init(void)
{
- int cpu;
-
/* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
offsetof(struct _bpf_dtab_netdev, dev));
register_netdevice_notifier(&dev_map_notifier);
- for_each_possible_cpu(cpu)
- INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu));
return 0;
}
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index 49940c26a227..309c4aa1b026 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -166,6 +166,23 @@ static bool is_movsx(const struct bpf_insn *insn)
(insn->off == 8 || insn->off == 16 || insn->off == 32);
}
+static bool is_addr_space_cast(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) &&
+ insn->off == BPF_ADDR_SPACE_CAST;
+}
+
+/* Special (internal-only) form of mov, used to resolve per-CPU addrs:
+ * dst_reg = src_reg + <percpu_base_off>
+ * BPF_ADDR_PERCPU is used as a special insn->off value.
+ */
+#define BPF_ADDR_PERCPU (-1)
+
+static inline bool is_mov_percpu_addr(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_PERCPU;
+}
+
void print_bpf_insn(const struct bpf_insn_cbs *cbs,
const struct bpf_insn *insn,
bool allow_ptr_leaks)
@@ -184,6 +201,13 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
insn->code, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg);
+ } else if (is_addr_space_cast(insn)) {
+ verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %d, %d)\n",
+ insn->code, insn->dst_reg,
+ insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm);
+ } else if (is_mov_percpu_addr(insn)) {
+ verbose(cbs->private_data, "(%02x) r%d = &(void __percpu *)(r%d)\n",
+ insn->code, insn->dst_reg, insn->src_reg);
} else if (BPF_SRC(insn->code) == BPF_X) {
verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n",
insn->code, class == BPF_ALU ? 'w' : 'r',
@@ -322,6 +346,10 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
} else if (insn->code == (BPF_JMP | BPF_JA)) {
verbose(cbs->private_data, "(%02x) goto pc%+d\n",
insn->code, insn->off);
+ } else if (insn->code == (BPF_JMP | BPF_JCOND) &&
+ insn->src_reg == BPF_MAY_GOTO) {
+ verbose(cbs->private_data, "(%02x) may_goto pc%+d\n",
+ insn->code, insn->off);
} else if (insn->code == (BPF_JMP32 | BPF_JA)) {
verbose(cbs->private_data, "(%02x) gotol pc%+d\n",
insn->code, insn->imm);
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
index 70fb82bf1637..b77db7413f8c 100644
--- a/kernel/bpf/dispatcher.c
+++ b/kernel/bpf/dispatcher.c
@@ -154,7 +154,8 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
d->image = NULL;
goto out;
}
- bpf_image_ksym_add(d->image, PAGE_SIZE, &d->ksym);
+ bpf_image_ksym_init(d->image, PAGE_SIZE, &d->ksym);
+ bpf_image_ksym_add(&d->ksym);
}
prev_num_progs = d->num_progs;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 03a6a2500b6a..4a9eeb7aef85 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -221,13 +221,11 @@ static bool htab_has_extra_elems(struct bpf_htab *htab)
return !htab_is_percpu(htab) && !htab_is_lru(htab);
}
-static void htab_free_prealloced_timers(struct bpf_htab *htab)
+static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab)
{
u32 num_entries = htab->map.max_entries;
int i;
- if (!btf_record_has_field(htab->map.record, BPF_TIMER))
- return;
if (htab_has_extra_elems(htab))
num_entries += num_possible_cpus();
@@ -235,7 +233,12 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab)
struct htab_elem *elem;
elem = get_htab_elem(htab, i);
- bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
+ if (btf_record_has_field(htab->map.record, BPF_TIMER))
+ bpf_obj_free_timer(htab->map.record,
+ elem->key + round_up(htab->map.key_size, 8));
+ if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
+ bpf_obj_free_workqueue(htab->map.record,
+ elem->key + round_up(htab->map.key_size, 8));
cond_resched();
}
}
@@ -459,6 +462,9 @@ static int htab_map_alloc_check(union bpf_attr *attr)
* kmalloc-able later in htab_map_update_elem()
*/
return -E2BIG;
+ /* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */
+ if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE)
+ return -E2BIG;
return 0;
}
@@ -499,7 +505,13 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
num_possible_cpus());
}
- /* hash table size must be power of 2 */
+ /* hash table size must be power of 2; roundup_pow_of_two() can overflow
+ * into UB on 32-bit arches, so check that first
+ */
+ err = -E2BIG;
+ if (htab->map.max_entries > 1UL << 31)
+ goto free_htab;
+
htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
htab->elem_size = sizeof(struct htab_elem) +
@@ -509,10 +521,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
else
htab->elem_size += round_up(htab->map.value_size, 8);
- err = -E2BIG;
- /* prevent zero size kmalloc and check for u32 overflow */
- if (htab->n_buckets == 0 ||
- htab->n_buckets > U32_MAX / sizeof(struct bucket))
+ /* check for u32 overflow */
+ if (htab->n_buckets > U32_MAX / sizeof(struct bucket))
goto free_htab;
err = bpf_map_init_elem_count(&htab->map);
@@ -814,13 +824,14 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
if (l == tgt_l) {
hlist_nulls_del_rcu(&l->hash_node);
- check_and_free_fields(htab, l);
bpf_map_dec_elem_count(&htab->map);
break;
}
htab_unlock_bucket(htab, b, tgt_l->hash, flags);
+ if (l == tgt_l)
+ check_and_free_fields(htab, l);
return l == tgt_l;
}
@@ -886,6 +897,7 @@ find_first_elem:
static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
check_and_free_fields(htab, l);
+
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
bpf_mem_cache_free(&htab->ma, l);
@@ -938,7 +950,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
if (htab_is_prealloc(htab)) {
bpf_map_dec_elem_count(&htab->map);
check_and_free_fields(htab, l);
- __pcpu_freelist_push(&htab->freelist, &l->fnode);
+ pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
dec_elem_count(htab);
htab_elem_free(htab, l);
@@ -1008,7 +1020,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
*/
pl_new = this_cpu_ptr(htab->extra_elems);
l_new = *pl_new;
- htab_put_fd_value(htab, old_elem);
*pl_new = old_elem;
} else {
struct pcpu_freelist_node *l;
@@ -1042,14 +1053,15 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
pptr = htab_elem_get_ptr(l_new, key_size);
} else {
/* alloc_percpu zero-fills */
- pptr = bpf_mem_cache_alloc(&htab->pcpu_ma);
- if (!pptr) {
+ void *ptr = bpf_mem_cache_alloc(&htab->pcpu_ma);
+
+ if (!ptr) {
bpf_mem_cache_free(&htab->ma, l_new);
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
}
- l_new->ptr_to_pptr = pptr;
- pptr = *(void **)pptr;
+ l_new->ptr_to_pptr = ptr;
+ pptr = *(void __percpu **)ptr;
}
pcpu_init_value(htab, pptr, value, onallcpus);
@@ -1094,6 +1106,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
struct htab_elem *l_new = NULL, *l_old;
struct hlist_nulls_head *head;
unsigned long flags;
+ void *old_map_ptr;
struct bucket *b;
u32 key_size, hash;
int ret;
@@ -1172,12 +1185,27 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
hlist_nulls_add_head_rcu(&l_new->hash_node, head);
if (l_old) {
hlist_nulls_del_rcu(&l_old->hash_node);
+
+ /* l_old has already been stashed in htab->extra_elems, free
+ * its special fields before it is available for reuse. Also
+ * save the old map pointer in htab of maps before unlock
+ * and release it after unlock.
+ */
+ old_map_ptr = NULL;
+ if (htab_is_prealloc(htab)) {
+ if (map->ops->map_fd_put_ptr)
+ old_map_ptr = fd_htab_map_get_ptr(map, l_old);
+ check_and_free_fields(htab, l_old);
+ }
+ }
+ htab_unlock_bucket(htab, b, hash, flags);
+ if (l_old) {
+ if (old_map_ptr)
+ map->ops->map_fd_put_ptr(map, old_map_ptr, true);
if (!htab_is_prealloc(htab))
free_htab_elem(htab, l_old);
- else
- check_and_free_fields(htab, l_old);
}
- ret = 0;
+ return 0;
err:
htab_unlock_bucket(htab, b, hash, flags);
return ret;
@@ -1421,15 +1449,15 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key)
return ret;
l = lookup_elem_raw(head, hash, key, key_size);
-
- if (l) {
+ if (l)
hlist_nulls_del_rcu(&l->hash_node);
- free_htab_elem(htab, l);
- } else {
+ else
ret = -ENOENT;
- }
htab_unlock_bucket(htab, b, hash, flags);
+
+ if (l)
+ free_htab_elem(htab, l);
return ret;
}
@@ -1473,10 +1501,9 @@ static void delete_all_elements(struct bpf_htab *htab)
{
int i;
- /* It's called from a worker thread, so disable migration here,
- * since bpf_mem_cache_free() relies on that.
+ /* It's called from a worker thread and migration has been disabled,
+ * therefore, it is OK to invoke bpf_mem_cache_free() directly.
*/
- migrate_disable();
for (i = 0; i < htab->n_buckets; i++) {
struct hlist_nulls_head *head = select_bucket(htab, i);
struct hlist_nulls_node *n;
@@ -1486,11 +1513,11 @@ static void delete_all_elements(struct bpf_htab *htab)
hlist_nulls_del_rcu(&l->hash_node);
htab_elem_free(htab, l);
}
+ cond_resched();
}
- migrate_enable();
}
-static void htab_free_malloced_timers(struct bpf_htab *htab)
+static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab)
{
int i;
@@ -1502,24 +1529,29 @@ static void htab_free_malloced_timers(struct bpf_htab *htab)
hlist_nulls_for_each_entry(l, n, head, hash_node) {
/* We only free timer on uref dropping to zero */
- bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8));
+ if (btf_record_has_field(htab->map.record, BPF_TIMER))
+ bpf_obj_free_timer(htab->map.record,
+ l->key + round_up(htab->map.key_size, 8));
+ if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
+ bpf_obj_free_workqueue(htab->map.record,
+ l->key + round_up(htab->map.key_size, 8));
}
cond_resched_rcu();
}
rcu_read_unlock();
}
-static void htab_map_free_timers(struct bpf_map *map)
+static void htab_map_free_timers_and_wq(struct bpf_map *map)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- /* We only free timer on uref dropping to zero */
- if (!btf_record_has_field(htab->map.record, BPF_TIMER))
- return;
- if (!htab_is_prealloc(htab))
- htab_free_malloced_timers(htab);
- else
- htab_free_prealloced_timers(htab);
+ /* We only free timer and workqueue on uref dropping to zero */
+ if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE)) {
+ if (!htab_is_prealloc(htab))
+ htab_free_malloced_timers_and_wq(htab);
+ else
+ htab_free_prealloced_timers_and_wq(htab);
+ }
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
@@ -1534,7 +1566,7 @@ static void htab_map_free(struct bpf_map *map)
*/
/* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
- * underneath and is reponsible for waiting for callbacks to finish
+ * underneath and is responsible for waiting for callbacks to finish
* during bpf_mem_alloc_destroy().
*/
if (!htab_is_prealloc(htab)) {
@@ -1573,7 +1605,7 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
seq_puts(m, ": ");
btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
rcu_read_unlock();
}
@@ -1603,41 +1635,44 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
l = lookup_elem_raw(head, hash, key, key_size);
if (!l) {
ret = -ENOENT;
- } else {
- if (is_percpu) {
- u32 roundup_value_size = round_up(map->value_size, 8);
- void __percpu *pptr;
- int off = 0, cpu;
+ goto out_unlock;
+ }
- pptr = htab_elem_get_ptr(l, key_size);
- for_each_possible_cpu(cpu) {
- copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
- check_and_init_map_value(&htab->map, value + off);
- off += roundup_value_size;
- }
- } else {
- u32 roundup_key_size = round_up(map->key_size, 8);
+ if (is_percpu) {
+ u32 roundup_value_size = round_up(map->value_size, 8);
+ void __percpu *pptr;
+ int off = 0, cpu;
- if (flags & BPF_F_LOCK)
- copy_map_value_locked(map, value, l->key +
- roundup_key_size,
- true);
- else
- copy_map_value(map, value, l->key +
- roundup_key_size);
- /* Zeroing special fields in the temp buffer */
- check_and_init_map_value(map, value);
+ pptr = htab_elem_get_ptr(l, key_size);
+ for_each_possible_cpu(cpu) {
+ copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(&htab->map, value + off);
+ off += roundup_value_size;
}
+ } else {
+ u32 roundup_key_size = round_up(map->key_size, 8);
- hlist_nulls_del_rcu(&l->hash_node);
- if (!is_lru_map)
- free_htab_elem(htab, l);
+ if (flags & BPF_F_LOCK)
+ copy_map_value_locked(map, value, l->key +
+ roundup_key_size,
+ true);
+ else
+ copy_map_value(map, value, l->key +
+ roundup_key_size);
+ /* Zeroing special fields in the temp buffer */
+ check_and_init_map_value(map, value);
}
+ hlist_nulls_del_rcu(&l->hash_node);
+out_unlock:
htab_unlock_bucket(htab, b, hash, bflags);
- if (is_lru_map && l)
- htab_lru_push_free(htab, l);
+ if (l) {
+ if (is_lru_map)
+ htab_lru_push_free(htab, l);
+ else
+ free_htab_elem(htab, l);
+ }
return ret;
}
@@ -1836,13 +1871,14 @@ again_nocopy:
* may cause deadlock. See comments in function
* prealloc_lru_pop(). Let us do bpf_lru_push_free()
* after releasing the bucket lock.
+ *
+ * For htab of maps, htab_put_fd_value() in
+ * free_htab_elem() may acquire a spinlock with bucket
+ * lock being held and it violates the lock rule, so
+ * invoke free_htab_elem() after unlock as well.
*/
- if (is_lru_map) {
- l->batch_flink = node_to_free;
- node_to_free = l;
- } else {
- free_htab_elem(htab, l);
- }
+ l->batch_flink = node_to_free;
+ node_to_free = l;
}
dst_key += key_size;
dst_val += value_size;
@@ -1854,7 +1890,10 @@ again_nocopy:
while (node_to_free) {
l = node_to_free;
node_to_free = node_to_free->batch_flink;
- htab_lru_push_free(htab, l);
+ if (is_lru_map)
+ htab_lru_push_free(htab, l);
+ else
+ free_htab_elem(htab, l);
}
next_batch:
@@ -2169,17 +2208,18 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_
bool is_percpu;
u64 ret = 0;
+ cant_migrate();
+
if (flags != 0)
return -EINVAL;
is_percpu = htab_is_percpu(htab);
roundup_key_size = round_up(map->key_size, 8);
- /* disable migration so percpu value prepared here will be the
- * same as the one seen by the bpf program with bpf_map_lookup_elem().
+ /* migration has been disabled, so percpu value prepared here will be
+ * the same as the one seen by the bpf program with
+ * bpf_map_lookup_elem().
*/
- if (is_percpu)
- migrate_disable();
for (i = 0; i < htab->n_buckets; i++) {
b = &htab->buckets[i];
rcu_read_lock();
@@ -2205,8 +2245,6 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_
rcu_read_unlock();
}
out:
- if (is_percpu)
- migrate_enable();
return num_elems;
}
@@ -2255,7 +2293,7 @@ const struct bpf_map_ops htab_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
- .map_release_uref = htab_map_free_timers,
+ .map_release_uref = htab_map_free_timers_and_wq,
.map_lookup_elem = htab_map_lookup_elem,
.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
.map_update_elem = htab_map_update_elem,
@@ -2276,7 +2314,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
- .map_release_uref = htab_map_free_timers,
+ .map_release_uref = htab_map_free_timers_and_wq,
.map_lookup_elem = htab_lru_map_lookup_elem,
.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
@@ -2303,6 +2341,26 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
return NULL;
}
+/* inline bpf_map_lookup_elem() call for per-CPU hashmap */
+static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ if (!bpf_jit_supports_percpu_insn())
+ return -EOPNOTSUPP;
+
+ BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
+ (void *(*)(struct bpf_map *map, void *key))NULL));
+ *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3);
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0,
+ offsetof(struct htab_elem, key) + map->key_size);
+ *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
+ *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
+
+ return insn - insn_buf;
+}
+
static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
{
struct htab_elem *l;
@@ -2417,7 +2475,7 @@ static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
seq_printf(m, "\tcpu%d: ", cpu);
btf_type_seq_show(map->btf, map->btf_value_type_id,
per_cpu_ptr(pptr, cpu), m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
}
seq_puts(m, "}\n");
@@ -2431,6 +2489,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_lookup_elem = htab_percpu_map_lookup_elem,
+ .map_gen_lookup = htab_percpu_map_gen_lookup,
.map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
.map_update_elem = htab_percpu_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index d19cd863d294..f27ce162427a 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -111,7 +111,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT,
+ .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE,
};
BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value)
@@ -124,7 +124,7 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT,
+ .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE,
};
BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu)
@@ -158,6 +158,7 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
.func = bpf_get_smp_processor_id,
.gpl_only = false,
.ret_type = RET_INTEGER,
+ .allow_fastcall = true,
};
BPF_CALL_0(bpf_get_numa_node_id)
@@ -334,7 +335,7 @@ static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock)
__this_cpu_write(irqsave_flags, flags);
}
-notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+NOTRACE_BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
{
__bpf_spin_lock_irqsave(lock);
return 0;
@@ -357,7 +358,7 @@ static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock)
local_irq_restore(flags);
}
-notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+NOTRACE_BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
{
__bpf_spin_unlock_irqrestore(lock);
return 0;
@@ -517,16 +518,15 @@ static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags,
}
BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags,
- long *, res)
+ s64 *, res)
{
long long _res;
int err;
+ *res = 0;
err = __bpf_strtoll(buf, buf_len, flags, &_res);
if (err < 0)
return err;
- if (_res != (long)_res)
- return -ERANGE;
*res = _res;
return err;
}
@@ -538,23 +538,23 @@ const struct bpf_func_proto bpf_strtol_proto = {
.arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_LONG,
+ .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
+ .arg4_size = sizeof(s64),
};
BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags,
- unsigned long *, res)
+ u64 *, res)
{
unsigned long long _res;
bool is_negative;
int err;
+ *res = 0;
err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative);
if (err < 0)
return err;
if (is_negative)
return -EINVAL;
- if (_res != (unsigned long)_res)
- return -ERANGE;
*res = _res;
return err;
}
@@ -566,7 +566,8 @@ const struct bpf_func_proto bpf_strtoul_proto = {
.arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_LONG,
+ .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
+ .arg4_size = sizeof(u64),
};
BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2)
@@ -714,7 +715,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
if (cpu >= nr_cpu_ids)
return (unsigned long)NULL;
- return (unsigned long)per_cpu_ptr((const void __percpu *)ptr, cpu);
+ return (unsigned long)per_cpu_ptr((const void __percpu *)(const uintptr_t)ptr, cpu);
}
const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
@@ -727,7 +728,7 @@ const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr)
{
- return (unsigned long)this_cpu_ptr((const void __percpu *)percpu_ptr);
+ return (unsigned long)this_cpu_ptr((const void __percpu *)(const uintptr_t)percpu_ptr);
}
const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
@@ -1079,11 +1080,23 @@ const struct bpf_func_proto bpf_snprintf_proto = {
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
+struct bpf_async_cb {
+ struct bpf_map *map;
+ struct bpf_prog *prog;
+ void __rcu *callback_fn;
+ void *value;
+ union {
+ struct rcu_head rcu;
+ struct work_struct delete_work;
+ };
+ u64 flags;
+};
+
/* BPF map elements can contain 'struct bpf_timer'.
* Such map owns all of its BPF timers.
* 'struct bpf_timer' is allocated as part of map element allocation
* and it's zero initialized.
- * That space is used to keep 'struct bpf_timer_kern'.
+ * That space is used to keep 'struct bpf_async_kern'.
* bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and
* remembers 'struct bpf_map *' pointer it's part of.
* bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn.
@@ -1096,17 +1109,24 @@ const struct bpf_func_proto bpf_snprintf_proto = {
* freeing the timers when inner map is replaced or deleted by user space.
*/
struct bpf_hrtimer {
+ struct bpf_async_cb cb;
struct hrtimer timer;
- struct bpf_map *map;
- struct bpf_prog *prog;
- void __rcu *callback_fn;
- void *value;
- struct rcu_head rcu;
+ atomic_t cancelling;
+};
+
+struct bpf_work {
+ struct bpf_async_cb cb;
+ struct work_struct work;
+ struct work_struct delete_work;
};
-/* the actual struct hidden inside uapi struct bpf_timer */
-struct bpf_timer_kern {
- struct bpf_hrtimer *timer;
+/* the actual struct hidden inside uapi struct bpf_timer and bpf_wq */
+struct bpf_async_kern {
+ union {
+ struct bpf_async_cb *cb;
+ struct bpf_hrtimer *timer;
+ struct bpf_work *work;
+ };
/* bpf_spin_lock is used here instead of spinlock_t to make
* sure that it always fits into space reserved by struct bpf_timer
* regardless of LOCKDEP and spinlock debug flags.
@@ -1114,19 +1134,24 @@ struct bpf_timer_kern {
struct bpf_spin_lock lock;
} __attribute__((aligned(8)));
+enum bpf_async_type {
+ BPF_ASYNC_TYPE_TIMER = 0,
+ BPF_ASYNC_TYPE_WQ,
+};
+
static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
{
struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
- struct bpf_map *map = t->map;
- void *value = t->value;
+ struct bpf_map *map = t->cb.map;
+ void *value = t->cb.value;
bpf_callback_t callback_fn;
void *key;
u32 idx;
BTF_TYPE_EMIT(struct bpf_timer);
- callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held());
+ callback_fn = rcu_dereference_check(t->cb.callback_fn, rcu_read_lock_bh_held());
if (!callback_fn)
goto out;
@@ -1155,46 +1180,129 @@ out:
return HRTIMER_NORESTART;
}
-BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map,
- u64, flags)
+static void bpf_wq_work(struct work_struct *work)
{
- clockid_t clockid = flags & (MAX_CLOCKS - 1);
+ struct bpf_work *w = container_of(work, struct bpf_work, work);
+ struct bpf_async_cb *cb = &w->cb;
+ struct bpf_map *map = cb->map;
+ bpf_callback_t callback_fn;
+ void *value = cb->value;
+ void *key;
+ u32 idx;
+
+ BTF_TYPE_EMIT(struct bpf_wq);
+
+ callback_fn = READ_ONCE(cb->callback_fn);
+ if (!callback_fn)
+ return;
+
+ if (map->map_type == BPF_MAP_TYPE_ARRAY) {
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ /* compute the key */
+ idx = ((char *)value - array->value) / array->elem_size;
+ key = &idx;
+ } else { /* hash or lru */
+ key = value - round_up(map->key_size, 8);
+ }
+
+ rcu_read_lock_trace();
+ migrate_disable();
+
+ callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
+
+ migrate_enable();
+ rcu_read_unlock_trace();
+}
+
+static void bpf_wq_delete_work(struct work_struct *work)
+{
+ struct bpf_work *w = container_of(work, struct bpf_work, delete_work);
+
+ cancel_work_sync(&w->work);
+
+ kfree_rcu(w, cb.rcu);
+}
+
+static void bpf_timer_delete_work(struct work_struct *work)
+{
+ struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work);
+
+ /* Cancel the timer and wait for callback to complete if it was running.
+ * If hrtimer_cancel() can be safely called it's safe to call
+ * kfree_rcu(t) right after for both preallocated and non-preallocated
+ * maps. The async->cb = NULL was already done and no code path can see
+ * address 't' anymore. Timer if armed for existing bpf_hrtimer before
+ * bpf_timer_cancel_and_free will have been cancelled.
+ */
+ hrtimer_cancel(&t->timer);
+ kfree_rcu(t, cb.rcu);
+}
+
+static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
+ enum bpf_async_type type)
+{
+ struct bpf_async_cb *cb;
struct bpf_hrtimer *t;
+ struct bpf_work *w;
+ clockid_t clockid;
+ size_t size;
int ret = 0;
- BUILD_BUG_ON(MAX_CLOCKS != 16);
- BUILD_BUG_ON(sizeof(struct bpf_timer_kern) > sizeof(struct bpf_timer));
- BUILD_BUG_ON(__alignof__(struct bpf_timer_kern) != __alignof__(struct bpf_timer));
-
if (in_nmi())
return -EOPNOTSUPP;
- if (flags >= MAX_CLOCKS ||
- /* similar to timerfd except _ALARM variants are not supported */
- (clockid != CLOCK_MONOTONIC &&
- clockid != CLOCK_REALTIME &&
- clockid != CLOCK_BOOTTIME))
+ switch (type) {
+ case BPF_ASYNC_TYPE_TIMER:
+ size = sizeof(struct bpf_hrtimer);
+ break;
+ case BPF_ASYNC_TYPE_WQ:
+ size = sizeof(struct bpf_work);
+ break;
+ default:
return -EINVAL;
- __bpf_spin_lock_irqsave(&timer->lock);
- t = timer->timer;
+ }
+
+ __bpf_spin_lock_irqsave(&async->lock);
+ t = async->timer;
if (t) {
ret = -EBUSY;
goto out;
}
+
/* allocate hrtimer via map_kmalloc to use memcg accounting */
- t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node);
- if (!t) {
+ cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
+ if (!cb) {
ret = -ENOMEM;
goto out;
}
- t->value = (void *)timer - map->record->timer_off;
- t->map = map;
- t->prog = NULL;
- rcu_assign_pointer(t->callback_fn, NULL);
- hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
- t->timer.function = bpf_timer_cb;
- WRITE_ONCE(timer->timer, t);
- /* Guarantee the order between timer->timer and map->usercnt. So
+
+ switch (type) {
+ case BPF_ASYNC_TYPE_TIMER:
+ clockid = flags & (MAX_CLOCKS - 1);
+ t = (struct bpf_hrtimer *)cb;
+
+ atomic_set(&t->cancelling, 0);
+ INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work);
+ hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
+ t->timer.function = bpf_timer_cb;
+ cb->value = (void *)async - map->record->timer_off;
+ break;
+ case BPF_ASYNC_TYPE_WQ:
+ w = (struct bpf_work *)cb;
+
+ INIT_WORK(&w->work, bpf_wq_work);
+ INIT_WORK(&w->delete_work, bpf_wq_delete_work);
+ cb->value = (void *)async - map->record->wq_off;
+ break;
+ }
+ cb->map = map;
+ cb->prog = NULL;
+ cb->flags = flags;
+ rcu_assign_pointer(cb->callback_fn, NULL);
+
+ WRITE_ONCE(async->cb, cb);
+ /* Guarantee the order between async->cb and map->usercnt. So
* when there are concurrent uref release and bpf timer init, either
* bpf_timer_cancel_and_free() called by uref release reads a no-NULL
* timer or atomic64_read() below returns a zero usercnt.
@@ -1204,15 +1312,34 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map
/* maps with timers must be either held by user space
* or pinned in bpffs.
*/
- WRITE_ONCE(timer->timer, NULL);
- kfree(t);
+ WRITE_ONCE(async->cb, NULL);
+ kfree(cb);
ret = -EPERM;
}
out:
- __bpf_spin_unlock_irqrestore(&timer->lock);
+ __bpf_spin_unlock_irqrestore(&async->lock);
return ret;
}
+BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map,
+ u64, flags)
+{
+ clock_t clockid = flags & (MAX_CLOCKS - 1);
+
+ BUILD_BUG_ON(MAX_CLOCKS != 16);
+ BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer));
+ BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer));
+
+ if (flags >= MAX_CLOCKS ||
+ /* similar to timerfd except _ALARM variants are not supported */
+ (clockid != CLOCK_MONOTONIC &&
+ clockid != CLOCK_REALTIME &&
+ clockid != CLOCK_BOOTTIME))
+ return -EINVAL;
+
+ return __bpf_async_init(timer, map, flags, BPF_ASYNC_TYPE_TIMER);
+}
+
static const struct bpf_func_proto bpf_timer_init_proto = {
.func = bpf_timer_init,
.gpl_only = true,
@@ -1222,22 +1349,23 @@ static const struct bpf_func_proto bpf_timer_init_proto = {
.arg3_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callback_fn,
- struct bpf_prog_aux *, aux)
+static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn,
+ struct bpf_prog_aux *aux, unsigned int flags,
+ enum bpf_async_type type)
{
struct bpf_prog *prev, *prog = aux->prog;
- struct bpf_hrtimer *t;
+ struct bpf_async_cb *cb;
int ret = 0;
if (in_nmi())
return -EOPNOTSUPP;
- __bpf_spin_lock_irqsave(&timer->lock);
- t = timer->timer;
- if (!t) {
+ __bpf_spin_lock_irqsave(&async->lock);
+ cb = async->cb;
+ if (!cb) {
ret = -EINVAL;
goto out;
}
- if (!atomic64_read(&t->map->usercnt)) {
+ if (!atomic64_read(&cb->map->usercnt)) {
/* maps with timers must be either held by user space
* or pinned in bpffs. Otherwise timer might still be
* running even when bpf prog is detached and user space
@@ -1246,7 +1374,7 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb
ret = -EPERM;
goto out;
}
- prev = t->prog;
+ prev = cb->prog;
if (prev != prog) {
/* Bump prog refcnt once. Every bpf_timer_set_callback()
* can pick different callback_fn-s within the same prog.
@@ -1259,14 +1387,20 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb
if (prev)
/* Drop prev prog refcnt when swapping with new prog */
bpf_prog_put(prev);
- t->prog = prog;
+ cb->prog = prog;
}
- rcu_assign_pointer(t->callback_fn, callback_fn);
+ rcu_assign_pointer(cb->callback_fn, callback_fn);
out:
- __bpf_spin_unlock_irqrestore(&timer->lock);
+ __bpf_spin_unlock_irqrestore(&async->lock);
return ret;
}
+BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn,
+ struct bpf_prog_aux *, aux)
+{
+ return __bpf_async_set_callback(timer, callback_fn, aux, 0, BPF_ASYNC_TYPE_TIMER);
+}
+
static const struct bpf_func_proto bpf_timer_set_callback_proto = {
.func = bpf_timer_set_callback,
.gpl_only = true,
@@ -1275,7 +1409,7 @@ static const struct bpf_func_proto bpf_timer_set_callback_proto = {
.arg2_type = ARG_PTR_TO_FUNC,
};
-BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, flags)
+BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, flags)
{
struct bpf_hrtimer *t;
int ret = 0;
@@ -1287,7 +1421,7 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla
return -EINVAL;
__bpf_spin_lock_irqsave(&timer->lock);
t = timer->timer;
- if (!t || !t->prog) {
+ if (!t || !t->cb.prog) {
ret = -EINVAL;
goto out;
}
@@ -1315,20 +1449,21 @@ static const struct bpf_func_proto bpf_timer_start_proto = {
.arg3_type = ARG_ANYTHING,
};
-static void drop_prog_refcnt(struct bpf_hrtimer *t)
+static void drop_prog_refcnt(struct bpf_async_cb *async)
{
- struct bpf_prog *prog = t->prog;
+ struct bpf_prog *prog = async->prog;
if (prog) {
bpf_prog_put(prog);
- t->prog = NULL;
- rcu_assign_pointer(t->callback_fn, NULL);
+ async->prog = NULL;
+ rcu_assign_pointer(async->callback_fn, NULL);
}
}
-BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
+BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
{
- struct bpf_hrtimer *t;
+ struct bpf_hrtimer *t, *cur_t;
+ bool inc = false;
int ret = 0;
if (in_nmi())
@@ -1340,21 +1475,50 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
ret = -EINVAL;
goto out;
}
- if (this_cpu_read(hrtimer_running) == t) {
+
+ cur_t = this_cpu_read(hrtimer_running);
+ if (cur_t == t) {
/* If bpf callback_fn is trying to bpf_timer_cancel()
* its own timer the hrtimer_cancel() will deadlock
- * since it waits for callback_fn to finish
+ * since it waits for callback_fn to finish.
*/
ret = -EDEADLK;
goto out;
}
- drop_prog_refcnt(t);
+
+ /* Only account in-flight cancellations when invoked from a timer
+ * callback, since we want to avoid waiting only if other _callbacks_
+ * are waiting on us, to avoid introducing lockups. Non-callback paths
+ * are ok, since nobody would synchronously wait for their completion.
+ */
+ if (!cur_t)
+ goto drop;
+ atomic_inc(&t->cancelling);
+ /* Need full barrier after relaxed atomic_inc */
+ smp_mb__after_atomic();
+ inc = true;
+ if (atomic_read(&cur_t->cancelling)) {
+ /* We're cancelling timer t, while some other timer callback is
+ * attempting to cancel us. In such a case, it might be possible
+ * that timer t belongs to the other callback, or some other
+ * callback waiting upon it (creating transitive dependencies
+ * upon us), and we will enter a deadlock if we continue
+ * cancelling and waiting for it synchronously, since it might
+ * do the same. Bail!
+ */
+ ret = -EDEADLK;
+ goto out;
+ }
+drop:
+ drop_prog_refcnt(&t->cb);
out:
__bpf_spin_unlock_irqrestore(&timer->lock);
/* Cancel the timer and wait for associated callback to finish
* if it was running.
*/
ret = ret ?: hrtimer_cancel(&t->timer);
+ if (inc)
+ atomic_dec(&t->cancelling);
rcu_read_unlock();
return ret;
}
@@ -1366,57 +1530,114 @@ static const struct bpf_func_proto bpf_timer_cancel_proto = {
.arg1_type = ARG_PTR_TO_TIMER,
};
-/* This function is called by map_delete/update_elem for individual element and
- * by ops->map_release_uref when the user space reference to a map reaches zero.
- */
-void bpf_timer_cancel_and_free(void *val)
+static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *async)
{
- struct bpf_timer_kern *timer = val;
- struct bpf_hrtimer *t;
+ struct bpf_async_cb *cb;
- /* Performance optimization: read timer->timer without lock first. */
- if (!READ_ONCE(timer->timer))
- return;
+ /* Performance optimization: read async->cb without lock first. */
+ if (!READ_ONCE(async->cb))
+ return NULL;
- __bpf_spin_lock_irqsave(&timer->lock);
+ __bpf_spin_lock_irqsave(&async->lock);
/* re-read it under lock */
- t = timer->timer;
- if (!t)
+ cb = async->cb;
+ if (!cb)
goto out;
- drop_prog_refcnt(t);
+ drop_prog_refcnt(cb);
/* The subsequent bpf_timer_start/cancel() helpers won't be able to use
* this timer, since it won't be initialized.
*/
- WRITE_ONCE(timer->timer, NULL);
+ WRITE_ONCE(async->cb, NULL);
out:
- __bpf_spin_unlock_irqrestore(&timer->lock);
+ __bpf_spin_unlock_irqrestore(&async->lock);
+ return cb;
+}
+
+/* This function is called by map_delete/update_elem for individual element and
+ * by ops->map_release_uref when the user space reference to a map reaches zero.
+ */
+void bpf_timer_cancel_and_free(void *val)
+{
+ struct bpf_hrtimer *t;
+
+ t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val);
+
if (!t)
return;
- /* Cancel the timer and wait for callback to complete if it was running.
- * If hrtimer_cancel() can be safely called it's safe to call kfree(t)
- * right after for both preallocated and non-preallocated maps.
- * The timer->timer = NULL was already done and no code path can
- * see address 't' anymore.
- *
- * Check that bpf_map_delete/update_elem() wasn't called from timer
- * callback_fn. In such case don't call hrtimer_cancel() (since it will
- * deadlock) and don't call hrtimer_try_to_cancel() (since it will just
- * return -1). Though callback_fn is still running on this cpu it's
+ /* We check that bpf_map_delete/update_elem() was called from timer
+ * callback_fn. In such case we don't call hrtimer_cancel() (since it
+ * will deadlock) and don't call hrtimer_try_to_cancel() (since it will
+ * just return -1). Though callback_fn is still running on this cpu it's
* safe to do kfree(t) because bpf_timer_cb() read everything it needed
* from 't'. The bpf subprog callback_fn won't be able to access 't',
- * since timer->timer = NULL was already done. The timer will be
+ * since async->cb = NULL was already done. The timer will be
* effectively cancelled because bpf_timer_cb() will return
* HRTIMER_NORESTART.
+ *
+ * However, it is possible the timer callback_fn calling us armed the
+ * timer _before_ calling us, such that failing to cancel it here will
+ * cause it to possibly use struct hrtimer after freeing bpf_hrtimer.
+ * Therefore, we _need_ to cancel any outstanding timers before we do
+ * kfree_rcu, even though no more timers can be armed.
+ *
+ * Moreover, we need to schedule work even if timer does not belong to
+ * the calling callback_fn, as on two different CPUs, we can end up in a
+ * situation where both sides run in parallel, try to cancel one
+ * another, and we end up waiting on both sides in hrtimer_cancel
+ * without making forward progress, since timer1 depends on time2
+ * callback to finish, and vice versa.
+ *
+ * CPU 1 (timer1_cb) CPU 2 (timer2_cb)
+ * bpf_timer_cancel_and_free(timer2) bpf_timer_cancel_and_free(timer1)
+ *
+ * To avoid these issues, punt to workqueue context when we are in a
+ * timer callback.
+ */
+ if (this_cpu_read(hrtimer_running)) {
+ queue_work(system_unbound_wq, &t->cb.delete_work);
+ return;
+ }
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ /* If the timer is running on other CPU, also use a kworker to
+ * wait for the completion of the timer instead of trying to
+ * acquire a sleepable lock in hrtimer_cancel() to wait for its
+ * completion.
+ */
+ if (hrtimer_try_to_cancel(&t->timer) >= 0)
+ kfree_rcu(t, cb.rcu);
+ else
+ queue_work(system_unbound_wq, &t->cb.delete_work);
+ } else {
+ bpf_timer_delete_work(&t->cb.delete_work);
+ }
+}
+
+/* This function is called by map_delete/update_elem for individual element and
+ * by ops->map_release_uref when the user space reference to a map reaches zero.
+ */
+void bpf_wq_cancel_and_free(void *val)
+{
+ struct bpf_work *work;
+
+ BTF_TYPE_EMIT(struct bpf_wq);
+
+ work = (struct bpf_work *)__bpf_async_cancel_and_free(val);
+ if (!work)
+ return;
+ /* Trigger cancel of the sleepable work, but *do not* wait for
+ * it to finish if it was running as we might not be in a
+ * sleepable context.
+ * kfree will be called once the work has finished.
*/
- if (this_cpu_read(hrtimer_running) != t)
- hrtimer_cancel(&t->timer);
- kfree_rcu(t, rcu);
+ schedule_work(&work->delete_work);
}
-BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
+BPF_CALL_2(bpf_kptr_xchg, void *, dst, void *, ptr)
{
- unsigned long *kptr = map_value;
+ unsigned long *kptr = dst;
+ /* This helper may be inlined by verifier. */
return xchg(kptr, (unsigned long)ptr);
}
@@ -1429,7 +1650,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = {
.gpl_only = false,
.ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
.ret_btf_id = BPF_PTR_POISON,
- .arg1_type = ARG_PTR_TO_KPTR,
+ .arg1_type = ARG_KPTR_XCHG_DEST,
.arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE,
.arg2_btf_id = BPF_PTR_POISON,
};
@@ -1442,7 +1663,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = {
#define DYNPTR_SIZE_MASK 0xFFFFFF
#define DYNPTR_RDONLY_BIT BIT(31)
-static bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
+bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
{
return ptr->size & DYNPTR_RDONLY_BIT;
}
@@ -1535,7 +1756,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT,
+ .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE,
};
BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
@@ -1682,7 +1903,7 @@ const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
const struct bpf_func_proto bpf_task_pt_regs_proto __weak;
const struct bpf_func_proto *
-bpf_base_func_proto(enum bpf_func_id func_id)
+bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_map_lookup_elem:
@@ -1729,11 +1950,15 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_strtol_proto;
case BPF_FUNC_strtoul:
return &bpf_strtoul_proto;
+ case BPF_FUNC_get_current_pid_tgid:
+ return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_get_ns_current_pid_tgid:
+ return &bpf_get_ns_current_pid_tgid_proto;
default:
break;
}
- if (!bpf_capable())
+ if (!bpf_token_capable(prog->aux->token, CAP_BPF))
return NULL;
switch (func_id) {
@@ -1791,7 +2016,7 @@ bpf_base_func_proto(enum bpf_func_id func_id)
break;
}
- if (!perfmon_capable())
+ if (!bpf_token_capable(prog->aux->token, CAP_PERFMON))
return NULL;
switch (func_id) {
@@ -1823,6 +2048,7 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return NULL;
}
}
+EXPORT_SYMBOL_GPL(bpf_base_func_proto);
void bpf_list_head_free(const struct btf_field *field, void *list_head,
struct bpf_spin_lock *spin_lock)
@@ -1854,9 +2080,7 @@ unlock:
/* The contained type can also have resources, including a
* bpf_list_head which needs to be freed.
*/
- migrate_disable();
__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
- migrate_enable();
}
}
@@ -1893,9 +2117,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
obj -= field->graph_root.node_offset;
- migrate_disable();
__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
- migrate_enable();
}
}
@@ -2247,6 +2469,29 @@ __bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task,
return ret;
}
+BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct cgroup *cgrp;
+
+ if (unlikely(idx >= array->map.max_entries))
+ return -E2BIG;
+
+ cgrp = READ_ONCE(array->ptrs[idx]);
+ if (unlikely(!cgrp))
+ return -EAGAIN;
+
+ return task_under_cgroup_hierarchy(current, cgrp);
+}
+
+const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
+ .func = bpf_current_task_under_cgroup,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+};
+
/**
* bpf_task_get_cgroup1 - Acquires the associated cgroup of a task within a
* specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its
@@ -2287,8 +2532,27 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
}
/**
+ * bpf_task_from_vpid - Find a struct task_struct from its vpid by looking it up
+ * in the pid namespace of the current task. If a task is returned, it must
+ * either be stored in a map, or released with bpf_task_release().
+ * @vpid: The vpid of the task being looked up.
+ */
+__bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid)
+{
+ struct task_struct *p;
+
+ rcu_read_lock();
+ p = find_task_by_vpid(vpid);
+ if (p)
+ p = bpf_task_acquire(p);
+ rcu_read_unlock();
+
+ return p;
+}
+
+/**
* bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data.
- * @ptr: The dynptr whose data slice to retrieve
+ * @p: The dynptr whose data slice to retrieve
* @offset: Offset into the dynptr
* @buffer__opt: User-provided buffer to copy contents into. May be NULL
* @buffer__szk: Size (in bytes) of the buffer if present. This is the
@@ -2314,9 +2578,10 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
* provided buffer, with its contents containing the data, if unable to obtain
* direct pointer)
*/
-__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset,
+__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset,
void *buffer__opt, u32 buffer__szk)
{
+ const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
enum bpf_dynptr_type type;
u32 len = buffer__szk;
int err;
@@ -2358,7 +2623,7 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
/**
* bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data.
- * @ptr: The dynptr whose data slice to retrieve
+ * @p: The dynptr whose data slice to retrieve
* @offset: Offset into the dynptr
* @buffer__opt: User-provided buffer to copy contents into. May be NULL
* @buffer__szk: Size (in bytes) of the buffer if present. This is the
@@ -2398,16 +2663,18 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
* provided buffer, with its contents containing the data, if unable to obtain
* direct pointer)
*/
-__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset,
+__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset,
void *buffer__opt, u32 buffer__szk)
{
+ const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
if (!ptr->data || __bpf_dynptr_is_rdonly(ptr))
return NULL;
/* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice.
*
* For skb-type dynptrs, it is safe to write into the returned pointer
- * if the bpf program allows skb data writes. There are two possiblities
+ * if the bpf program allows skb data writes. There are two possibilities
* that may occur when calling bpf_dynptr_slice_rdwr:
*
* 1) The requested slice is in the head of the skb. In this case, the
@@ -2426,11 +2693,12 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 o
* will be copied out into the buffer and the user will need to call
* bpf_dynptr_write() to commit changes.
*/
- return bpf_dynptr_slice(ptr, offset, buffer__opt, buffer__szk);
+ return bpf_dynptr_slice(p, offset, buffer__opt, buffer__szk);
}
-__bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 end)
+__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u32 start, u32 end)
{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
u32 size;
if (!ptr->data || start > end)
@@ -2447,36 +2715,45 @@ __bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 en
return 0;
}
-__bpf_kfunc bool bpf_dynptr_is_null(struct bpf_dynptr_kern *ptr)
+__bpf_kfunc bool bpf_dynptr_is_null(const struct bpf_dynptr *p)
{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
return !ptr->data;
}
-__bpf_kfunc bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr)
+__bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p)
{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
if (!ptr->data)
return false;
return __bpf_dynptr_is_rdonly(ptr);
}
-__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
+__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr *p)
{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
if (!ptr->data)
return -EINVAL;
return __bpf_dynptr_size(ptr);
}
-__bpf_kfunc int bpf_dynptr_clone(struct bpf_dynptr_kern *ptr,
- struct bpf_dynptr_kern *clone__uninit)
+__bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p,
+ struct bpf_dynptr *clone__uninit)
{
+ struct bpf_dynptr_kern *clone = (struct bpf_dynptr_kern *)clone__uninit;
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
if (!ptr->data) {
- bpf_dynptr_set_null(clone__uninit);
+ bpf_dynptr_set_null(clone);
return -EINVAL;
}
- *clone__uninit = *ptr;
+ *clone = *ptr;
return 0;
}
@@ -2486,9 +2763,9 @@ __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
return obj;
}
-__bpf_kfunc void *bpf_rdonly_cast(void *obj__ign, u32 btf_id__k)
+__bpf_kfunc void *bpf_rdonly_cast(const void *obj__ign, u32 btf_id__k)
{
- return obj__ign;
+ return (void *)obj__ign;
}
__bpf_kfunc void bpf_rcu_read_lock(void)
@@ -2544,10 +2821,271 @@ __bpf_kfunc void bpf_throw(u64 cookie)
WARN(1, "A call to BPF exception callback should never return\n");
}
+__bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags)
+{
+ struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
+ struct bpf_map *map = p__map;
+
+ BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_wq));
+ BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_wq));
+
+ if (flags)
+ return -EINVAL;
+
+ return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ);
+}
+
+__bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags)
+{
+ struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
+ struct bpf_work *w;
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+ if (flags)
+ return -EINVAL;
+ w = READ_ONCE(async->work);
+ if (!w || !READ_ONCE(w->cb.prog))
+ return -EINVAL;
+
+ schedule_work(&w->work);
+ return 0;
+}
+
+__bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq,
+ int (callback_fn)(void *map, int *key, void *value),
+ unsigned int flags,
+ void *aux__ign)
+{
+ struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__ign;
+ struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
+
+ if (flags)
+ return -EINVAL;
+
+ return __bpf_async_set_callback(async, callback_fn, aux, flags, BPF_ASYNC_TYPE_WQ);
+}
+
+__bpf_kfunc void bpf_preempt_disable(void)
+{
+ preempt_disable();
+}
+
+__bpf_kfunc void bpf_preempt_enable(void)
+{
+ preempt_enable();
+}
+
+struct bpf_iter_bits {
+ __u64 __opaque[2];
+} __aligned(8);
+
+#define BITS_ITER_NR_WORDS_MAX 511
+
+struct bpf_iter_bits_kern {
+ union {
+ __u64 *bits;
+ __u64 bits_copy;
+ };
+ int nr_bits;
+ int bit;
+} __aligned(8);
+
+/* On 64-bit hosts, unsigned long and u64 have the same size, so passing
+ * a u64 pointer and an unsigned long pointer to find_next_bit() will
+ * return the same result, as both point to the same 8-byte area.
+ *
+ * For 32-bit little-endian hosts, using a u64 pointer or unsigned long
+ * pointer also makes no difference. This is because the first iterated
+ * unsigned long is composed of bits 0-31 of the u64 and the second unsigned
+ * long is composed of bits 32-63 of the u64.
+ *
+ * However, for 32-bit big-endian hosts, this is not the case. The first
+ * iterated unsigned long will be bits 32-63 of the u64, so swap these two
+ * ulong values within the u64.
+ */
+static void swap_ulong_in_u64(u64 *bits, unsigned int nr)
+{
+#if (BITS_PER_LONG == 32) && defined(__BIG_ENDIAN)
+ unsigned int i;
+
+ for (i = 0; i < nr; i++)
+ bits[i] = (bits[i] >> 32) | ((u64)(u32)bits[i] << 32);
+#endif
+}
+
+/**
+ * bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area
+ * @it: The new bpf_iter_bits to be created
+ * @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over
+ * @nr_words: The size of the specified memory area, measured in 8-byte units.
+ * The maximum value of @nr_words is @BITS_ITER_NR_WORDS_MAX. This limit may be
+ * further reduced by the BPF memory allocator implementation.
+ *
+ * This function initializes a new bpf_iter_bits structure for iterating over
+ * a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It
+ * copies the data of the memory area to the newly created bpf_iter_bits @it for
+ * subsequent iteration operations.
+ *
+ * On success, 0 is returned. On failure, ERR is returned.
+ */
+__bpf_kfunc int
+bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_words)
+{
+ struct bpf_iter_bits_kern *kit = (void *)it;
+ u32 nr_bytes = nr_words * sizeof(u64);
+ u32 nr_bits = BYTES_TO_BITS(nr_bytes);
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_bits_kern) != sizeof(struct bpf_iter_bits));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_bits_kern) !=
+ __alignof__(struct bpf_iter_bits));
+
+ kit->nr_bits = 0;
+ kit->bits_copy = 0;
+ kit->bit = -1;
+
+ if (!unsafe_ptr__ign || !nr_words)
+ return -EINVAL;
+ if (nr_words > BITS_ITER_NR_WORDS_MAX)
+ return -E2BIG;
+
+ /* Optimization for u64 mask */
+ if (nr_bits == 64) {
+ err = bpf_probe_read_kernel_common(&kit->bits_copy, nr_bytes, unsafe_ptr__ign);
+ if (err)
+ return -EFAULT;
+
+ swap_ulong_in_u64(&kit->bits_copy, nr_words);
+
+ kit->nr_bits = nr_bits;
+ return 0;
+ }
+
+ if (bpf_mem_alloc_check_size(false, nr_bytes))
+ return -E2BIG;
+
+ /* Fallback to memalloc */
+ kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes);
+ if (!kit->bits)
+ return -ENOMEM;
+
+ err = bpf_probe_read_kernel_common(kit->bits, nr_bytes, unsafe_ptr__ign);
+ if (err) {
+ bpf_mem_free(&bpf_global_ma, kit->bits);
+ return err;
+ }
+
+ swap_ulong_in_u64(kit->bits, nr_words);
+
+ kit->nr_bits = nr_bits;
+ return 0;
+}
+
+/**
+ * bpf_iter_bits_next() - Get the next bit in a bpf_iter_bits
+ * @it: The bpf_iter_bits to be checked
+ *
+ * This function returns a pointer to a number representing the value of the
+ * next bit in the bits.
+ *
+ * If there are no further bits available, it returns NULL.
+ */
+__bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it)
+{
+ struct bpf_iter_bits_kern *kit = (void *)it;
+ int bit = kit->bit, nr_bits = kit->nr_bits;
+ const void *bits;
+
+ if (!nr_bits || bit >= nr_bits)
+ return NULL;
+
+ bits = nr_bits == 64 ? &kit->bits_copy : kit->bits;
+ bit = find_next_bit(bits, nr_bits, bit + 1);
+ if (bit >= nr_bits) {
+ kit->bit = bit;
+ return NULL;
+ }
+
+ kit->bit = bit;
+ return &kit->bit;
+}
+
+/**
+ * bpf_iter_bits_destroy() - Destroy a bpf_iter_bits
+ * @it: The bpf_iter_bits to be destroyed
+ *
+ * Destroy the resource associated with the bpf_iter_bits.
+ */
+__bpf_kfunc void bpf_iter_bits_destroy(struct bpf_iter_bits *it)
+{
+ struct bpf_iter_bits_kern *kit = (void *)it;
+
+ if (kit->nr_bits <= 64)
+ return;
+ bpf_mem_free(&bpf_global_ma, kit->bits);
+}
+
+/**
+ * bpf_copy_from_user_str() - Copy a string from an unsafe user address
+ * @dst: Destination address, in kernel space. This buffer must be
+ * at least @dst__sz bytes long.
+ * @dst__sz: Maximum number of bytes to copy, includes the trailing NUL.
+ * @unsafe_ptr__ign: Source address, in user space.
+ * @flags: The only supported flag is BPF_F_PAD_ZEROS
+ *
+ * Copies a NUL-terminated string from userspace to BPF space. If user string is
+ * too long this will still ensure zero termination in the dst buffer unless
+ * buffer size is 0.
+ *
+ * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst to 0 on success and
+ * memset all of @dst on failure.
+ */
+__bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user *unsafe_ptr__ign, u64 flags)
+{
+ int ret;
+
+ if (unlikely(flags & ~BPF_F_PAD_ZEROS))
+ return -EINVAL;
+
+ if (unlikely(!dst__sz))
+ return 0;
+
+ ret = strncpy_from_user(dst, unsafe_ptr__ign, dst__sz - 1);
+ if (ret < 0) {
+ if (flags & BPF_F_PAD_ZEROS)
+ memset((char *)dst, 0, dst__sz);
+
+ return ret;
+ }
+
+ if (flags & BPF_F_PAD_ZEROS)
+ memset((char *)dst + ret, 0, dst__sz - ret);
+ else
+ ((char *)dst)[ret] = '\0';
+
+ return ret + 1;
+}
+
+/* Keep unsinged long in prototype so that kfunc is usable when emitted to
+ * vmlinux.h in BPF programs directly, but note that while in BPF prog, the
+ * unsigned long always points to 8-byte region on stack, the kernel may only
+ * read and write the 4-bytes on 32-bit.
+ */
+__bpf_kfunc void bpf_local_irq_save(unsigned long *flags__irq_flag)
+{
+ local_irq_save(*flags__irq_flag);
+}
+
+__bpf_kfunc void bpf_local_irq_restore(unsigned long *flags__irq_flag)
+{
+ local_irq_restore(*flags__irq_flag);
+}
+
__bpf_kfunc_end_defs();
-BTF_SET8_START(generic_btf_ids)
-#ifdef CONFIG_KEXEC_CORE
+BTF_KFUNCS_START(generic_btf_ids)
+#ifdef CONFIG_CRASH_DUMP
BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
#endif
BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
@@ -2574,8 +3112,12 @@ BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU)
BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
#endif
BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_throw)
-BTF_SET8_END(generic_btf_ids)
+#ifdef CONFIG_BPF_EVENTS
+BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS)
+#endif
+BTF_KFUNCS_END(generic_btf_ids)
static const struct btf_kfunc_id_set generic_kfunc_set = {
.owner = THIS_MODULE,
@@ -2591,9 +3133,9 @@ BTF_ID(struct, cgroup)
BTF_ID(func, bpf_cgroup_release_dtor)
#endif
-BTF_SET8_START(common_btf_ids)
-BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx)
-BTF_ID_FLAGS(func, bpf_rdonly_cast)
+BTF_KFUNCS_START(common_btf_ids)
+BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx, KF_FASTCALL)
+BTF_ID_FLAGS(func, bpf_rdonly_cast, KF_FASTCALL)
BTF_ID_FLAGS(func, bpf_rcu_read_lock)
BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
@@ -2620,7 +3162,25 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_null)
BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
BTF_ID_FLAGS(func, bpf_dynptr_size)
BTF_ID_FLAGS(func, bpf_dynptr_clone)
-BTF_SET8_END(common_btf_ids)
+#ifdef CONFIG_NET
+BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
+#endif
+BTF_ID_FLAGS(func, bpf_wq_init)
+BTF_ID_FLAGS(func, bpf_wq_set_callback_impl)
+BTF_ID_FLAGS(func, bpf_wq_start)
+BTF_ID_FLAGS(func, bpf_preempt_disable)
+BTF_ID_FLAGS(func, bpf_preempt_enable)
+BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW)
+BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_get_kmem_cache)
+BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_local_irq_save)
+BTF_ID_FLAGS(func, bpf_local_irq_restore)
+BTF_KFUNCS_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
.owner = THIS_MODULE,
@@ -2647,6 +3207,8 @@ static int __init kfunc_init(void)
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &generic_kfunc_set);
ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
ARRAY_SIZE(generic_dtors),
THIS_MODULE);
@@ -2660,7 +3222,9 @@ late_initcall(kfunc_init);
*/
const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len)
{
- return bpf_dynptr_slice(ptr, 0, NULL, len);
+ const struct bpf_dynptr *p = (struct bpf_dynptr *)ptr;
+
+ return bpf_dynptr_slice(p, 0, NULL, len);
}
/* Get a pointer to dynptr data up to len bytes for read write access. If
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 41e0a55c35f5..9aaf5124648b 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -20,6 +20,7 @@
#include <linux/filter.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
+#include <linux/kstrtox.h>
#include "preload/bpf_preload.h"
enum bpf_type {
@@ -98,9 +99,9 @@ static const struct inode_operations bpf_prog_iops = { };
static const struct inode_operations bpf_map_iops = { };
static const struct inode_operations bpf_link_iops = { };
-static struct inode *bpf_get_inode(struct super_block *sb,
- const struct inode *dir,
- umode_t mode)
+struct inode *bpf_get_inode(struct super_block *sb,
+ const struct inode *dir,
+ umode_t mode)
{
struct inode *inode;
@@ -594,6 +595,136 @@ struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type typ
}
EXPORT_SYMBOL(bpf_prog_get_type_path);
+struct bpffs_btf_enums {
+ const struct btf *btf;
+ const struct btf_type *cmd_t;
+ const struct btf_type *map_t;
+ const struct btf_type *prog_t;
+ const struct btf_type *attach_t;
+};
+
+static int find_bpffs_btf_enums(struct bpffs_btf_enums *info)
+{
+ const struct btf *btf;
+ const struct btf_type *t;
+ const char *name;
+ int i, n;
+
+ memset(info, 0, sizeof(*info));
+
+ btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+ if (!btf)
+ return -ENOENT;
+
+ info->btf = btf;
+
+ for (i = 1, n = btf_nr_types(btf); i < n; i++) {
+ t = btf_type_by_id(btf, i);
+ if (!btf_type_is_enum(t))
+ continue;
+
+ name = btf_name_by_offset(btf, t->name_off);
+ if (!name)
+ continue;
+
+ if (strcmp(name, "bpf_cmd") == 0)
+ info->cmd_t = t;
+ else if (strcmp(name, "bpf_map_type") == 0)
+ info->map_t = t;
+ else if (strcmp(name, "bpf_prog_type") == 0)
+ info->prog_t = t;
+ else if (strcmp(name, "bpf_attach_type") == 0)
+ info->attach_t = t;
+ else
+ continue;
+
+ if (info->cmd_t && info->map_t && info->prog_t && info->attach_t)
+ return 0;
+ }
+
+ return -ESRCH;
+}
+
+static bool find_btf_enum_const(const struct btf *btf, const struct btf_type *enum_t,
+ const char *prefix, const char *str, int *value)
+{
+ const struct btf_enum *e;
+ const char *name;
+ int i, n, pfx_len = strlen(prefix);
+
+ *value = 0;
+
+ if (!btf || !enum_t)
+ return false;
+
+ for (i = 0, n = btf_vlen(enum_t); i < n; i++) {
+ e = &btf_enum(enum_t)[i];
+
+ name = btf_name_by_offset(btf, e->name_off);
+ if (!name || strncasecmp(name, prefix, pfx_len) != 0)
+ continue;
+
+ /* match symbolic name case insensitive and ignoring prefix */
+ if (strcasecmp(name + pfx_len, str) == 0) {
+ *value = e->val;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static void seq_print_delegate_opts(struct seq_file *m,
+ const char *opt_name,
+ const struct btf *btf,
+ const struct btf_type *enum_t,
+ const char *prefix,
+ u64 delegate_msk, u64 any_msk)
+{
+ const struct btf_enum *e;
+ bool first = true;
+ const char *name;
+ u64 msk;
+ int i, n, pfx_len = strlen(prefix);
+
+ delegate_msk &= any_msk; /* clear unknown bits */
+
+ if (delegate_msk == 0)
+ return;
+
+ seq_printf(m, ",%s", opt_name);
+ if (delegate_msk == any_msk) {
+ seq_printf(m, "=any");
+ return;
+ }
+
+ if (btf && enum_t) {
+ for (i = 0, n = btf_vlen(enum_t); i < n; i++) {
+ e = &btf_enum(enum_t)[i];
+ name = btf_name_by_offset(btf, e->name_off);
+ if (!name || strncasecmp(name, prefix, pfx_len) != 0)
+ continue;
+ msk = 1ULL << e->val;
+ if (delegate_msk & msk) {
+ /* emit lower-case name without prefix */
+ seq_putc(m, first ? '=' : ':');
+ name += pfx_len;
+ while (*name) {
+ seq_putc(m, tolower(*name));
+ name++;
+ }
+
+ delegate_msk &= ~msk;
+ first = false;
+ }
+ }
+ }
+ if (delegate_msk)
+ seq_printf(m, "%c0x%llx", first ? '=' : ':', delegate_msk);
+}
+
/*
* Display the mount options in /proc/mounts.
*/
@@ -601,6 +732,8 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
{
struct inode *inode = d_inode(root);
umode_t mode = inode->i_mode & S_IALLUGO & ~S_ISVTX;
+ struct bpf_mount_opts *opts = root->d_sb->s_fs_info;
+ u64 mask;
if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID))
seq_printf(m, ",uid=%u",
@@ -610,6 +743,35 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
from_kgid_munged(&init_user_ns, inode->i_gid));
if (mode != S_IRWXUGO)
seq_printf(m, ",mode=%o", mode);
+
+ if (opts->delegate_cmds || opts->delegate_maps ||
+ opts->delegate_progs || opts->delegate_attachs) {
+ struct bpffs_btf_enums info;
+
+ /* ignore errors, fallback to hex */
+ (void)find_bpffs_btf_enums(&info);
+
+ mask = (1ULL << __MAX_BPF_CMD) - 1;
+ seq_print_delegate_opts(m, "delegate_cmds",
+ info.btf, info.cmd_t, "BPF_",
+ opts->delegate_cmds, mask);
+
+ mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1;
+ seq_print_delegate_opts(m, "delegate_maps",
+ info.btf, info.map_t, "BPF_MAP_TYPE_",
+ opts->delegate_maps, mask);
+
+ mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1;
+ seq_print_delegate_opts(m, "delegate_progs",
+ info.btf, info.prog_t, "BPF_PROG_TYPE_",
+ opts->delegate_progs, mask);
+
+ mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1;
+ seq_print_delegate_opts(m, "delegate_attachs",
+ info.btf, info.attach_t, "BPF_",
+ opts->delegate_attachs, mask);
+ }
+
return 0;
}
@@ -624,7 +786,7 @@ static void bpf_free_inode(struct inode *inode)
free_inode_nonrcu(inode);
}
-static const struct super_operations bpf_super_ops = {
+const struct super_operations bpf_super_ops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
.show_options = bpf_show_options,
@@ -635,28 +797,30 @@ enum {
OPT_UID,
OPT_GID,
OPT_MODE,
+ OPT_DELEGATE_CMDS,
+ OPT_DELEGATE_MAPS,
+ OPT_DELEGATE_PROGS,
+ OPT_DELEGATE_ATTACHS,
};
static const struct fs_parameter_spec bpf_fs_parameters[] = {
fsparam_u32 ("uid", OPT_UID),
fsparam_u32 ("gid", OPT_GID),
fsparam_u32oct ("mode", OPT_MODE),
+ fsparam_string ("delegate_cmds", OPT_DELEGATE_CMDS),
+ fsparam_string ("delegate_maps", OPT_DELEGATE_MAPS),
+ fsparam_string ("delegate_progs", OPT_DELEGATE_PROGS),
+ fsparam_string ("delegate_attachs", OPT_DELEGATE_ATTACHS),
{}
};
-struct bpf_mount_opts {
- kuid_t uid;
- kgid_t gid;
- umode_t mode;
-};
-
static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- struct bpf_mount_opts *opts = fc->fs_private;
+ struct bpf_mount_opts *opts = fc->s_fs_info;
struct fs_parse_result result;
kuid_t uid;
kgid_t gid;
- int opt;
+ int opt, err;
opt = fs_parse(fc, bpf_fs_parameters, param, &result);
if (opt < 0) {
@@ -708,6 +872,68 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
case OPT_MODE:
opts->mode = result.uint_32 & S_IALLUGO;
break;
+ case OPT_DELEGATE_CMDS:
+ case OPT_DELEGATE_MAPS:
+ case OPT_DELEGATE_PROGS:
+ case OPT_DELEGATE_ATTACHS: {
+ struct bpffs_btf_enums info;
+ const struct btf_type *enum_t;
+ const char *enum_pfx;
+ u64 *delegate_msk, msk = 0;
+ char *p, *str;
+ int val;
+
+ /* ignore errors, fallback to hex */
+ (void)find_bpffs_btf_enums(&info);
+
+ switch (opt) {
+ case OPT_DELEGATE_CMDS:
+ delegate_msk = &opts->delegate_cmds;
+ enum_t = info.cmd_t;
+ enum_pfx = "BPF_";
+ break;
+ case OPT_DELEGATE_MAPS:
+ delegate_msk = &opts->delegate_maps;
+ enum_t = info.map_t;
+ enum_pfx = "BPF_MAP_TYPE_";
+ break;
+ case OPT_DELEGATE_PROGS:
+ delegate_msk = &opts->delegate_progs;
+ enum_t = info.prog_t;
+ enum_pfx = "BPF_PROG_TYPE_";
+ break;
+ case OPT_DELEGATE_ATTACHS:
+ delegate_msk = &opts->delegate_attachs;
+ enum_t = info.attach_t;
+ enum_pfx = "BPF_";
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ str = param->string;
+ while ((p = strsep(&str, ":"))) {
+ if (strcmp(p, "any") == 0) {
+ msk |= ~0ULL;
+ } else if (find_btf_enum_const(info.btf, enum_t, enum_pfx, p, &val)) {
+ msk |= 1ULL << val;
+ } else {
+ err = kstrtou64(p, 0, &msk);
+ if (err)
+ return err;
+ }
+ }
+
+ /* Setting delegation mount options requires privileges */
+ if (msk && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ *delegate_msk |= msk;
+ break;
+ }
+ default:
+ /* ignore unknown mount options */
+ break;
}
return 0;
@@ -784,10 +1010,14 @@ out:
static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
{
static const struct tree_descr bpf_rfiles[] = { { "" } };
- struct bpf_mount_opts *opts = fc->fs_private;
+ struct bpf_mount_opts *opts = sb->s_fs_info;
struct inode *inode;
int ret;
+ /* Mounting an instance of BPF FS requires privileges */
+ if (fc->user_ns != &init_user_ns && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
if (ret)
return ret;
@@ -811,7 +1041,7 @@ static int bpf_get_tree(struct fs_context *fc)
static void bpf_free_fc(struct fs_context *fc)
{
- kfree(fc->fs_private);
+ kfree(fc->s_fs_info);
}
static const struct fs_context_operations bpf_context_ops = {
@@ -835,17 +1065,32 @@ static int bpf_init_fs_context(struct fs_context *fc)
opts->uid = current_fsuid();
opts->gid = current_fsgid();
- fc->fs_private = opts;
+ /* start out with no BPF token delegation enabled */
+ opts->delegate_cmds = 0;
+ opts->delegate_maps = 0;
+ opts->delegate_progs = 0;
+ opts->delegate_attachs = 0;
+
+ fc->s_fs_info = opts;
fc->ops = &bpf_context_ops;
return 0;
}
+static void bpf_kill_super(struct super_block *sb)
+{
+ struct bpf_mount_opts *opts = sb->s_fs_info;
+
+ kill_litter_super(sb);
+ kfree(opts);
+}
+
static struct file_system_type bpf_fs_type = {
.owner = THIS_MODULE,
.name = "bpf",
.init_fs_context = bpf_init_fs_context,
.parameters = bpf_fs_parameters,
- .kill_sb = kill_litter_super,
+ .kill_sb = bpf_kill_super,
+ .fs_flags = FS_USERNS_MOUNT,
};
static int __init bpf_init(void)
diff --git a/kernel/bpf/kmem_cache_iter.c b/kernel/bpf/kmem_cache_iter.c
new file mode 100644
index 000000000000..3ae2158d767f
--- /dev/null
+++ b/kernel/bpf/kmem_cache_iter.c
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Google */
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+
+#include "../../mm/slab.h" /* kmem_cache, slab_caches and slab_mutex */
+
+/* open-coded version */
+struct bpf_iter_kmem_cache {
+ __u64 __opaque[1];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_kmem_cache_kern {
+ struct kmem_cache *pos;
+} __attribute__((aligned(8)));
+
+#define KMEM_CACHE_POS_START ((void *)1L)
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_kmem_cache_new(struct bpf_iter_kmem_cache *it)
+{
+ struct bpf_iter_kmem_cache_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(*kit) > sizeof(*it));
+ BUILD_BUG_ON(__alignof__(*kit) != __alignof__(*it));
+
+ kit->pos = KMEM_CACHE_POS_START;
+ return 0;
+}
+
+__bpf_kfunc struct kmem_cache *bpf_iter_kmem_cache_next(struct bpf_iter_kmem_cache *it)
+{
+ struct bpf_iter_kmem_cache_kern *kit = (void *)it;
+ struct kmem_cache *prev = kit->pos;
+ struct kmem_cache *next;
+ bool destroy = false;
+
+ if (!prev)
+ return NULL;
+
+ mutex_lock(&slab_mutex);
+
+ if (list_empty(&slab_caches)) {
+ mutex_unlock(&slab_mutex);
+ return NULL;
+ }
+
+ if (prev == KMEM_CACHE_POS_START)
+ next = list_first_entry(&slab_caches, struct kmem_cache, list);
+ else if (list_last_entry(&slab_caches, struct kmem_cache, list) == prev)
+ next = NULL;
+ else
+ next = list_next_entry(prev, list);
+
+ /* boot_caches have negative refcount, don't touch them */
+ if (next && next->refcount > 0)
+ next->refcount++;
+
+ /* Skip kmem_cache_destroy() for active entries */
+ if (prev && prev != KMEM_CACHE_POS_START) {
+ if (prev->refcount > 1)
+ prev->refcount--;
+ else if (prev->refcount == 1)
+ destroy = true;
+ }
+
+ mutex_unlock(&slab_mutex);
+
+ if (destroy)
+ kmem_cache_destroy(prev);
+
+ kit->pos = next;
+ return next;
+}
+
+__bpf_kfunc void bpf_iter_kmem_cache_destroy(struct bpf_iter_kmem_cache *it)
+{
+ struct bpf_iter_kmem_cache_kern *kit = (void *)it;
+ struct kmem_cache *s = kit->pos;
+ bool destroy = false;
+
+ if (s == NULL || s == KMEM_CACHE_POS_START)
+ return;
+
+ mutex_lock(&slab_mutex);
+
+ /* Skip kmem_cache_destroy() for active entries */
+ if (s->refcount > 1)
+ s->refcount--;
+ else if (s->refcount == 1)
+ destroy = true;
+
+ mutex_unlock(&slab_mutex);
+
+ if (destroy)
+ kmem_cache_destroy(s);
+}
+
+__bpf_kfunc_end_defs();
+
+struct bpf_iter__kmem_cache {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct kmem_cache *, s);
+};
+
+union kmem_cache_iter_priv {
+ struct bpf_iter_kmem_cache it;
+ struct bpf_iter_kmem_cache_kern kit;
+};
+
+static void *kmem_cache_iter_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ loff_t cnt = 0;
+ bool found = false;
+ struct kmem_cache *s;
+ union kmem_cache_iter_priv *p = seq->private;
+
+ mutex_lock(&slab_mutex);
+
+ /* Find an entry at the given position in the slab_caches list instead
+ * of keeping a reference (of the last visited entry, if any) out of
+ * slab_mutex. It might miss something if one is deleted in the middle
+ * while it releases the lock. But it should be rare and there's not
+ * much we can do about it.
+ */
+ list_for_each_entry(s, &slab_caches, list) {
+ if (cnt == *pos) {
+ /* Make sure this entry remains in the list by getting
+ * a new reference count. Note that boot_cache entries
+ * have a negative refcount, so don't touch them.
+ */
+ if (s->refcount > 0)
+ s->refcount++;
+ found = true;
+ break;
+ }
+ cnt++;
+ }
+ mutex_unlock(&slab_mutex);
+
+ if (!found)
+ s = NULL;
+
+ p->kit.pos = s;
+ return s;
+}
+
+static void kmem_cache_iter_seq_stop(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_meta meta;
+ struct bpf_iter__kmem_cache ctx = {
+ .meta = &meta,
+ .s = v,
+ };
+ union kmem_cache_iter_priv *p = seq->private;
+ struct bpf_prog *prog;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, true);
+ if (prog && !ctx.s)
+ bpf_iter_run_prog(prog, &ctx);
+
+ bpf_iter_kmem_cache_destroy(&p->it);
+}
+
+static void *kmem_cache_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ union kmem_cache_iter_priv *p = seq->private;
+
+ ++*pos;
+
+ return bpf_iter_kmem_cache_next(&p->it);
+}
+
+static int kmem_cache_iter_seq_show(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_meta meta;
+ struct bpf_iter__kmem_cache ctx = {
+ .meta = &meta,
+ .s = v,
+ };
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, false);
+ if (prog)
+ ret = bpf_iter_run_prog(prog, &ctx);
+
+ return ret;
+}
+
+static const struct seq_operations kmem_cache_iter_seq_ops = {
+ .start = kmem_cache_iter_seq_start,
+ .next = kmem_cache_iter_seq_next,
+ .stop = kmem_cache_iter_seq_stop,
+ .show = kmem_cache_iter_seq_show,
+};
+
+BTF_ID_LIST_GLOBAL_SINGLE(bpf_kmem_cache_btf_id, struct, kmem_cache)
+
+static const struct bpf_iter_seq_info kmem_cache_iter_seq_info = {
+ .seq_ops = &kmem_cache_iter_seq_ops,
+ .seq_priv_size = sizeof(union kmem_cache_iter_priv),
+};
+
+static void bpf_iter_kmem_cache_show_fdinfo(const struct bpf_iter_aux_info *aux,
+ struct seq_file *seq)
+{
+ seq_puts(seq, "kmem_cache iter\n");
+}
+
+DEFINE_BPF_ITER_FUNC(kmem_cache, struct bpf_iter_meta *meta,
+ struct kmem_cache *s)
+
+static struct bpf_iter_reg bpf_kmem_cache_reg_info = {
+ .target = "kmem_cache",
+ .feature = BPF_ITER_RESCHED,
+ .show_fdinfo = bpf_iter_kmem_cache_show_fdinfo,
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__kmem_cache, s),
+ PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
+ },
+ .seq_info = &kmem_cache_iter_seq_info,
+};
+
+static int __init bpf_kmem_cache_iter_init(void)
+{
+ bpf_kmem_cache_reg_info.ctx_arg_info[0].btf_id = bpf_kmem_cache_btf_id[0];
+ return bpf_iter_reg_target(&bpf_kmem_cache_reg_info);
+}
+
+late_initcall(bpf_kmem_cache_iter_init);
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index a04f505aefe9..3969eb0382af 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -431,7 +431,7 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key,
seq_puts(m, ": ");
btf_type_seq_show(map->btf, map->btf_value_type_id,
&READ_ONCE(storage->buf)->data[0], m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
} else {
seq_puts(m, ": {\n");
for_each_possible_cpu(cpu) {
@@ -439,7 +439,7 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key,
btf_type_seq_show(map->btf, map->btf_value_type_id,
per_cpu_ptr(storage->percpu_buf, cpu),
m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
}
seq_puts(m, "}\n");
}
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 594a234f122b..38050f4ee400 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -9,6 +9,7 @@
#include <linux/bpf.h>
#include <linux/bpf_verifier.h>
#include <linux/math64.h>
+#include <linux/string.h>
#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)
@@ -90,7 +91,7 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
goto fail;
} else {
u64 new_end, new_start;
- u32 buf_start, buf_end, new_n;
+ u32 buf_start, buf_end;
new_end = log->end_pos + n;
if (new_end - log->start_pos >= log->len_total)
@@ -333,7 +334,8 @@ find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
{
const struct bpf_line_info *linfo;
const struct bpf_prog *prog;
- u32 i, nr_linfo;
+ u32 nr_linfo;
+ int l, r, m;
prog = env->prog;
nr_linfo = prog->aux->nr_linfo;
@@ -342,11 +344,30 @@ find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
return NULL;
linfo = prog->aux->linfo;
- for (i = 1; i < nr_linfo; i++)
- if (insn_off < linfo[i].insn_off)
- break;
+ /* Loop invariant: linfo[l].insn_off <= insns_off.
+ * linfo[0].insn_off == 0 which always satisfies above condition.
+ * Binary search is searching for rightmost linfo entry that satisfies
+ * the above invariant, giving us the desired record that covers given
+ * instruction offset.
+ */
+ l = 0;
+ r = nr_linfo - 1;
+ while (l < r) {
+ /* (r - l + 1) / 2 means we break a tie to the right, so if:
+ * l=1, r=2, linfo[l].insn_off <= insn_off, linfo[r].insn_off > insn_off,
+ * then m=2, we see that linfo[m].insn_off > insn_off, and so
+ * r becomes 1 and we exit the loop with correct l==1.
+ * If the tie was broken to the left, m=1 would end us up in
+ * an endless loop where l and m stay at 1 and r stays at 2.
+ */
+ m = l + (r - l + 1) / 2;
+ if (linfo[m].insn_off <= insn_off)
+ l = m;
+ else
+ r = m - 1;
+ }
- return &linfo[i - 1];
+ return &linfo[l];
}
static const char *ltrim(const char *s)
@@ -361,13 +382,28 @@ __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env,
u32 insn_off,
const char *prefix_fmt, ...)
{
- const struct bpf_line_info *linfo;
+ const struct bpf_line_info *linfo, *prev_linfo;
+ const struct btf *btf;
+ const char *s, *fname;
if (!bpf_verifier_log_needed(&env->log))
return;
+ prev_linfo = env->prev_linfo;
linfo = find_linfo(env, insn_off);
- if (!linfo || linfo == env->prev_linfo)
+ if (!linfo || linfo == prev_linfo)
+ return;
+
+ /* It often happens that two separate linfo records point to the same
+ * source code line, but have differing column numbers. Given verifier
+ * log doesn't emit column information, from user perspective we just
+ * end up emitting the same source code line twice unnecessarily.
+ * So instead check that previous and current linfo record point to
+ * the same file (file_name_offs match) and the same line number, and
+ * avoid emitting duplicated source code line in such case.
+ */
+ if (prev_linfo && linfo->file_name_off == prev_linfo->file_name_off &&
+ BPF_LINE_INFO_LINE_NUM(linfo->line_col) == BPF_LINE_INFO_LINE_NUM(prev_linfo->line_col))
return;
if (prefix_fmt) {
@@ -378,9 +414,15 @@ __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env,
va_end(args);
}
- verbose(env, "%s\n",
- ltrim(btf_name_by_offset(env->prog->aux->btf,
- linfo->line_off)));
+ btf = env->prog->aux->btf;
+ s = ltrim(btf_name_by_offset(btf, linfo->line_off));
+ verbose(env, "%s", s); /* source code line */
+
+ s = btf_name_by_offset(btf, linfo->file_name_off);
+ /* leave only file name */
+ fname = strrchr(s, '/');
+ fname = fname ? fname + 1 : s;
+ verbose(env, " @ %s:%u\n", fname, BPF_LINE_INFO_LINE_NUM(linfo->line_col));
env->prev_linfo = linfo;
}
@@ -416,6 +458,7 @@ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type)
[PTR_TO_XDP_SOCK] = "xdp_sock",
[PTR_TO_BTF_ID] = "ptr_",
[PTR_TO_MEM] = "mem",
+ [PTR_TO_ARENA] = "arena",
[PTR_TO_BUF] = "buf",
[PTR_TO_FUNC] = "func",
[PTR_TO_MAP_KEY] = "map_key",
@@ -424,9 +467,9 @@ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type)
if (type & PTR_MAYBE_NULL) {
if (base_type(type) == PTR_TO_BTF_ID)
- strncpy(postfix, "or_null_", 16);
+ strscpy(postfix, "or_null_");
else
- strncpy(postfix, "_or_null", 16);
+ strscpy(postfix, "_or_null");
}
snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s",
@@ -494,6 +537,7 @@ static char slot_type_char[] = {
[STACK_ZERO] = '0',
[STACK_DYNPTR] = 'd',
[STACK_ITER] = 'i',
+ [STACK_IRQ_FLAG] = 'f'
};
static void print_liveness(struct bpf_verifier_env *env,
@@ -645,12 +689,13 @@ static void print_reg_state(struct bpf_verifier_env *env,
if (t == SCALAR_VALUE && reg->precise)
verbose(env, "P");
if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) {
- /* reg->off should be 0 for SCALAR_VALUE */
- verbose_snum(env, reg->var_off.value + reg->off);
+ verbose_snum(env, reg->var_off.value);
return;
}
verbose(env, "%s", reg_type_str(env, t));
+ if (t == PTR_TO_ARENA)
+ return;
if (t == PTR_TO_STACK) {
if (state->frameno != reg->frameno)
verbose(env, "[%d]", reg->frameno);
@@ -663,7 +708,9 @@ static void print_reg_state(struct bpf_verifier_env *env,
verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id));
verbose(env, "(");
if (reg->id)
- verbose_a("id=%d", reg->id);
+ verbose_a("id=%d", reg->id & ~BPF_ADD_CONST);
+ if (reg->id & BPF_ADD_CONST)
+ verbose(env, "%+d", reg->off);
if (reg->ref_obj_id)
verbose_a("ref_obj_id=%d", reg->ref_obj_id);
if (type_is_non_owning_ref(reg->type))
@@ -707,9 +754,10 @@ static void print_reg_state(struct bpf_verifier_env *env,
verbose(env, ")");
}
-void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_state *state,
- bool print_all)
+void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate,
+ u32 frameno, bool print_all)
{
+ const struct bpf_func_state *state = vstate->frame[frameno];
const struct bpf_reg_state *reg;
int i;
@@ -797,11 +845,11 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_st
break;
}
}
- if (state->acquired_refs && state->refs[0].id) {
- verbose(env, " refs=%d", state->refs[0].id);
- for (i = 1; i < state->acquired_refs; i++)
- if (state->refs[i].id)
- verbose(env, ",%d", state->refs[i].id);
+ if (vstate->acquired_refs && vstate->refs[0].id) {
+ verbose(env, " refs=%d", vstate->refs[0].id);
+ for (i = 1; i < vstate->acquired_refs; i++)
+ if (vstate->refs[i].id)
+ verbose(env, ",%d", vstate->refs[i].id);
}
if (state->in_callback_fn)
verbose(env, " cb");
@@ -818,7 +866,8 @@ static inline u32 vlog_alignment(u32 pos)
BPF_LOG_MIN_ALIGNMENT) - pos - 1;
}
-void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state)
+void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate,
+ u32 frameno)
{
if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) {
/* remove new line character */
@@ -827,5 +876,5 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state
} else {
verbose(env, "%d:", env->insn_idx);
}
- print_verifier_state(env, state, false);
+ print_verifier_state(env, vstate, frameno, false);
}
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index b32be680da6c..e8a772e64324 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -15,6 +15,7 @@
#include <net/ipv6.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
+#include <linux/bpf_mem_alloc.h>
/* Intermediate node */
#define LPM_TREE_NODE_FLAG_IM BIT(0)
@@ -22,7 +23,6 @@
struct lpm_trie_node;
struct lpm_trie_node {
- struct rcu_head rcu;
struct lpm_trie_node __rcu *child[2];
u32 prefixlen;
u32 flags;
@@ -32,10 +32,11 @@ struct lpm_trie_node {
struct lpm_trie {
struct bpf_map map;
struct lpm_trie_node __rcu *root;
+ struct bpf_mem_alloc ma;
size_t n_entries;
size_t max_prefixlen;
size_t data_size;
- spinlock_t lock;
+ raw_spinlock_t lock;
};
/* This trie implements a longest prefix match algorithm that can be used to
@@ -155,22 +156,23 @@ static inline int extract_bit(const u8 *data, size_t index)
}
/**
- * longest_prefix_match() - determine the longest prefix
+ * __longest_prefix_match() - determine the longest prefix
* @trie: The trie to get internal sizes from
* @node: The node to operate on
* @key: The key to compare to @node
*
* Determine the longest prefix of @node that matches the bits in @key.
*/
-static size_t longest_prefix_match(const struct lpm_trie *trie,
- const struct lpm_trie_node *node,
- const struct bpf_lpm_trie_key *key)
+static __always_inline
+size_t __longest_prefix_match(const struct lpm_trie *trie,
+ const struct lpm_trie_node *node,
+ const struct bpf_lpm_trie_key_u8 *key)
{
u32 limit = min(node->prefixlen, key->prefixlen);
u32 prefixlen = 0, i = 0;
BUILD_BUG_ON(offsetof(struct lpm_trie_node, data) % sizeof(u32));
- BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key, data) % sizeof(u32));
+ BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key_u8, data) % sizeof(u32));
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(CONFIG_64BIT)
@@ -224,12 +226,19 @@ static size_t longest_prefix_match(const struct lpm_trie *trie,
return prefixlen;
}
+static size_t longest_prefix_match(const struct lpm_trie *trie,
+ const struct lpm_trie_node *node,
+ const struct bpf_lpm_trie_key_u8 *key)
+{
+ return __longest_prefix_match(trie, node, key);
+}
+
/* Called from syscall or from eBPF program */
static void *trie_lookup_elem(struct bpf_map *map, void *_key)
{
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
struct lpm_trie_node *node, *found = NULL;
- struct bpf_lpm_trie_key *key = _key;
+ struct bpf_lpm_trie_key_u8 *key = _key;
if (key->prefixlen > trie->max_prefixlen)
return NULL;
@@ -245,7 +254,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
* If it's the maximum possible prefix for this trie, we have
* an exact match and can return it directly.
*/
- matchlen = longest_prefix_match(trie, node, key);
+ matchlen = __longest_prefix_match(trie, node, key);
if (matchlen == trie->max_prefixlen) {
found = node;
break;
@@ -279,17 +288,13 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
return found->data + trie->data_size;
}
-static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
+static struct lpm_trie_node *lpm_trie_node_alloc(struct lpm_trie *trie,
const void *value)
{
struct lpm_trie_node *node;
- size_t size = sizeof(struct lpm_trie_node) + trie->data_size;
- if (value)
- size += trie->map.value_size;
+ node = bpf_mem_cache_alloc(&trie->ma);
- node = bpf_map_kmalloc_node(&trie->map, size, GFP_NOWAIT | __GFP_NOWARN,
- trie->map.numa_node);
if (!node)
return NULL;
@@ -302,14 +307,25 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
return node;
}
+static int trie_check_add_elem(struct lpm_trie *trie, u64 flags)
+{
+ if (flags == BPF_EXIST)
+ return -ENOENT;
+ if (trie->n_entries == trie->map.max_entries)
+ return -ENOSPC;
+ trie->n_entries++;
+ return 0;
+}
+
/* Called from syscall or from eBPF program */
static long trie_update_elem(struct bpf_map *map,
void *_key, void *value, u64 flags)
{
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
- struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
+ struct lpm_trie_node *node, *im_node, *new_node;
+ struct lpm_trie_node *free_node = NULL;
struct lpm_trie_node __rcu **slot;
- struct bpf_lpm_trie_key *key = _key;
+ struct bpf_lpm_trie_key_u8 *key = _key;
unsigned long irq_flags;
unsigned int next_bit;
size_t matchlen = 0;
@@ -321,22 +337,12 @@ static long trie_update_elem(struct bpf_map *map,
if (key->prefixlen > trie->max_prefixlen)
return -EINVAL;
- spin_lock_irqsave(&trie->lock, irq_flags);
-
/* Allocate and fill a new node */
-
- if (trie->n_entries == trie->map.max_entries) {
- ret = -ENOSPC;
- goto out;
- }
-
new_node = lpm_trie_node_alloc(trie, value);
- if (!new_node) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!new_node)
+ return -ENOMEM;
- trie->n_entries++;
+ raw_spin_lock_irqsave(&trie->lock, irq_flags);
new_node->prefixlen = key->prefixlen;
RCU_INIT_POINTER(new_node->child[0], NULL);
@@ -355,8 +361,7 @@ static long trie_update_elem(struct bpf_map *map,
matchlen = longest_prefix_match(trie, node, key);
if (node->prefixlen != matchlen ||
- node->prefixlen == key->prefixlen ||
- node->prefixlen == trie->max_prefixlen)
+ node->prefixlen == key->prefixlen)
break;
next_bit = extract_bit(key->data, node->prefixlen);
@@ -367,6 +372,10 @@ static long trie_update_elem(struct bpf_map *map,
* simply assign the @new_node to that slot and be done.
*/
if (!node) {
+ ret = trie_check_add_elem(trie, flags);
+ if (ret)
+ goto out;
+
rcu_assign_pointer(*slot, new_node);
goto out;
}
@@ -375,18 +384,30 @@ static long trie_update_elem(struct bpf_map *map,
* which already has the correct data array set.
*/
if (node->prefixlen == matchlen) {
+ if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) {
+ if (flags == BPF_NOEXIST) {
+ ret = -EEXIST;
+ goto out;
+ }
+ } else {
+ ret = trie_check_add_elem(trie, flags);
+ if (ret)
+ goto out;
+ }
+
new_node->child[0] = node->child[0];
new_node->child[1] = node->child[1];
- if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
- trie->n_entries--;
-
rcu_assign_pointer(*slot, new_node);
- kfree_rcu(node, rcu);
+ free_node = node;
goto out;
}
+ ret = trie_check_add_elem(trie, flags);
+ if (ret)
+ goto out;
+
/* If the new node matches the prefix completely, it must be inserted
* as an ancestor. Simply insert it between @node and *@slot.
*/
@@ -399,6 +420,7 @@ static long trie_update_elem(struct bpf_map *map,
im_node = lpm_trie_node_alloc(trie, NULL);
if (!im_node) {
+ trie->n_entries--;
ret = -ENOMEM;
goto out;
}
@@ -420,15 +442,11 @@ static long trie_update_elem(struct bpf_map *map,
rcu_assign_pointer(*slot, im_node);
out:
- if (ret) {
- if (new_node)
- trie->n_entries--;
-
- kfree(new_node);
- kfree(im_node);
- }
+ raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
- spin_unlock_irqrestore(&trie->lock, irq_flags);
+ if (ret)
+ bpf_mem_cache_free(&trie->ma, new_node);
+ bpf_mem_cache_free_rcu(&trie->ma, free_node);
return ret;
}
@@ -437,7 +455,8 @@ out:
static long trie_delete_elem(struct bpf_map *map, void *_key)
{
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
- struct bpf_lpm_trie_key *key = _key;
+ struct lpm_trie_node *free_node = NULL, *free_parent = NULL;
+ struct bpf_lpm_trie_key_u8 *key = _key;
struct lpm_trie_node __rcu **trim, **trim2;
struct lpm_trie_node *node, *parent;
unsigned long irq_flags;
@@ -448,7 +467,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
if (key->prefixlen > trie->max_prefixlen)
return -EINVAL;
- spin_lock_irqsave(&trie->lock, irq_flags);
+ raw_spin_lock_irqsave(&trie->lock, irq_flags);
/* Walk the tree looking for an exact key/length match and keeping
* track of the path we traverse. We will need to know the node
@@ -506,8 +525,8 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
else
rcu_assign_pointer(
*trim2, rcu_access_pointer(parent->child[0]));
- kfree_rcu(parent, rcu);
- kfree_rcu(node, rcu);
+ free_parent = parent;
+ free_node = node;
goto out;
}
@@ -521,10 +540,13 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1]));
else
RCU_INIT_POINTER(*trim, NULL);
- kfree_rcu(node, rcu);
+ free_node = node;
out:
- spin_unlock_irqrestore(&trie->lock, irq_flags);
+ raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
+
+ bpf_mem_cache_free_rcu(&trie->ma, free_parent);
+ bpf_mem_cache_free_rcu(&trie->ma, free_node);
return ret;
}
@@ -536,7 +558,7 @@ out:
sizeof(struct lpm_trie_node))
#define LPM_VAL_SIZE_MIN 1
-#define LPM_KEY_SIZE(X) (sizeof(struct bpf_lpm_trie_key) + (X))
+#define LPM_KEY_SIZE(X) (sizeof(struct bpf_lpm_trie_key_u8) + (X))
#define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
#define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
@@ -546,6 +568,8 @@ out:
static struct bpf_map *trie_alloc(union bpf_attr *attr)
{
struct lpm_trie *trie;
+ size_t leaf_size;
+ int err;
/* check sanity of attributes */
if (attr->max_entries == 0 ||
@@ -565,12 +589,22 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
/* copy mandatory map attributes */
bpf_map_init_from_attr(&trie->map, attr);
trie->data_size = attr->key_size -
- offsetof(struct bpf_lpm_trie_key, data);
+ offsetof(struct bpf_lpm_trie_key_u8, data);
trie->max_prefixlen = trie->data_size * 8;
- spin_lock_init(&trie->lock);
+ raw_spin_lock_init(&trie->lock);
+ /* Allocate intermediate and leaf nodes from the same allocator */
+ leaf_size = sizeof(struct lpm_trie_node) + trie->data_size +
+ trie->map.value_size;
+ err = bpf_mem_alloc_init(&trie->ma, leaf_size, false);
+ if (err)
+ goto free_out;
return &trie->map;
+
+free_out:
+ bpf_map_area_free(trie);
+ return ERR_PTR(err);
}
static void trie_free(struct bpf_map *map)
@@ -602,13 +636,17 @@ static void trie_free(struct bpf_map *map)
continue;
}
- kfree(node);
+ /* No bpf program may access the map, so freeing the
+ * node without waiting for the extra RCU GP.
+ */
+ bpf_mem_cache_raw_free(node);
RCU_INIT_POINTER(*slot, NULL);
break;
}
}
out:
+ bpf_mem_alloc_destroy(&trie->ma);
bpf_map_area_free(trie);
}
@@ -616,11 +654,11 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
{
struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root;
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
- struct bpf_lpm_trie_key *key = _key, *next_key = _next_key;
+ struct bpf_lpm_trie_key_u8 *key = _key, *next_key = _next_key;
struct lpm_trie_node **node_stack = NULL;
int err = 0, stack_ptr = -1;
unsigned int next_bit;
- size_t matchlen;
+ size_t matchlen = 0;
/* The get_next_key follows postorder. For the 4 node example in
* the top of this file, the trie_get_next_key() returns the following
@@ -642,7 +680,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
if (!key || key->prefixlen > trie->max_prefixlen)
goto find_leftmost;
- node_stack = kmalloc_array(trie->max_prefixlen,
+ node_stack = kmalloc_array(trie->max_prefixlen + 1,
sizeof(struct lpm_trie_node *),
GFP_ATOMIC | __GFP_NOWARN);
if (!node_stack)
@@ -659,7 +697,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
next_bit = extract_bit(key->data, node->prefixlen);
node = rcu_dereference(node->child[next_bit]);
}
- if (!node || node->prefixlen != key->prefixlen ||
+ if (!node || node->prefixlen != matchlen ||
(node->flags & LPM_TREE_NODE_FLAG_IM))
goto find_leftmost;
@@ -703,7 +741,7 @@ find_leftmost:
}
do_copy:
next_key->prefixlen = next_node->prefixlen;
- memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data),
+ memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key_u8, data),
next_node->data, trie->data_size);
free_stack:
kfree(node_stack);
@@ -715,7 +753,7 @@ static int trie_check_btf(const struct bpf_map *map,
const struct btf_type *key_type,
const struct btf_type *value_type)
{
- /* Keys must have struct bpf_lpm_trie_key embedded. */
+ /* Keys must have struct bpf_lpm_trie_key_u8 embedded. */
return BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ?
-EINVAL : 0;
}
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 8ef269e66ba5..645bd30bc9a9 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -11,35 +11,27 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
{
struct bpf_map *inner_map, *inner_map_meta;
u32 inner_map_meta_size;
- struct fd f;
- int ret;
+ CLASS(fd, f)(inner_map_ufd);
- f = fdget(inner_map_ufd);
inner_map = __bpf_map_get(f);
if (IS_ERR(inner_map))
return inner_map;
/* Does not support >1 level map-in-map */
- if (inner_map->inner_map_meta) {
- ret = -EINVAL;
- goto put;
- }
+ if (inner_map->inner_map_meta)
+ return ERR_PTR(-EINVAL);
- if (!inner_map->ops->map_meta_equal) {
- ret = -ENOTSUPP;
- goto put;
- }
+ if (!inner_map->ops->map_meta_equal)
+ return ERR_PTR(-ENOTSUPP);
inner_map_meta_size = sizeof(*inner_map_meta);
/* In some cases verifier needs to access beyond just base map. */
- if (inner_map->ops == &array_map_ops)
+ if (inner_map->ops == &array_map_ops || inner_map->ops == &percpu_array_map_ops)
inner_map_meta_size = sizeof(struct bpf_array);
inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER);
- if (!inner_map_meta) {
- ret = -ENOMEM;
- goto put;
- }
+ if (!inner_map_meta)
+ return ERR_PTR(-ENOMEM);
inner_map_meta->map_type = inner_map->map_type;
inner_map_meta->key_size = inner_map->key_size;
@@ -53,8 +45,9 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
* invalid/empty/valid, but ERR_PTR in case of errors. During
* equality NULL or IS_ERR is equivalent.
*/
- ret = PTR_ERR(inner_map_meta->record);
- goto free;
+ struct bpf_map *ret = ERR_CAST(inner_map_meta->record);
+ kfree(inner_map_meta);
+ return ret;
}
/* Note: We must use the same BTF, as we also used btf_record_dup above
* which relies on BTF being same for both maps, as some members like
@@ -68,7 +61,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
/* Misc members not needed in bpf_map_meta_equal() check. */
inner_map_meta->ops = inner_map->ops;
- if (inner_map->ops == &array_map_ops) {
+ if (inner_map->ops == &array_map_ops || inner_map->ops == &percpu_array_map_ops) {
struct bpf_array *inner_array_meta =
container_of(inner_map_meta, struct bpf_array, map);
struct bpf_array *inner_array = container_of(inner_map, struct bpf_array, map);
@@ -77,14 +70,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
inner_array_meta->elem_size = inner_array->elem_size;
inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1;
}
-
- fdput(f);
return inner_map_meta;
-free:
- kfree(inner_map_meta);
-put:
- fdput(f);
- return ERR_PTR(ret);
}
void bpf_map_meta_free(struct bpf_map *map_meta)
@@ -110,9 +96,8 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map,
int ufd)
{
struct bpf_map *inner_map, *inner_map_meta;
- struct fd f;
+ CLASS(fd, f)(ufd);
- f = fdget(ufd);
inner_map = __bpf_map_get(f);
if (IS_ERR(inner_map))
return inner_map;
@@ -123,7 +108,6 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map,
else
inner_map = ERR_PTR(-EINVAL);
- fdput(f);
return inner_map;
}
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index 6abd7c5df4b3..9575314f40a6 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -213,9 +213,9 @@ __bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map)
__bpf_kfunc_end_defs();
-BTF_SET8_START(bpf_map_iter_kfunc_ids)
+BTF_KFUNCS_START(bpf_map_iter_kfunc_ids)
BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS)
-BTF_SET8_END(bpf_map_iter_kfunc_ids)
+BTF_KFUNCS_END(bpf_map_iter_kfunc_ids)
static const struct btf_kfunc_id_set bpf_map_iter_kfunc_set = {
.owner = THIS_MODULE,
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 550f02e2cb13..889374722d0a 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -35,6 +35,8 @@
*/
#define LLIST_NODE_SZ sizeof(struct llist_node)
+#define BPF_MEM_ALLOC_SIZE_MAX 4096
+
/* similar to kmalloc, but sizeof == 8 bucket is gone */
static u8 size_index[24] __ro_after_init = {
3, /* 8 */
@@ -65,7 +67,7 @@ static u8 size_index[24] __ro_after_init = {
static int bpf_mem_cache_idx(size_t size)
{
- if (!size || size > 4096)
+ if (!size || size > BPF_MEM_ALLOC_SIZE_MAX)
return -1;
if (size <= 192)
@@ -138,8 +140,8 @@ static struct llist_node notrace *__llist_del_first(struct llist_head *head)
static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
{
if (c->percpu_size) {
- void **obj = kmalloc_node(c->percpu_size, flags, node);
- void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);
+ void __percpu **obj = kmalloc_node(c->percpu_size, flags, node);
+ void __percpu *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);
if (!obj || !pptr) {
free_percpu(pptr);
@@ -155,12 +157,9 @@ static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
{
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
if (c->objcg)
return get_mem_cgroup_from_objcg(c->objcg);
-#endif
-
-#ifdef CONFIG_MEMCG
return root_mem_cgroup;
#else
return NULL;
@@ -255,11 +254,8 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
static void free_one(void *obj, bool percpu)
{
- if (percpu) {
- free_percpu(((void **)obj)[1]);
- kfree(obj);
- return;
- }
+ if (percpu)
+ free_percpu(((void __percpu **)obj)[1]);
kfree(obj);
}
@@ -512,8 +508,8 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
*/
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
{
- struct bpf_mem_caches *cc, __percpu *pcc;
- struct bpf_mem_cache *c, __percpu *pc;
+ struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc;
+ struct bpf_mem_cache *c; struct bpf_mem_cache __percpu *pc;
struct obj_cgroup *objcg = NULL;
int cpu, i, unit_size, percpu_size = 0;
@@ -534,7 +530,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
size += LLIST_NODE_SZ; /* room for llist_node */
unit_size = size;
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
if (memcg_bpf_enabled())
objcg = get_obj_cgroup_from_current();
#endif
@@ -556,7 +552,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
if (!pcc)
return -ENOMEM;
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
objcg = get_obj_cgroup_from_current();
#endif
ma->objcg = objcg;
@@ -594,7 +590,7 @@ int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg
int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size)
{
- struct bpf_mem_caches *cc, __percpu *pcc;
+ struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc;
int cpu, i, unit_size, percpu_size;
struct obj_cgroup *objcg;
struct bpf_mem_cache *c;
@@ -759,8 +755,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
}
- if (ma->objcg)
- obj_cgroup_put(ma->objcg);
+ obj_cgroup_put(ma->objcg);
destroy_mem_alloc(ma, rcu_in_progress);
}
if (ma->caches) {
@@ -776,8 +771,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
}
}
- if (ma->objcg)
- obj_cgroup_put(ma->objcg);
+ obj_cgroup_put(ma->objcg);
destroy_mem_alloc(ma, rcu_in_progress);
}
}
@@ -1010,3 +1004,13 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
return !ret ? NULL : ret + LLIST_NODE_SZ;
}
+
+int bpf_mem_alloc_check_size(bool percpu, size_t size)
+{
+ /* The size of percpu allocation doesn't have LLIST_NODE_SZ overhead */
+ if ((percpu && size > BPF_MEM_ALLOC_SIZE_MAX) ||
+ (!percpu && size > BPF_MEM_ALLOC_SIZE_MAX - LLIST_NODE_SZ))
+ return -E2BIG;
+
+ return 0;
+}
diff --git a/kernel/bpf/range_tree.c b/kernel/bpf/range_tree.c
new file mode 100644
index 000000000000..37b80a23ae1a
--- /dev/null
+++ b/kernel/bpf/range_tree.c
@@ -0,0 +1,270 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#include <linux/interval_tree_generic.h>
+#include <linux/slab.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/bpf.h>
+#include "range_tree.h"
+
+/*
+ * struct range_tree is a data structure used to allocate contiguous memory
+ * ranges in bpf arena. It's a large bitmap. The contiguous sequence of bits is
+ * represented by struct range_node or 'rn' for short.
+ * rn->rn_rbnode links it into an interval tree while
+ * rn->rb_range_size links it into a second rbtree sorted by size of the range.
+ * __find_range() performs binary search and best fit algorithm to find the
+ * range less or equal requested size.
+ * range_tree_clear/set() clears or sets a range of bits in this bitmap. The
+ * adjacent ranges are merged or split at the same time.
+ *
+ * The split/merge logic is based/borrowed from XFS's xbitmap32 added
+ * in commit 6772fcc8890a ("xfs: convert xbitmap to interval tree").
+ *
+ * The implementation relies on external lock to protect rbtree-s.
+ * The alloc/free of range_node-s is done via bpf_mem_alloc.
+ *
+ * bpf arena is using range_tree to represent unallocated slots.
+ * At init time:
+ * range_tree_set(rt, 0, max);
+ * Then:
+ * start = range_tree_find(rt, len);
+ * if (start >= 0)
+ * range_tree_clear(rt, start, len);
+ * to find free range and mark slots as allocated and later:
+ * range_tree_set(rt, start, len);
+ * to mark as unallocated after use.
+ */
+struct range_node {
+ struct rb_node rn_rbnode;
+ struct rb_node rb_range_size;
+ u32 rn_start;
+ u32 rn_last; /* inclusive */
+ u32 __rn_subtree_last;
+};
+
+static struct range_node *rb_to_range_node(struct rb_node *rb)
+{
+ return rb_entry(rb, struct range_node, rb_range_size);
+}
+
+static u32 rn_size(struct range_node *rn)
+{
+ return rn->rn_last - rn->rn_start + 1;
+}
+
+/* Find range that fits best to requested size */
+static inline struct range_node *__find_range(struct range_tree *rt, u32 len)
+{
+ struct rb_node *rb = rt->range_size_root.rb_root.rb_node;
+ struct range_node *best = NULL;
+
+ while (rb) {
+ struct range_node *rn = rb_to_range_node(rb);
+
+ if (len <= rn_size(rn)) {
+ best = rn;
+ rb = rb->rb_right;
+ } else {
+ rb = rb->rb_left;
+ }
+ }
+
+ return best;
+}
+
+s64 range_tree_find(struct range_tree *rt, u32 len)
+{
+ struct range_node *rn;
+
+ rn = __find_range(rt, len);
+ if (!rn)
+ return -ENOENT;
+ return rn->rn_start;
+}
+
+/* Insert the range into rbtree sorted by the range size */
+static inline void __range_size_insert(struct range_node *rn,
+ struct rb_root_cached *root)
+{
+ struct rb_node **link = &root->rb_root.rb_node, *rb = NULL;
+ u64 size = rn_size(rn);
+ bool leftmost = true;
+
+ while (*link) {
+ rb = *link;
+ if (size > rn_size(rb_to_range_node(rb))) {
+ link = &rb->rb_left;
+ } else {
+ link = &rb->rb_right;
+ leftmost = false;
+ }
+ }
+
+ rb_link_node(&rn->rb_range_size, rb, link);
+ rb_insert_color_cached(&rn->rb_range_size, root, leftmost);
+}
+
+#define START(node) ((node)->rn_start)
+#define LAST(node) ((node)->rn_last)
+
+INTERVAL_TREE_DEFINE(struct range_node, rn_rbnode, u32,
+ __rn_subtree_last, START, LAST,
+ static inline __maybe_unused,
+ __range_it)
+
+static inline __maybe_unused void
+range_it_insert(struct range_node *rn, struct range_tree *rt)
+{
+ __range_size_insert(rn, &rt->range_size_root);
+ __range_it_insert(rn, &rt->it_root);
+}
+
+static inline __maybe_unused void
+range_it_remove(struct range_node *rn, struct range_tree *rt)
+{
+ rb_erase_cached(&rn->rb_range_size, &rt->range_size_root);
+ RB_CLEAR_NODE(&rn->rb_range_size);
+ __range_it_remove(rn, &rt->it_root);
+}
+
+static inline __maybe_unused struct range_node *
+range_it_iter_first(struct range_tree *rt, u32 start, u32 last)
+{
+ return __range_it_iter_first(&rt->it_root, start, last);
+}
+
+/* Clear the range in this range tree */
+int range_tree_clear(struct range_tree *rt, u32 start, u32 len)
+{
+ u32 last = start + len - 1;
+ struct range_node *new_rn;
+ struct range_node *rn;
+
+ while ((rn = range_it_iter_first(rt, start, last))) {
+ if (rn->rn_start < start && rn->rn_last > last) {
+ u32 old_last = rn->rn_last;
+
+ /* Overlaps with the entire clearing range */
+ range_it_remove(rn, rt);
+ rn->rn_last = start - 1;
+ range_it_insert(rn, rt);
+
+ /* Add a range */
+ migrate_disable();
+ new_rn = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node));
+ migrate_enable();
+ if (!new_rn)
+ return -ENOMEM;
+ new_rn->rn_start = last + 1;
+ new_rn->rn_last = old_last;
+ range_it_insert(new_rn, rt);
+ } else if (rn->rn_start < start) {
+ /* Overlaps with the left side of the clearing range */
+ range_it_remove(rn, rt);
+ rn->rn_last = start - 1;
+ range_it_insert(rn, rt);
+ } else if (rn->rn_last > last) {
+ /* Overlaps with the right side of the clearing range */
+ range_it_remove(rn, rt);
+ rn->rn_start = last + 1;
+ range_it_insert(rn, rt);
+ break;
+ } else {
+ /* in the middle of the clearing range */
+ range_it_remove(rn, rt);
+ migrate_disable();
+ bpf_mem_free(&bpf_global_ma, rn);
+ migrate_enable();
+ }
+ }
+ return 0;
+}
+
+/* Is the whole range set ? */
+int is_range_tree_set(struct range_tree *rt, u32 start, u32 len)
+{
+ u32 last = start + len - 1;
+ struct range_node *left;
+
+ /* Is this whole range set ? */
+ left = range_it_iter_first(rt, start, last);
+ if (left && left->rn_start <= start && left->rn_last >= last)
+ return 0;
+ return -ESRCH;
+}
+
+/* Set the range in this range tree */
+int range_tree_set(struct range_tree *rt, u32 start, u32 len)
+{
+ u32 last = start + len - 1;
+ struct range_node *right;
+ struct range_node *left;
+ int err;
+
+ /* Is this whole range already set ? */
+ left = range_it_iter_first(rt, start, last);
+ if (left && left->rn_start <= start && left->rn_last >= last)
+ return 0;
+
+ /* Clear out everything in the range we want to set. */
+ err = range_tree_clear(rt, start, len);
+ if (err)
+ return err;
+
+ /* Do we have a left-adjacent range ? */
+ left = range_it_iter_first(rt, start - 1, start - 1);
+ if (left && left->rn_last + 1 != start)
+ return -EFAULT;
+
+ /* Do we have a right-adjacent range ? */
+ right = range_it_iter_first(rt, last + 1, last + 1);
+ if (right && right->rn_start != last + 1)
+ return -EFAULT;
+
+ if (left && right) {
+ /* Combine left and right adjacent ranges */
+ range_it_remove(left, rt);
+ range_it_remove(right, rt);
+ left->rn_last = right->rn_last;
+ range_it_insert(left, rt);
+ migrate_disable();
+ bpf_mem_free(&bpf_global_ma, right);
+ migrate_enable();
+ } else if (left) {
+ /* Combine with the left range */
+ range_it_remove(left, rt);
+ left->rn_last = last;
+ range_it_insert(left, rt);
+ } else if (right) {
+ /* Combine with the right range */
+ range_it_remove(right, rt);
+ right->rn_start = start;
+ range_it_insert(right, rt);
+ } else {
+ migrate_disable();
+ left = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node));
+ migrate_enable();
+ if (!left)
+ return -ENOMEM;
+ left->rn_start = start;
+ left->rn_last = last;
+ range_it_insert(left, rt);
+ }
+ return 0;
+}
+
+void range_tree_destroy(struct range_tree *rt)
+{
+ struct range_node *rn;
+
+ while ((rn = range_it_iter_first(rt, 0, -1U))) {
+ range_it_remove(rn, rt);
+ bpf_mem_free(&bpf_global_ma, rn);
+ }
+}
+
+void range_tree_init(struct range_tree *rt)
+{
+ rt->it_root = RB_ROOT_CACHED;
+ rt->range_size_root = RB_ROOT_CACHED;
+}
diff --git a/kernel/bpf/range_tree.h b/kernel/bpf/range_tree.h
new file mode 100644
index 000000000000..ff0b9110eb71
--- /dev/null
+++ b/kernel/bpf/range_tree.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#ifndef _RANGE_TREE_H
+#define _RANGE_TREE_H 1
+
+struct range_tree {
+ /* root of interval tree */
+ struct rb_root_cached it_root;
+ /* root of rbtree of interval sizes */
+ struct rb_root_cached range_size_root;
+};
+
+void range_tree_init(struct range_tree *rt);
+void range_tree_destroy(struct range_tree *rt);
+
+int range_tree_clear(struct range_tree *rt, u32 start, u32 len);
+int range_tree_set(struct range_tree *rt, u32 start, u32 len);
+int is_range_tree_set(struct range_tree *rt, u32 start, u32 len);
+s64 range_tree_find(struct range_tree *rt, u32 len);
+
+#endif
diff --git a/kernel/bpf/relo_core.c b/kernel/bpf/relo_core.c
new file mode 100644
index 000000000000..aa822c9fcfde
--- /dev/null
+++ b/kernel/bpf/relo_core.c
@@ -0,0 +1,2 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+#include "../../tools/lib/bpf/relo_core.c"
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 4b4f9670f1a9..49b8e5a0c6b4 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -308,7 +308,7 @@ put_file_unlock:
spin_unlock_bh(&reuseport_lock);
put_file:
- fput(socket->file);
+ sockfd_put(socket);
return err;
}
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 0ee653a936ea..1499d8caa9a3 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -29,7 +29,7 @@ struct bpf_ringbuf {
u64 mask;
struct page **pages;
int nr_pages;
- spinlock_t spinlock ____cacheline_aligned_in_smp;
+ raw_spinlock_t spinlock ____cacheline_aligned_in_smp;
/* For user-space producer ring buffers, an atomic_t busy bit is used
* to synchronize access to the ring buffers in the kernel, rather than
* the spinlock that is used for kernel-producer ring buffers. This is
@@ -51,7 +51,8 @@ struct bpf_ringbuf {
* This prevents a user-space application from modifying the
* position and ruining in-kernel tracking. The permissions of the
* pages depend on who is producing samples: user-space or the
- * kernel.
+ * kernel. Note that the pending counter is placed in the same
+ * page as the producer, so that it shares the same cache line.
*
* Kernel-producer
* ---------------
@@ -70,6 +71,7 @@ struct bpf_ringbuf {
*/
unsigned long consumer_pos __aligned(PAGE_SIZE);
unsigned long producer_pos __aligned(PAGE_SIZE);
+ unsigned long pending_pos;
char data[] __aligned(PAGE_SIZE);
};
@@ -171,7 +173,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
if (!rb)
return NULL;
- spin_lock_init(&rb->spinlock);
+ raw_spin_lock_init(&rb->spinlock);
atomic_set(&rb->busy, 0);
init_waitqueue_head(&rb->waitq);
init_irq_work(&rb->work, bpf_ringbuf_notify);
@@ -179,6 +181,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
rb->mask = data_sz - 1;
rb->consumer_pos = 0;
rb->producer_pos = 0;
+ rb->pending_pos = 0;
return rb;
}
@@ -265,8 +268,6 @@ static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma
/* allow writable mapping for the consumer_pos only */
if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE)
return -EPERM;
- } else {
- vm_flags_clear(vma, VM_MAYWRITE);
}
/* remap_vmalloc_range() checks size and offset constraints */
return remap_vmalloc_range(vma, rb_map->rb,
@@ -286,8 +287,6 @@ static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma
* position, and the ring buffer data itself.
*/
return -EPERM;
- } else {
- vm_flags_clear(vma, VM_MAYWRITE);
}
/* remap_vmalloc_range() checks size and offset constraints */
return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF);
@@ -404,9 +403,9 @@ bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr)
static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
{
- unsigned long cons_pos, prod_pos, new_prod_pos, flags;
- u32 len, pg_off;
+ unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, flags;
struct bpf_ringbuf_hdr *hdr;
+ u32 len, pg_off, tmp_size, hdr_len;
if (unlikely(size > RINGBUF_MAX_RECORD_SZ))
return NULL;
@@ -418,20 +417,36 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
cons_pos = smp_load_acquire(&rb->consumer_pos);
if (in_nmi()) {
- if (!spin_trylock_irqsave(&rb->spinlock, flags))
+ if (!raw_spin_trylock_irqsave(&rb->spinlock, flags))
return NULL;
} else {
- spin_lock_irqsave(&rb->spinlock, flags);
+ raw_spin_lock_irqsave(&rb->spinlock, flags);
}
+ pend_pos = rb->pending_pos;
prod_pos = rb->producer_pos;
new_prod_pos = prod_pos + len;
- /* check for out of ringbuf space by ensuring producer position
- * doesn't advance more than (ringbuf_size - 1) ahead
+ while (pend_pos < prod_pos) {
+ hdr = (void *)rb->data + (pend_pos & rb->mask);
+ hdr_len = READ_ONCE(hdr->len);
+ if (hdr_len & BPF_RINGBUF_BUSY_BIT)
+ break;
+ tmp_size = hdr_len & ~BPF_RINGBUF_DISCARD_BIT;
+ tmp_size = round_up(tmp_size + BPF_RINGBUF_HDR_SZ, 8);
+ pend_pos += tmp_size;
+ }
+ rb->pending_pos = pend_pos;
+
+ /* check for out of ringbuf space:
+ * - by ensuring producer position doesn't advance more than
+ * (ringbuf_size - 1) ahead
+ * - by ensuring oldest not yet committed record until newest
+ * record does not span more than (ringbuf_size - 1)
*/
- if (new_prod_pos - cons_pos > rb->mask) {
- spin_unlock_irqrestore(&rb->spinlock, flags);
+ if (new_prod_pos - cons_pos > rb->mask ||
+ new_prod_pos - pend_pos > rb->mask) {
+ raw_spin_unlock_irqrestore(&rb->spinlock, flags);
return NULL;
}
@@ -443,7 +458,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
/* pairs with consumer's smp_load_acquire() */
smp_store_release(&rb->producer_pos, new_prod_pos);
- spin_unlock_irqrestore(&rb->spinlock, flags);
+ raw_spin_unlock_irqrestore(&rb->spinlock, flags);
return (void *)hdr + BPF_RINGBUF_HDR_SZ;
}
@@ -613,7 +628,7 @@ const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = {
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT,
+ .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT | MEM_WRITE,
};
BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index dff7ba539701..3615c06b7dfa 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -91,11 +91,14 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
} else if (value_size / 8 > sysctl_perf_event_max_stack)
return ERR_PTR(-EINVAL);
- /* hash table size must be power of 2 */
- n_buckets = roundup_pow_of_two(attr->max_entries);
- if (!n_buckets)
+ /* hash table size must be power of 2; roundup_pow_of_two() can overflow
+ * into UB on 32-bit arches, so check that first
+ */
+ if (attr->max_entries > 1UL << 31)
return ERR_PTR(-E2BIG);
+ n_buckets = roundup_pow_of_two(attr->max_entries);
+
cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
if (!smap)
@@ -121,8 +124,24 @@ free_smap:
return ERR_PTR(err);
}
+static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, bool may_fault)
+{
+ return may_fault ? build_id_parse(vma, build_id, NULL)
+ : build_id_parse_nofault(vma, build_id, NULL);
+}
+
+/*
+ * Expects all id_offs[i].ip values to be set to correct initial IPs.
+ * They will be subsequently:
+ * - either adjusted in place to a file offset, if build ID fetching
+ * succeeds; in this case id_offs[i].build_id is set to correct build ID,
+ * and id_offs[i].status is set to BPF_STACK_BUILD_ID_VALID;
+ * - or IP will be kept intact, if build ID fetching failed; in this case
+ * id_offs[i].build_id is zeroed out and id_offs[i].status is set to
+ * BPF_STACK_BUILD_ID_IP.
+ */
static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
- u64 *ips, u32 trace_nr, bool user)
+ u32 trace_nr, bool user, bool may_fault)
{
int i;
struct mmap_unlock_irq_work *work = NULL;
@@ -139,30 +158,28 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
/* cannot access current->mm, fall back to ips */
for (i = 0; i < trace_nr; i++) {
id_offs[i].status = BPF_STACK_BUILD_ID_IP;
- id_offs[i].ip = ips[i];
memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
}
return;
}
for (i = 0; i < trace_nr; i++) {
- if (range_in_vma(prev_vma, ips[i], ips[i])) {
+ u64 ip = READ_ONCE(id_offs[i].ip);
+
+ if (range_in_vma(prev_vma, ip, ip)) {
vma = prev_vma;
- memcpy(id_offs[i].build_id, prev_build_id,
- BUILD_ID_SIZE_MAX);
+ memcpy(id_offs[i].build_id, prev_build_id, BUILD_ID_SIZE_MAX);
goto build_id_valid;
}
- vma = find_vma(current->mm, ips[i]);
- if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) {
+ vma = find_vma(current->mm, ip);
+ if (!vma || fetch_build_id(vma, id_offs[i].build_id, may_fault)) {
/* per entry fall back to ips */
id_offs[i].status = BPF_STACK_BUILD_ID_IP;
- id_offs[i].ip = ips[i];
memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
continue;
}
build_id_valid:
- id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
- - vma->vm_start;
+ id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ip - vma->vm_start;
id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
prev_vma = vma;
prev_build_id = id_offs[i].build_id;
@@ -213,7 +230,7 @@ static long __bpf_get_stackid(struct bpf_map *map,
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
- u32 hash, id, trace_nr, trace_len;
+ u32 hash, id, trace_nr, trace_len, i;
bool user = flags & BPF_F_USER_STACK;
u64 *ips;
bool hash_matches;
@@ -235,15 +252,18 @@ static long __bpf_get_stackid(struct bpf_map *map,
return id;
if (stack_map_use_build_id(map)) {
+ struct bpf_stack_build_id *id_offs;
+
/* for build_id+offset, pop a bucket before slow cmp */
new_bucket = (struct stack_map_bucket *)
pcpu_freelist_pop(&smap->freelist);
if (unlikely(!new_bucket))
return -ENOMEM;
new_bucket->nr = trace_nr;
- stack_map_get_build_id_offset(
- (struct bpf_stack_build_id *)new_bucket->data,
- ips, trace_nr, user);
+ id_offs = (struct bpf_stack_build_id *)new_bucket->data;
+ for (i = 0; i < trace_nr; i++)
+ id_offs[i].ip = ips[i];
+ stack_map_get_build_id_offset(id_offs, trace_nr, user, false /* !may_fault */);
trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
if (hash_matches && bucket->nr == trace_nr &&
memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
@@ -384,7 +404,7 @@ const struct bpf_func_proto bpf_get_stackid_proto_pe = {
static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
struct perf_callchain_entry *trace_in,
- void *buf, u32 size, u64 flags)
+ void *buf, u32 size, u64 flags, bool may_fault)
{
u32 trace_nr, copy_len, elem_size, num_elem, max_depth;
bool user_build_id = flags & BPF_F_USER_BUILD_ID;
@@ -402,8 +422,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
if (kernel && user_build_id)
goto clear;
- elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
- : sizeof(u64);
+ elem_size = user_build_id ? sizeof(struct bpf_stack_build_id) : sizeof(u64);
if (unlikely(size % elem_size))
goto clear;
@@ -424,6 +443,9 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
if (sysctl_perf_event_max_stack < max_depth)
max_depth = sysctl_perf_event_max_stack;
+ if (may_fault)
+ rcu_read_lock(); /* need RCU for perf's callchain below */
+
if (trace_in)
trace = trace_in;
else if (kernel && task)
@@ -431,21 +453,34 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
else
trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
crosstask, false);
- if (unlikely(!trace))
- goto err_fault;
- if (trace->nr < skip)
+ if (unlikely(!trace) || trace->nr < skip) {
+ if (may_fault)
+ rcu_read_unlock();
goto err_fault;
+ }
trace_nr = trace->nr - skip;
trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
copy_len = trace_nr * elem_size;
ips = trace->ip + skip;
- if (user && user_build_id)
- stack_map_get_build_id_offset(buf, ips, trace_nr, user);
- else
+ if (user_build_id) {
+ struct bpf_stack_build_id *id_offs = buf;
+ u32 i;
+
+ for (i = 0; i < trace_nr; i++)
+ id_offs[i].ip = ips[i];
+ } else {
memcpy(buf, ips, copy_len);
+ }
+
+ /* trace/ips should not be dereferenced after this point */
+ if (may_fault)
+ rcu_read_unlock();
+
+ if (user_build_id)
+ stack_map_get_build_id_offset(buf, trace_nr, user, may_fault);
if (size > copy_len)
memset(buf + copy_len, 0, size - copy_len);
@@ -461,7 +496,7 @@ clear:
BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
u64, flags)
{
- return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
+ return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */);
}
const struct bpf_func_proto bpf_get_stack_proto = {
@@ -474,8 +509,24 @@ const struct bpf_func_proto bpf_get_stack_proto = {
.arg4_type = ARG_ANYTHING,
};
-BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
- u32, size, u64, flags)
+BPF_CALL_4(bpf_get_stack_sleepable, struct pt_regs *, regs, void *, buf, u32, size,
+ u64, flags)
+{
+ return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, true /* may_fault */);
+}
+
+const struct bpf_func_proto bpf_get_stack_sleepable_proto = {
+ .func = bpf_get_stack_sleepable,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static long __bpf_get_task_stack(struct task_struct *task, void *buf, u32 size,
+ u64 flags, bool may_fault)
{
struct pt_regs *regs;
long res = -EINVAL;
@@ -485,12 +536,18 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
regs = task_pt_regs(task);
if (regs)
- res = __bpf_get_stack(regs, task, NULL, buf, size, flags);
+ res = __bpf_get_stack(regs, task, NULL, buf, size, flags, may_fault);
put_task_stack(task);
return res;
}
+BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
+ u32, size, u64, flags)
+{
+ return __bpf_get_task_stack(task, buf, size, flags, false /* !may_fault */);
+}
+
const struct bpf_func_proto bpf_get_task_stack_proto = {
.func = bpf_get_task_stack,
.gpl_only = false,
@@ -502,6 +559,23 @@ const struct bpf_func_proto bpf_get_task_stack_proto = {
.arg4_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_get_task_stack_sleepable, struct task_struct *, task, void *, buf,
+ u32, size, u64, flags)
+{
+ return __bpf_get_task_stack(task, buf, size, flags, true /* !may_fault */);
+}
+
+const struct bpf_func_proto bpf_get_task_stack_sleepable_proto = {
+ .func = bpf_get_task_stack_sleepable,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
void *, buf, u32, size, u64, flags)
{
@@ -513,7 +587,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
__u64 nr_kernel;
if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
- return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
+ return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */);
if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
BPF_F_USER_BUILD_ID)))
@@ -533,7 +607,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
__u64 nr = trace->nr;
trace->nr = nr_kernel;
- err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
+ err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */);
/* restore nr */
trace->nr = nr;
@@ -545,7 +619,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
goto clear;
flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
- err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
+ err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */);
}
return err;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a1f18681721c..e1e42e918ba7 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -35,6 +35,7 @@
#include <linux/rcupdate_trace.h>
#include <linux/memcontrol.h>
#include <linux/trace_events.h>
+#include <linux/tracepoint.h>
#include <net/netfilter/nf_bpf_link.h>
#include <net/netkit.h>
@@ -155,6 +156,89 @@ static void maybe_wait_bpf_programs(struct bpf_map *map)
synchronize_rcu();
}
+static void unpin_uptr_kaddr(void *kaddr)
+{
+ if (kaddr)
+ unpin_user_page(virt_to_page(kaddr));
+}
+
+static void __bpf_obj_unpin_uptrs(struct btf_record *rec, u32 cnt, void *obj)
+{
+ const struct btf_field *field;
+ void **uptr_addr;
+ int i;
+
+ for (i = 0, field = rec->fields; i < cnt; i++, field++) {
+ if (field->type != BPF_UPTR)
+ continue;
+
+ uptr_addr = obj + field->offset;
+ unpin_uptr_kaddr(*uptr_addr);
+ }
+}
+
+static void bpf_obj_unpin_uptrs(struct btf_record *rec, void *obj)
+{
+ if (!btf_record_has_field(rec, BPF_UPTR))
+ return;
+
+ __bpf_obj_unpin_uptrs(rec, rec->cnt, obj);
+}
+
+static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj)
+{
+ const struct btf_field *field;
+ const struct btf_type *t;
+ unsigned long start, end;
+ struct page *page;
+ void **uptr_addr;
+ int i, err;
+
+ if (!btf_record_has_field(rec, BPF_UPTR))
+ return 0;
+
+ for (i = 0, field = rec->fields; i < rec->cnt; i++, field++) {
+ if (field->type != BPF_UPTR)
+ continue;
+
+ uptr_addr = obj + field->offset;
+ start = *(unsigned long *)uptr_addr;
+ if (!start)
+ continue;
+
+ t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id);
+ /* t->size was checked for zero before */
+ if (check_add_overflow(start, t->size - 1, &end)) {
+ err = -EFAULT;
+ goto unpin_all;
+ }
+
+ /* The uptr's struct cannot span across two pages */
+ if ((start & PAGE_MASK) != (end & PAGE_MASK)) {
+ err = -EOPNOTSUPP;
+ goto unpin_all;
+ }
+
+ err = pin_user_pages_fast(start, 1, FOLL_LONGTERM | FOLL_WRITE, &page);
+ if (err != 1)
+ goto unpin_all;
+
+ if (PageHighMem(page)) {
+ err = -EOPNOTSUPP;
+ unpin_user_page(page);
+ goto unpin_all;
+ }
+
+ *uptr_addr = page_address(page) + offset_in_page(start);
+ }
+
+ return 0;
+
+unpin_all:
+ __bpf_obj_unpin_uptrs(rec, i, obj);
+ return err;
+}
+
static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
void *key, void *value, __u64 flags)
{
@@ -164,6 +248,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
if (bpf_map_is_offloaded(map)) {
return bpf_map_offload_update_elem(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
+ map->map_type == BPF_MAP_TYPE_ARENA ||
map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
return map->ops->map_update_elem(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
@@ -198,9 +283,14 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
err = map->ops->map_push_elem(map, value, flags);
} else {
- rcu_read_lock();
- err = map->ops->map_update_elem(map, key, value, flags);
- rcu_read_unlock();
+ err = bpf_obj_pin_uptrs(map->record, value);
+ if (!err) {
+ rcu_read_lock();
+ err = map->ops->map_update_elem(map, key, value, flags);
+ rcu_read_unlock();
+ if (err)
+ bpf_obj_unpin_uptrs(map->record, value);
+ }
}
bpf_enable_instrumentation();
@@ -384,7 +474,7 @@ void bpf_map_free_id(struct bpf_map *map)
spin_unlock_irqrestore(&map_idr_lock, flags);
}
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
static void bpf_map_save_memcg(struct bpf_map *map)
{
/* Currently if a map is created by a process belonging to the root
@@ -479,6 +569,39 @@ static void bpf_map_release_memcg(struct bpf_map *map)
}
#endif
+int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
+ unsigned long nr_pages, struct page **pages)
+{
+ unsigned long i, j;
+ struct page *pg;
+ int ret = 0;
+#ifdef CONFIG_MEMCG
+ struct mem_cgroup *memcg, *old_memcg;
+
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
+#endif
+ for (i = 0; i < nr_pages; i++) {
+ pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0);
+
+ if (pg) {
+ pages[i] = pg;
+ continue;
+ }
+ for (j = 0; j < i; j++)
+ __free_page(pages[j]);
+ ret = -ENOMEM;
+ break;
+ }
+
+#ifdef CONFIG_MEMCG
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+#endif
+ return ret;
+}
+
+
static int btf_field_cmp(const void *a, const void *b)
{
const struct btf_field *f1 = a, *f2 = b;
@@ -514,9 +637,11 @@ void btf_record_free(struct btf_record *rec)
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
if (rec->fields[i].kptr.module)
module_put(rec->fields[i].kptr.module);
- btf_put(rec->fields[i].kptr.btf);
+ if (btf_is_kernel(rec->fields[i].kptr.btf))
+ btf_put(rec->fields[i].kptr.btf);
break;
case BPF_LIST_HEAD:
case BPF_LIST_NODE:
@@ -525,6 +650,7 @@ void btf_record_free(struct btf_record *rec)
case BPF_SPIN_LOCK:
case BPF_TIMER:
case BPF_REFCOUNT:
+ case BPF_WORKQUEUE:
/* Nothing to release */
break;
default:
@@ -561,7 +687,9 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
- btf_get(fields[i].kptr.btf);
+ case BPF_UPTR:
+ if (btf_is_kernel(fields[i].kptr.btf))
+ btf_get(fields[i].kptr.btf);
if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
ret = -ENXIO;
goto free;
@@ -574,6 +702,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
case BPF_SPIN_LOCK:
case BPF_TIMER:
case BPF_REFCOUNT:
+ case BPF_WORKQUEUE:
/* Nothing to acquire */
break;
default:
@@ -625,6 +754,13 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
bpf_timer_cancel_and_free(obj + rec->timer_off);
}
+void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj)
+{
+ if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_WORKQUEUE)))
+ return;
+ bpf_wq_cancel_and_free(obj + rec->wq_off);
+}
+
void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
{
const struct btf_field *fields;
@@ -645,6 +781,9 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
case BPF_TIMER:
bpf_timer_cancel_and_free(field_ptr);
break;
+ case BPF_WORKQUEUE:
+ bpf_wq_cancel_and_free(field_ptr);
+ break;
case BPF_KPTR_UNREF:
WRITE_ONCE(*(u64 *)field_ptr, 0);
break;
@@ -657,15 +796,17 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
if (!btf_is_kernel(field->kptr.btf)) {
pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
field->kptr.btf_id);
- migrate_disable();
__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
pointee_struct_meta->record : NULL,
fields[i].type == BPF_KPTR_PERCPU);
- migrate_enable();
} else {
field->kptr.dtor(xchgd_field);
}
break;
+ case BPF_UPTR:
+ /* The caller ensured that no one is using the uptr */
+ unpin_uptr_kaddr(*(void **)field_ptr);
+ break;
case BPF_LIST_HEAD:
if (WARN_ON_ONCE(rec->spin_lock_off < 0))
continue;
@@ -687,17 +828,19 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
}
}
-/* called from workqueue */
-static void bpf_map_free_deferred(struct work_struct *work)
+static void bpf_map_free(struct bpf_map *map)
{
- struct bpf_map *map = container_of(work, struct bpf_map, work);
struct btf_record *rec = map->record;
struct btf *btf = map->btf;
- security_bpf_map_free(map);
- bpf_map_release_memcg(map);
- /* implementation dependent freeing */
+ /* implementation dependent freeing. Disabling migration to simplify
+ * the free of values or special fields allocated from bpf memory
+ * allocator.
+ */
+ migrate_disable();
map->ops->map_free(map);
+ migrate_enable();
+
/* Delay freeing of btf_record for maps, as map_free
* callback usually needs access to them. It is better to do it here
* than require each callback to do the free itself manually.
@@ -714,6 +857,16 @@ static void bpf_map_free_deferred(struct work_struct *work)
btf_put(btf);
}
+/* called from workqueue */
+static void bpf_map_free_deferred(struct work_struct *work)
+{
+ struct bpf_map *map = container_of(work, struct bpf_map, work);
+
+ security_bpf_map_free(map);
+ bpf_map_release_memcg(map);
+ bpf_map_free(map);
+}
+
static void bpf_map_put_uref(struct bpf_map *map)
{
if (atomic64_dec_and_test(&map->usercnt)) {
@@ -783,7 +936,7 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
{
- fmode_t mode = f.file->f_mode;
+ fmode_t mode = fd_file(f)->f_mode;
/* Our file permissions may have been overridden by global
* map permissions facing syscall side.
@@ -882,7 +1035,7 @@ static const struct vm_operations_struct bpf_map_default_vmops = {
static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct bpf_map *map = filp->private_data;
- int err;
+ int err = 0;
if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record))
return -ENOTSUPP;
@@ -906,24 +1059,33 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
err = -EACCES;
goto out;
}
+ bpf_map_write_active_inc(map);
}
+out:
+ mutex_unlock(&map->freeze_mutex);
+ if (err)
+ return err;
/* set default open/close callbacks */
vma->vm_ops = &bpf_map_default_vmops;
vma->vm_private_data = map;
vm_flags_clear(vma, VM_MAYEXEC);
+ /* If mapping is read-only, then disallow potentially re-mapping with
+ * PROT_WRITE by dropping VM_MAYWRITE flag. This VM_MAYWRITE clearing
+ * means that as far as BPF map's memory-mapped VMAs are concerned,
+ * VM_WRITE and VM_MAYWRITE and equivalent, if one of them is set,
+ * both should be set, so we can forget about VM_MAYWRITE and always
+ * check just VM_WRITE
+ */
if (!(vma->vm_flags & VM_WRITE))
- /* disallow re-mapping with PROT_WRITE */
vm_flags_clear(vma, VM_MAYWRITE);
err = map->ops->map_mmap(map, vma);
- if (err)
- goto out;
+ if (err) {
+ if (vma->vm_flags & VM_WRITE)
+ bpf_map_write_active_dec(map);
+ }
- if (vma->vm_flags & VM_MAYWRITE)
- bpf_map_write_active_inc(map);
-out:
- mutex_unlock(&map->freeze_mutex);
return err;
}
@@ -937,6 +1099,21 @@ static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts)
return EPOLLERR;
}
+static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ struct bpf_map *map = filp->private_data;
+
+ if (map->ops->map_get_unmapped_area)
+ return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags);
+#ifdef CONFIG_MMU
+ return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
+#else
+ return addr;
+#endif
+}
+
const struct file_operations bpf_map_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_map_show_fdinfo,
@@ -946,6 +1123,7 @@ const struct file_operations bpf_map_fops = {
.write = bpf_dummy_write,
.mmap = bpf_map_mmap,
.poll = bpf_map_poll,
+ .get_unmapped_area = bpf_get_unmapped_area,
};
int bpf_map_new_fd(struct bpf_map *map, int flags)
@@ -1011,8 +1189,8 @@ int map_check_no_btf(const struct bpf_map *map,
return -ENOTSUPP;
}
-static int map_check_btf(struct bpf_map *map, const struct btf *btf,
- u32 btf_key_id, u32 btf_value_id)
+static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
+ const struct btf *btf, u32 btf_key_id, u32 btf_value_id)
{
const struct btf_type *key_type, *value_type;
u32 key_size, value_size;
@@ -1035,12 +1213,12 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
map->record = btf_parse_fields(btf, value_type,
BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
- BPF_RB_ROOT | BPF_REFCOUNT,
+ BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR,
map->value_size);
if (!IS_ERR_OR_NULL(map->record)) {
int i;
- if (!bpf_capable()) {
+ if (!bpf_token_capable(token, CAP_BPF)) {
ret = -EPERM;
goto free_map_tab;
}
@@ -1065,6 +1243,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
}
break;
case BPF_TIMER:
+ case BPF_WORKQUEUE:
if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY) {
@@ -1090,6 +1269,12 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
goto free_map_tab;
}
break;
+ case BPF_UPTR:
+ if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
case BPF_LIST_HEAD:
case BPF_RB_ROOT:
if (map->map_type != BPF_MAP_TYPE_HASH &&
@@ -1123,14 +1308,21 @@ free_map_tab:
return ret;
}
-#define BPF_MAP_CREATE_LAST_FIELD map_extra
+static bool bpf_net_capable(void)
+{
+ return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
+}
+
+#define BPF_MAP_CREATE_LAST_FIELD map_token_fd
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
const struct bpf_map_ops *ops;
+ struct bpf_token *token = NULL;
int numa_node = bpf_map_attr_numa_node(attr);
u32 map_type = attr->map_type;
struct bpf_map *map;
+ bool token_flag;
int f_flags;
int err;
@@ -1138,6 +1330,12 @@ static int map_create(union bpf_attr *attr)
if (err)
return -EINVAL;
+ /* check BPF_F_TOKEN_FD flag, remember if it's set, and then clear it
+ * to avoid per-map type checks tripping on unknown flag
+ */
+ token_flag = attr->map_flags & BPF_F_TOKEN_FD;
+ attr->map_flags &= ~BPF_F_TOKEN_FD;
+
if (attr->btf_vmlinux_value_type_id) {
if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
attr->btf_key_type_id || attr->btf_value_type_id)
@@ -1147,6 +1345,7 @@ static int map_create(union bpf_attr *attr)
}
if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
+ attr->map_type != BPF_MAP_TYPE_ARENA &&
attr->map_extra != 0)
return -EINVAL;
@@ -1178,14 +1377,32 @@ static int map_create(union bpf_attr *attr)
if (!ops->map_mem_usage)
return -EINVAL;
+ if (token_flag) {
+ token = bpf_token_get_from_fd(attr->map_token_fd);
+ if (IS_ERR(token))
+ return PTR_ERR(token);
+
+ /* if current token doesn't grant map creation permissions,
+ * then we can't use this token, so ignore it and rely on
+ * system-wide capabilities checks
+ */
+ if (!bpf_token_allow_cmd(token, BPF_MAP_CREATE) ||
+ !bpf_token_allow_map_type(token, attr->map_type)) {
+ bpf_token_put(token);
+ token = NULL;
+ }
+ }
+
+ err = -EPERM;
+
/* Intent here is for unprivileged_bpf_disabled to block BPF map
* creation for unprivileged users; other actions depend
* on fd availability and access to bpffs, so are dependent on
* object creation success. Even with unprivileged BPF disabled,
* capability checks are still carried out.
*/
- if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
- return -EPERM;
+ if (sysctl_unprivileged_bpf_disabled && !bpf_token_capable(token, CAP_BPF))
+ goto put_token;
/* check privileged map type permissions */
switch (map_type) {
@@ -1218,25 +1435,28 @@ static int map_create(union bpf_attr *attr)
case BPF_MAP_TYPE_LRU_PERCPU_HASH:
case BPF_MAP_TYPE_STRUCT_OPS:
case BPF_MAP_TYPE_CPUMAP:
- if (!bpf_capable())
- return -EPERM;
+ case BPF_MAP_TYPE_ARENA:
+ if (!bpf_token_capable(token, CAP_BPF))
+ goto put_token;
break;
case BPF_MAP_TYPE_SOCKMAP:
case BPF_MAP_TYPE_SOCKHASH:
case BPF_MAP_TYPE_DEVMAP:
case BPF_MAP_TYPE_DEVMAP_HASH:
case BPF_MAP_TYPE_XSKMAP:
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
+ if (!bpf_token_capable(token, CAP_NET_ADMIN))
+ goto put_token;
break;
default:
WARN(1, "unsupported map type %d", map_type);
- return -EPERM;
+ goto put_token;
}
map = ops->map_alloc(attr);
- if (IS_ERR(map))
- return PTR_ERR(map);
+ if (IS_ERR(map)) {
+ err = PTR_ERR(map);
+ goto put_token;
+ }
map->ops = ops;
map->map_type = map_type;
@@ -1273,7 +1493,7 @@ static int map_create(union bpf_attr *attr)
map->btf = btf;
if (attr->btf_value_type_id) {
- err = map_check_btf(map, btf, attr->btf_key_type_id,
+ err = map_check_btf(map, token, btf, attr->btf_key_type_id,
attr->btf_value_type_id);
if (err)
goto free_map;
@@ -1285,15 +1505,16 @@ static int map_create(union bpf_attr *attr)
attr->btf_vmlinux_value_type_id;
}
- err = security_bpf_map_alloc(map);
+ err = security_bpf_map_create(map, attr, token);
if (err)
- goto free_map;
+ goto free_map_sec;
err = bpf_map_alloc_id(map);
if (err)
goto free_map_sec;
bpf_map_save_memcg(map);
+ bpf_token_put(token);
err = bpf_map_new_fd(map, f_flags);
if (err < 0) {
@@ -1312,26 +1533,12 @@ static int map_create(union bpf_attr *attr)
free_map_sec:
security_bpf_map_free(map);
free_map:
- btf_put(map->btf);
- map->ops->map_free(map);
+ bpf_map_free(map);
+put_token:
+ bpf_token_put(token);
return err;
}
-/* if error is returned, fd is released.
- * On success caller should complete fd access with matching fdput()
- */
-struct bpf_map *__bpf_map_get(struct fd f)
-{
- if (!f.file)
- return ERR_PTR(-EBADF);
- if (f.file->f_op != &bpf_map_fops) {
- fdput(f);
- return ERR_PTR(-EINVAL);
- }
-
- return f.file->private_data;
-}
-
void bpf_map_inc(struct bpf_map *map)
{
atomic64_inc(&map->refcnt);
@@ -1347,15 +1554,11 @@ EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref);
struct bpf_map *bpf_map_get(u32 ufd)
{
- struct fd f = fdget(ufd);
- struct bpf_map *map;
+ CLASS(fd, f)(ufd);
+ struct bpf_map *map = __bpf_map_get(f);
- map = __bpf_map_get(f);
- if (IS_ERR(map))
- return map;
-
- bpf_map_inc(map);
- fdput(f);
+ if (!IS_ERR(map))
+ bpf_map_inc(map);
return map;
}
@@ -1363,15 +1566,11 @@ EXPORT_SYMBOL(bpf_map_get);
struct bpf_map *bpf_map_get_with_uref(u32 ufd)
{
- struct fd f = fdget(ufd);
- struct bpf_map *map;
+ CLASS(fd, f)(ufd);
+ struct bpf_map *map = __bpf_map_get(f);
- map = __bpf_map_get(f);
- if (IS_ERR(map))
- return map;
-
- bpf_map_inc_with_uref(map);
- fdput(f);
+ if (!IS_ERR(map))
+ bpf_map_inc_with_uref(map);
return map;
}
@@ -1436,11 +1635,9 @@ static int map_lookup_elem(union bpf_attr *attr)
{
void __user *ukey = u64_to_user_ptr(attr->key);
void __user *uvalue = u64_to_user_ptr(attr->value);
- int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value;
u32 value_size;
- struct fd f;
int err;
if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
@@ -1449,26 +1646,20 @@ static int map_lookup_elem(union bpf_attr *attr)
if (attr->flags & ~BPF_F_LOCK)
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
- if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
- err = -EPERM;
- goto err_put;
- }
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ))
+ return -EPERM;
if ((attr->flags & BPF_F_LOCK) &&
- !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
- err = -EINVAL;
- goto err_put;
- }
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK))
+ return -EINVAL;
key = __bpf_copy_key(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto err_put;
- }
+ if (IS_ERR(key))
+ return PTR_ERR(key);
value_size = bpf_map_value_size(map);
@@ -1499,8 +1690,6 @@ free_value:
kvfree(value);
free_key:
kvfree(key);
-err_put:
- fdput(f);
return err;
}
@@ -1511,17 +1700,15 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
{
bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel);
- int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value;
u32 value_size;
- struct fd f;
int err;
if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -1550,7 +1737,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
goto free_key;
}
- err = bpf_map_update_value(map, f.file, key, value, attr->flags);
+ err = bpf_map_update_value(map, fd_file(f), key, value, attr->flags);
if (!err)
maybe_wait_bpf_programs(map);
@@ -1559,7 +1746,6 @@ free_key:
kvfree(key);
err_put:
bpf_map_write_active_dec(map);
- fdput(f);
return err;
}
@@ -1568,16 +1754,14 @@ err_put:
static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
{
bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
- int ufd = attr->map_fd;
struct bpf_map *map;
- struct fd f;
void *key;
int err;
if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -1614,7 +1798,6 @@ out:
kvfree(key);
err_put:
bpf_map_write_active_dec(map);
- fdput(f);
return err;
}
@@ -1625,30 +1808,24 @@ static int map_get_next_key(union bpf_attr *attr)
{
void __user *ukey = u64_to_user_ptr(attr->key);
void __user *unext_key = u64_to_user_ptr(attr->next_key);
- int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *next_key;
- struct fd f;
int err;
if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
- if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
- err = -EPERM;
- goto err_put;
- }
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ))
+ return -EPERM;
if (ukey) {
key = __bpf_copy_key(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto err_put;
- }
+ if (IS_ERR(key))
+ return PTR_ERR(key);
} else {
key = NULL;
}
@@ -1680,8 +1857,6 @@ free_next_key:
kvfree(next_key);
free_key:
kvfree(key);
-err_put:
- fdput(f);
return err;
}
@@ -1802,8 +1977,6 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
return err;
}
-#define MAP_LOOKUP_RETRIES 3
-
int generic_map_lookup_batch(struct bpf_map *map,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -1813,8 +1986,8 @@ int generic_map_lookup_batch(struct bpf_map *map,
void __user *values = u64_to_user_ptr(attr->batch.values);
void __user *keys = u64_to_user_ptr(attr->batch.keys);
void *buf, *buf_prevkey, *prev_key, *key, *value;
- int err, retry = MAP_LOOKUP_RETRIES;
u32 value_size, cp, max_count;
+ int err;
if (attr->batch.elem_flags & ~BPF_F_LOCK)
return -EINVAL;
@@ -1860,14 +2033,8 @@ int generic_map_lookup_batch(struct bpf_map *map,
err = bpf_map_copy_value(map, key, value,
attr->batch.elem_flags);
- if (err == -ENOENT) {
- if (retry) {
- retry--;
- continue;
- }
- err = -EINTR;
- break;
- }
+ if (err == -ENOENT)
+ goto next_key;
if (err)
goto free_buf;
@@ -1882,12 +2049,12 @@ int generic_map_lookup_batch(struct bpf_map *map,
goto free_buf;
}
+ cp++;
+next_key:
if (!prev_key)
prev_key = buf_prevkey;
swap(prev_key, key);
- retry = MAP_LOOKUP_RETRIES;
- cp++;
cond_resched();
}
@@ -1910,11 +2077,9 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
{
void __user *ukey = u64_to_user_ptr(attr->key);
void __user *uvalue = u64_to_user_ptr(attr->value);
- int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value;
u32 value_size;
- struct fd f;
int err;
if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
@@ -1923,7 +2088,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
if (attr->flags & ~BPF_F_LOCK)
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -1993,7 +2158,6 @@ free_key:
kvfree(key);
err_put:
bpf_map_write_active_dec(map);
- fdput(f);
return err;
}
@@ -2001,27 +2165,22 @@ err_put:
static int map_freeze(const union bpf_attr *attr)
{
- int err = 0, ufd = attr->map_fd;
+ int err = 0;
struct bpf_map *map;
- struct fd f;
if (CHECK_ATTR(BPF_MAP_FREEZE))
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
- if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) {
- fdput(f);
+ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record))
return -ENOTSUPP;
- }
- if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
- fdput(f);
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE))
return -EPERM;
- }
mutex_lock(&map->freeze_mutex);
if (bpf_map_write_active(map)) {
@@ -2036,7 +2195,6 @@ static int map_freeze(const union bpf_attr *attr)
WRITE_ONCE(map->frozen, true);
err_put:
mutex_unlock(&map->freeze_mutex);
- fdput(f);
return err;
}
@@ -2144,7 +2302,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
kvfree(aux->func_info);
kfree(aux->func_info_aux);
free_uid(aux->user);
- security_bpf_prog_free(aux);
+ security_bpf_prog_free(aux->prog);
bpf_prog_free(aux->prog);
}
@@ -2160,7 +2318,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
btf_put(prog->aux->attach_btf);
if (deferred) {
- if (prog->aux->sleepable)
+ if (prog->sleepable)
call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu);
else
call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
@@ -2306,18 +2464,6 @@ int bpf_prog_new_fd(struct bpf_prog *prog)
O_RDWR | O_CLOEXEC);
}
-static struct bpf_prog *____bpf_prog_get(struct fd f)
-{
- if (!f.file)
- return ERR_PTR(-EBADF);
- if (f.file->f_op != &bpf_prog_fops) {
- fdput(f);
- return ERR_PTR(-EINVAL);
- }
-
- return f.file->private_data;
-}
-
void bpf_prog_add(struct bpf_prog *prog, int i)
{
atomic64_add(i, &prog->aux->refcnt);
@@ -2373,20 +2519,19 @@ bool bpf_prog_get_ok(struct bpf_prog *prog,
static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
bool attach_drv)
{
- struct fd f = fdget(ufd);
+ CLASS(fd, f)(ufd);
struct bpf_prog *prog;
- prog = ____bpf_prog_get(f);
- if (IS_ERR(prog))
- return prog;
- if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) {
- prog = ERR_PTR(-EINVAL);
- goto out;
- }
+ if (fd_empty(f))
+ return ERR_PTR(-EBADF);
+ if (fd_file(f)->f_op != &bpf_prog_fops)
+ return ERR_PTR(-EINVAL);
+
+ prog = fd_file(f)->private_data;
+ if (!bpf_prog_get_ok(prog, attach_type, attach_drv))
+ return ERR_PTR(-EINVAL);
bpf_prog_inc(prog);
-out:
- fdput(f);
return prog;
}
@@ -2590,13 +2735,15 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
}
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD log_true_size
+#define BPF_PROG_LOAD_LAST_FIELD fd_array_cnt
static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
enum bpf_prog_type type = attr->prog_type;
struct bpf_prog *prog, *dst_prog = NULL;
struct btf *attach_btf = NULL;
+ struct bpf_token *token = NULL;
+ bool bpf_cap;
int err;
char license[128];
@@ -2610,13 +2757,35 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
BPF_F_TEST_RND_HI32 |
BPF_F_XDP_HAS_FRAGS |
BPF_F_XDP_DEV_BOUND_ONLY |
- BPF_F_TEST_REG_INVARIANTS))
+ BPF_F_TEST_REG_INVARIANTS |
+ BPF_F_TOKEN_FD))
return -EINVAL;
+ bpf_prog_load_fixup_attach_type(attr);
+
+ if (attr->prog_flags & BPF_F_TOKEN_FD) {
+ token = bpf_token_get_from_fd(attr->prog_token_fd);
+ if (IS_ERR(token))
+ return PTR_ERR(token);
+ /* if current token doesn't grant prog loading permissions,
+ * then we can't use this token, so ignore it and rely on
+ * system-wide capabilities checks
+ */
+ if (!bpf_token_allow_cmd(token, BPF_PROG_LOAD) ||
+ !bpf_token_allow_prog_type(token, attr->prog_type,
+ attr->expected_attach_type)) {
+ bpf_token_put(token);
+ token = NULL;
+ }
+ }
+
+ bpf_cap = bpf_token_capable(token, CAP_BPF);
+ err = -EPERM;
+
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
(attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
- !bpf_capable())
- return -EPERM;
+ !bpf_cap)
+ goto put_token;
/* Intent here is for unprivileged_bpf_disabled to block BPF program
* creation for unprivileged users; other actions depend
@@ -2625,21 +2794,23 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
* capability checks are still carried out for these
* and other operations.
*/
- if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
- return -EPERM;
+ if (sysctl_unprivileged_bpf_disabled && !bpf_cap)
+ goto put_token;
if (attr->insn_cnt == 0 ||
- attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
- return -E2BIG;
+ attr->insn_cnt > (bpf_cap ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) {
+ err = -E2BIG;
+ goto put_token;
+ }
if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
type != BPF_PROG_TYPE_CGROUP_SKB &&
- !bpf_capable())
- return -EPERM;
+ !bpf_cap)
+ goto put_token;
- if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN))
- return -EPERM;
- if (is_perfmon_prog_type(type) && !perfmon_capable())
- return -EPERM;
+ if (is_net_admin_prog_type(type) && !bpf_token_capable(token, CAP_NET_ADMIN))
+ goto put_token;
+ if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON))
+ goto put_token;
/* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
* or btf, we need to check which one it is
@@ -2649,27 +2820,33 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
if (IS_ERR(dst_prog)) {
dst_prog = NULL;
attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd);
- if (IS_ERR(attach_btf))
- return -EINVAL;
+ if (IS_ERR(attach_btf)) {
+ err = -EINVAL;
+ goto put_token;
+ }
if (!btf_is_kernel(attach_btf)) {
/* attaching through specifying bpf_prog's BTF
* objects directly might be supported eventually
*/
btf_put(attach_btf);
- return -ENOTSUPP;
+ err = -ENOTSUPP;
+ goto put_token;
}
}
} else if (attr->attach_btf_id) {
/* fall back to vmlinux BTF, if BTF type ID is specified */
attach_btf = bpf_get_btf_vmlinux();
- if (IS_ERR(attach_btf))
- return PTR_ERR(attach_btf);
- if (!attach_btf)
- return -EINVAL;
+ if (IS_ERR(attach_btf)) {
+ err = PTR_ERR(attach_btf);
+ goto put_token;
+ }
+ if (!attach_btf) {
+ err = -EINVAL;
+ goto put_token;
+ }
btf_get(attach_btf);
}
- bpf_prog_load_fixup_attach_type(attr);
if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
attach_btf, attr->attach_btf_id,
dst_prog)) {
@@ -2677,7 +2854,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
bpf_prog_put(dst_prog);
if (attach_btf)
btf_put(attach_btf);
- return -EINVAL;
+ err = -EINVAL;
+ goto put_token;
}
/* plain bpf_prog allocation */
@@ -2687,20 +2865,21 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
bpf_prog_put(dst_prog);
if (attach_btf)
btf_put(attach_btf);
- return -ENOMEM;
+ err = -EINVAL;
+ goto put_token;
}
prog->expected_attach_type = attr->expected_attach_type;
+ prog->sleepable = !!(attr->prog_flags & BPF_F_SLEEPABLE);
prog->aux->attach_btf = attach_btf;
prog->aux->attach_btf_id = attr->attach_btf_id;
prog->aux->dst_prog = dst_prog;
prog->aux->dev_bound = !!attr->prog_ifindex;
- prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
- err = security_bpf_prog_alloc(prog->aux);
- if (err)
- goto free_prog;
+ /* move token into prog->aux, reuse taken refcnt */
+ prog->aux->token = token;
+ token = NULL;
prog->aux->user = get_current_user();
prog->len = attr->insn_cnt;
@@ -2709,12 +2888,12 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
if (copy_from_bpfptr(prog->insns,
make_bpfptr(attr->insns, uattr.is_kernel),
bpf_prog_insn_size(prog)) != 0)
- goto free_prog_sec;
+ goto free_prog;
/* copy eBPF program license from user space */
if (strncpy_from_bpfptr(license,
make_bpfptr(attr->license, uattr.is_kernel),
sizeof(license) - 1) < 0)
- goto free_prog_sec;
+ goto free_prog;
license[sizeof(license) - 1] = 0;
/* eBPF programs must be GPL compatible to use GPL-ed functions */
@@ -2728,14 +2907,14 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
if (bpf_prog_is_dev_bound(prog->aux)) {
err = bpf_prog_dev_bound_init(prog, attr);
if (err)
- goto free_prog_sec;
+ goto free_prog;
}
if (type == BPF_PROG_TYPE_EXT && dst_prog &&
bpf_prog_is_dev_bound(dst_prog->aux)) {
err = bpf_prog_dev_bound_inherit(prog, dst_prog);
if (err)
- goto free_prog_sec;
+ goto free_prog;
}
/*
@@ -2757,12 +2936,16 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
/* find program type: socket_filter vs tracing_filter */
err = find_prog_type(type, prog);
if (err < 0)
- goto free_prog_sec;
+ goto free_prog;
prog->aux->load_time = ktime_get_boottime_ns();
err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
sizeof(attr->prog_name));
if (err < 0)
+ goto free_prog;
+
+ err = security_bpf_prog_load(prog, attr, token);
+ if (err)
goto free_prog_sec;
/* run eBPF verifier */
@@ -2808,13 +2991,16 @@ free_used_maps:
*/
__bpf_prog_put_noref(prog, prog->aux->real_func_cnt);
return err;
+
free_prog_sec:
- free_uid(prog->aux->user);
- security_bpf_prog_free(prog->aux);
+ security_bpf_prog_free(prog);
free_prog:
+ free_uid(prog->aux->user);
if (prog->aux->attach_btf)
btf_put(prog->aux->attach_btf);
bpf_prog_free(prog);
+put_token:
+ bpf_token_put(token);
return err;
}
@@ -2853,16 +3039,33 @@ static int bpf_obj_get(const union bpf_attr *attr)
attr->file_flags);
}
-void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
- const struct bpf_link_ops *ops, struct bpf_prog *prog)
+/* bpf_link_init_sleepable() allows to specify whether BPF link itself has
+ * "sleepable" semantics, which normally would mean that BPF link's attach
+ * hook can dereference link or link's underlying program for some time after
+ * detachment due to RCU Tasks Trace-based lifetime protection scheme.
+ * BPF program itself can be non-sleepable, yet, because it's transitively
+ * reachable through BPF link, its freeing has to be delayed until after RCU
+ * Tasks Trace GP.
+ */
+void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type,
+ const struct bpf_link_ops *ops, struct bpf_prog *prog,
+ bool sleepable)
{
+ WARN_ON(ops->dealloc && ops->dealloc_deferred);
atomic64_set(&link->refcnt, 1);
link->type = type;
+ link->sleepable = sleepable;
link->id = 0;
link->ops = ops;
link->prog = prog;
}
+void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
+ const struct bpf_link_ops *ops, struct bpf_prog *prog)
+{
+ bpf_link_init_sleepable(link, type, ops, prog, false);
+}
+
static void bpf_link_free_id(int id)
{
if (!id)
@@ -2895,17 +3098,59 @@ void bpf_link_inc(struct bpf_link *link)
atomic64_inc(&link->refcnt);
}
+static void bpf_link_dealloc(struct bpf_link *link)
+{
+ /* now that we know that bpf_link itself can't be reached, put underlying BPF program */
+ if (link->prog)
+ bpf_prog_put(link->prog);
+
+ /* free bpf_link and its containing memory */
+ if (link->ops->dealloc_deferred)
+ link->ops->dealloc_deferred(link);
+ else
+ link->ops->dealloc(link);
+}
+
+static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu)
+{
+ struct bpf_link *link = container_of(rcu, struct bpf_link, rcu);
+
+ bpf_link_dealloc(link);
+}
+
+static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
+{
+ if (rcu_trace_implies_rcu_gp())
+ bpf_link_defer_dealloc_rcu_gp(rcu);
+ else
+ call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp);
+}
+
/* bpf_link_free is guaranteed to be called from process context */
static void bpf_link_free(struct bpf_link *link)
{
+ const struct bpf_link_ops *ops = link->ops;
+
bpf_link_free_id(link->id);
- if (link->prog) {
- /* detach BPF program, clean up used resources */
- link->ops->release(link);
- bpf_prog_put(link->prog);
+ /* detach BPF program, clean up used resources */
+ if (link->prog)
+ ops->release(link);
+ if (ops->dealloc_deferred) {
+ /* Schedule BPF link deallocation, which will only then
+ * trigger putting BPF program refcount.
+ * If underlying BPF program is sleepable or BPF link's target
+ * attach hookpoint is sleepable or otherwise requires RCU GPs
+ * to ensure link and its underlying BPF program is not
+ * reachable anymore, we need to first wait for RCU tasks
+ * trace sync, and then go through "classic" RCU grace period
+ */
+ if (link->sleepable || (link->prog && link->prog->sleepable))
+ call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
+ else
+ call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
+ } else if (ops->dealloc) {
+ bpf_link_dealloc(link);
}
- /* free bpf_link and its containing memory */
- link->ops->dealloc(link);
}
static void bpf_link_put_deferred(struct work_struct *work)
@@ -2959,13 +3204,17 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_link *link = filp->private_data;
const struct bpf_prog *prog = link->prog;
+ enum bpf_link_type type = link->type;
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
- seq_printf(m,
- "link_type:\t%s\n"
- "link_id:\t%u\n",
- bpf_link_type_strs[link->type],
- link->id);
+ if (type < ARRAY_SIZE(bpf_link_type_strs) && bpf_link_type_strs[type]) {
+ seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]);
+ } else {
+ WARN_ONCE(1, "missing BPF_LINK_TYPE(...) for link type %u\n", type);
+ seq_printf(m, "link_type:\t<%u>\n", type);
+ }
+ seq_printf(m, "link_id:\t%u\n", link->id);
+
if (prog) {
bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
seq_printf(m,
@@ -2979,6 +3228,13 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
}
#endif
+static __poll_t bpf_link_poll(struct file *file, struct poll_table_struct *pts)
+{
+ struct bpf_link *link = file->private_data;
+
+ return link->ops->poll(file, pts);
+}
+
static const struct file_operations bpf_link_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_link_show_fdinfo,
@@ -2988,6 +3244,16 @@ static const struct file_operations bpf_link_fops = {
.write = bpf_dummy_write,
};
+static const struct file_operations bpf_link_fops_poll = {
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = bpf_link_show_fdinfo,
+#endif
+ .release = bpf_link_release,
+ .read = bpf_dummy_read,
+ .write = bpf_dummy_write,
+ .poll = bpf_link_poll,
+};
+
static int bpf_link_alloc_id(struct bpf_link *link)
{
int id;
@@ -3030,7 +3296,9 @@ int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer)
return id;
}
- file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC);
+ file = anon_inode_getfile("bpf_link",
+ link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops,
+ link, O_CLOEXEC);
if (IS_ERR(file)) {
bpf_link_free_id(id);
put_unused_fd(fd);
@@ -3058,25 +3326,23 @@ int bpf_link_settle(struct bpf_link_primer *primer)
int bpf_link_new_fd(struct bpf_link *link)
{
- return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
+ return anon_inode_getfd("bpf-link",
+ link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops,
+ link, O_CLOEXEC);
}
struct bpf_link *bpf_link_get_from_fd(u32 ufd)
{
- struct fd f = fdget(ufd);
+ CLASS(fd, f)(ufd);
struct bpf_link *link;
- if (!f.file)
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
- if (f.file->f_op != &bpf_link_fops) {
- fdput(f);
+ if (fd_file(f)->f_op != &bpf_link_fops && fd_file(f)->f_op != &bpf_link_fops_poll)
return ERR_PTR(-EINVAL);
- }
- link = f.file->private_data;
+ link = fd_file(f)->private_data;
bpf_link_inc(link);
- fdput(f);
-
return link;
}
EXPORT_SYMBOL(bpf_link_get_from_fd);
@@ -3087,7 +3353,8 @@ static void bpf_tracing_link_release(struct bpf_link *link)
container_of(link, struct bpf_tracing_link, link.link);
WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link,
- tr_link->trampoline));
+ tr_link->trampoline,
+ tr_link->tgt_prog));
bpf_trampoline_put(tr_link->trampoline);
@@ -3227,7 +3494,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
* in prog->aux
*
* - if prog->aux->dst_trampoline is NULL, the program has already been
- * attached to a target and its initial target was cleared (below)
+ * attached to a target and its initial target was cleared (below)
*
* - if tgt_prog != NULL, the caller specified tgt_prog_fd +
* target_btf_id using the link_create API.
@@ -3302,7 +3569,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
if (err)
goto out_unlock;
- err = bpf_trampoline_link_prog(&link->link, tr);
+ err = bpf_trampoline_link_prog(&link->link, tr, tgt_prog);
if (err) {
bpf_link_cleanup(&link_primer);
link = NULL;
@@ -3340,17 +3607,12 @@ out_put_prog:
return err;
}
-struct bpf_raw_tp_link {
- struct bpf_link link;
- struct bpf_raw_event_map *btp;
-};
-
static void bpf_raw_tp_link_release(struct bpf_link *link)
{
struct bpf_raw_tp_link *raw_tp =
container_of(link, struct bpf_raw_tp_link, link);
- bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog);
+ bpf_probe_unregister(raw_tp->btp, raw_tp);
bpf_put_raw_tracepoint(raw_tp->btp);
}
@@ -3415,7 +3677,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
static const struct bpf_link_ops bpf_raw_tp_link_lops = {
.release = bpf_raw_tp_link_release,
- .dealloc = bpf_raw_tp_link_dealloc,
+ .dealloc_deferred = bpf_raw_tp_link_dealloc,
.show_fdinfo = bpf_raw_tp_link_show_fdinfo,
.fill_link_info = bpf_raw_tp_link_fill_link_info,
};
@@ -3443,15 +3705,16 @@ static void bpf_perf_link_dealloc(struct bpf_link *link)
}
static int bpf_perf_link_fill_common(const struct perf_event *event,
- char __user *uname, u32 ulen,
+ char __user *uname, u32 *ulenp,
u64 *probe_offset, u64 *probe_addr,
u32 *fd_type, unsigned long *missed)
{
const char *buf;
- u32 prog_id;
+ u32 prog_id, ulen;
size_t len;
int err;
+ ulen = *ulenp;
if (!ulen ^ !uname)
return -EINVAL;
@@ -3459,10 +3722,17 @@ static int bpf_perf_link_fill_common(const struct perf_event *event,
probe_offset, probe_addr, missed);
if (err)
return err;
+
+ if (buf) {
+ len = strlen(buf);
+ *ulenp = len + 1;
+ } else {
+ *ulenp = 1;
+ }
if (!uname)
return 0;
+
if (buf) {
- len = strlen(buf);
err = bpf_copy_to_user(uname, buf, ulen, len);
if (err)
return err;
@@ -3487,7 +3757,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
uname = u64_to_user_ptr(info->perf_event.kprobe.func_name);
ulen = info->perf_event.kprobe.name_len;
- err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
+ err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr,
&type, &missed);
if (err)
return err;
@@ -3495,12 +3765,13 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
info->perf_event.type = BPF_PERF_EVENT_KRETPROBE;
else
info->perf_event.type = BPF_PERF_EVENT_KPROBE;
-
+ info->perf_event.kprobe.name_len = ulen;
info->perf_event.kprobe.offset = offset;
info->perf_event.kprobe.missed = missed;
if (!kallsyms_show_value(current_cred()))
addr = 0;
info->perf_event.kprobe.addr = addr;
+ info->perf_event.kprobe.cookie = event->bpf_cookie;
return 0;
}
#endif
@@ -3516,7 +3787,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
uname = u64_to_user_ptr(info->perf_event.uprobe.file_name);
ulen = info->perf_event.uprobe.name_len;
- err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
+ err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr,
&type, NULL);
if (err)
return err;
@@ -3525,7 +3796,9 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
info->perf_event.type = BPF_PERF_EVENT_URETPROBE;
else
info->perf_event.type = BPF_PERF_EVENT_UPROBE;
+ info->perf_event.uprobe.name_len = ulen;
info->perf_event.uprobe.offset = offset;
+ info->perf_event.uprobe.cookie = event->bpf_cookie;
return 0;
}
#endif
@@ -3549,11 +3822,18 @@ static int bpf_perf_link_fill_tracepoint(const struct perf_event *event,
{
char __user *uname;
u32 ulen;
+ int err;
uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name);
ulen = info->perf_event.tracepoint.name_len;
+ err = bpf_perf_link_fill_common(event, uname, &ulen, NULL, NULL, NULL, NULL);
+ if (err)
+ return err;
+
info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT;
- return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL, NULL);
+ info->perf_event.tracepoint.name_len = ulen;
+ info->perf_event.tracepoint.cookie = event->bpf_cookie;
+ return 0;
}
static int bpf_perf_link_fill_perf_event(const struct perf_event *event,
@@ -3561,6 +3841,7 @@ static int bpf_perf_link_fill_perf_event(const struct perf_event *event,
{
info->perf_event.event.type = event->attr.type;
info->perf_event.event.config = event->attr.config;
+ info->perf_event.event.cookie = event->bpf_cookie;
info->perf_event.type = BPF_PERF_EVENT_EVENT;
return 0;
}
@@ -3646,7 +3927,7 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro
#endif /* CONFIG_PERF_EVENTS */
static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
- const char __user *user_tp_name)
+ const char __user *user_tp_name, u64 cookie)
{
struct bpf_link_primer link_primer;
struct bpf_raw_tp_link *link;
@@ -3690,9 +3971,11 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
err = -ENOMEM;
goto out_put_btp;
}
- bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
- &bpf_raw_tp_link_lops, prog);
+ bpf_link_init_sleepable(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
+ &bpf_raw_tp_link_lops, prog,
+ tracepoint_is_faultable(btp->tp));
link->btp = btp;
+ link->cookie = cookie;
err = bpf_link_prime(&link->link, &link_primer);
if (err) {
@@ -3700,7 +3983,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
goto out_put_btp;
}
- err = bpf_probe_register(link->btp, prog);
+ err = bpf_probe_register(link->btp, link);
if (err) {
bpf_link_cleanup(&link_primer);
goto out_put_btp;
@@ -3713,11 +3996,13 @@ out_put_btp:
return err;
}
-#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
+#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.cookie
static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
{
struct bpf_prog *prog;
+ void __user *tp_name;
+ __u64 cookie;
int fd;
if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
@@ -3727,7 +4012,9 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
if (IS_ERR(prog))
return PTR_ERR(prog);
- fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name));
+ tp_name = u64_to_user_ptr(attr->raw_tracepoint.name);
+ cookie = attr->raw_tracepoint.cookie;
+ fd = bpf_raw_tp_link_attach(prog, tp_name, cookie);
if (fd < 0)
bpf_prog_put(prog);
return fd;
@@ -3818,11 +4105,16 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
case BPF_PROG_TYPE_SK_LOOKUP:
return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
case BPF_PROG_TYPE_CGROUP_SKB:
- if (!capable(CAP_NET_ADMIN))
+ if (!bpf_token_capable(prog->aux->token, CAP_NET_ADMIN))
/* cg-skb progs can be loaded by unpriv user.
* check permissions at attach time.
*/
return -EPERM;
+
+ ptype = attach_type_to_prog_type(attach_type);
+ if (prog->type != ptype)
+ return -EINVAL;
+
return prog->enforce_expected_attach_type &&
prog->expected_attach_type != attach_type ?
-EINVAL : 0;
@@ -3841,12 +4133,20 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI &&
attach_type != BPF_TRACE_KPROBE_MULTI)
return -EINVAL;
+ if (prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION &&
+ attach_type != BPF_TRACE_KPROBE_SESSION)
+ return -EINVAL;
if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI &&
attach_type != BPF_TRACE_UPROBE_MULTI)
return -EINVAL;
+ if (prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION &&
+ attach_type != BPF_TRACE_UPROBE_SESSION)
+ return -EINVAL;
if (attach_type != BPF_PERF_EVENT &&
attach_type != BPF_TRACE_KPROBE_MULTI &&
- attach_type != BPF_TRACE_UPROBE_MULTI)
+ attach_type != BPF_TRACE_KPROBE_SESSION &&
+ attach_type != BPF_TRACE_UPROBE_MULTI &&
+ attach_type != BPF_TRACE_UPROBE_SESSION)
return -EINVAL;
return 0;
case BPF_PROG_TYPE_SCHED_CLS:
@@ -4021,7 +4321,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
static int bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
- if (!capable(CAP_NET_ADMIN))
+ if (!bpf_net_capable())
return -EPERM;
if (CHECK_ATTR(BPF_PROG_QUERY))
return -EINVAL;
@@ -4320,6 +4620,12 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog,
continue;
}
+ if ((BPF_CLASS(code) == BPF_LDX || BPF_CLASS(code) == BPF_STX ||
+ BPF_CLASS(code) == BPF_ST) && BPF_MODE(code) == BPF_PROBE_MEM32) {
+ insns[i].code = BPF_CLASS(code) | BPF_SIZE(code) | BPF_MEM;
+ continue;
+ }
+
if (code != (BPF_LD | BPF_IMM | BPF_DW))
continue;
@@ -4687,6 +4993,8 @@ static int bpf_map_get_info_by_fd(struct file *file,
info.btf_value_type_id = map->btf_value_type_id;
}
info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
+ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS)
+ bpf_map_struct_ops_info_fill(&info, map);
if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_info_fill(&info, map);
@@ -4760,44 +5068,55 @@ static int bpf_link_get_info_by_fd(struct file *file,
static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
- int ufd = attr->info.bpf_fd;
- struct fd f;
- int err;
-
if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
return -EINVAL;
- f = fdget(ufd);
- if (!f.file)
+ CLASS(fd, f)(attr->info.bpf_fd);
+ if (fd_empty(f))
return -EBADFD;
- if (f.file->f_op == &bpf_prog_fops)
- err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr,
+ if (fd_file(f)->f_op == &bpf_prog_fops)
+ return bpf_prog_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr,
uattr);
- else if (f.file->f_op == &bpf_map_fops)
- err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr,
+ else if (fd_file(f)->f_op == &bpf_map_fops)
+ return bpf_map_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr,
uattr);
- else if (f.file->f_op == &btf_fops)
- err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr);
- else if (f.file->f_op == &bpf_link_fops)
- err = bpf_link_get_info_by_fd(f.file, f.file->private_data,
+ else if (fd_file(f)->f_op == &btf_fops)
+ return bpf_btf_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr);
+ else if (fd_file(f)->f_op == &bpf_link_fops || fd_file(f)->f_op == &bpf_link_fops_poll)
+ return bpf_link_get_info_by_fd(fd_file(f), fd_file(f)->private_data,
attr, uattr);
- else
- err = -EINVAL;
-
- fdput(f);
- return err;
+ return -EINVAL;
}
-#define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size
+#define BPF_BTF_LOAD_LAST_FIELD btf_token_fd
static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
{
+ struct bpf_token *token = NULL;
+
if (CHECK_ATTR(BPF_BTF_LOAD))
return -EINVAL;
- if (!bpf_capable())
+ if (attr->btf_flags & ~BPF_F_TOKEN_FD)
+ return -EINVAL;
+
+ if (attr->btf_flags & BPF_F_TOKEN_FD) {
+ token = bpf_token_get_from_fd(attr->btf_token_fd);
+ if (IS_ERR(token))
+ return PTR_ERR(token);
+ if (!bpf_token_allow_cmd(token, BPF_BTF_LOAD)) {
+ bpf_token_put(token);
+ token = NULL;
+ }
+ }
+
+ if (!bpf_token_capable(token, CAP_BPF)) {
+ bpf_token_put(token);
return -EPERM;
+ }
+
+ bpf_token_put(token);
return btf_new_fd(attr, uattr, uattr_size);
}
@@ -4895,7 +5214,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
if (!file)
return -EBADF;
- if (file->f_op == &bpf_link_fops) {
+ if (file->f_op == &bpf_link_fops || file->f_op == &bpf_link_fops_poll) {
struct bpf_link *link = file->private_data;
if (link->ops == &bpf_raw_tp_link_lops) {
@@ -4955,14 +5274,13 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
bool has_write = cmd != BPF_MAP_LOOKUP_BATCH;
struct bpf_map *map;
- int err, ufd;
- struct fd f;
+ int err;
if (CHECK_ATTR(BPF_MAP_BATCH))
return -EINVAL;
- ufd = attr->batch.map_fd;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->batch.map_fd);
+
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -4982,7 +5300,7 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr);
else if (cmd == BPF_MAP_UPDATE_BATCH)
- BPF_DO_BATCH(map->ops->map_update_batch, map, f.file, attr, uattr);
+ BPF_DO_BATCH(map->ops->map_update_batch, map, fd_file(f), attr, uattr);
else
BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr);
err_put:
@@ -4990,7 +5308,6 @@ err_put:
maybe_wait_bpf_programs(map);
bpf_map_write_active_dec(map);
}
- fdput(f);
return err;
}
@@ -5038,7 +5355,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
goto out;
}
if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
- ret = bpf_raw_tp_link_attach(prog, NULL);
+ ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie);
else if (prog->expected_attach_type == BPF_TRACE_ITER)
ret = bpf_iter_link_attach(attr, uattr, prog);
else if (prog->expected_attach_type == BPF_LSM_CGROUP)
@@ -5053,6 +5370,10 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
case BPF_PROG_TYPE_SK_LOOKUP:
ret = netns_bpf_link_create(attr, prog);
break;
+ case BPF_PROG_TYPE_SK_MSG:
+ case BPF_PROG_TYPE_SK_SKB:
+ ret = sock_map_link_create(attr, prog);
+ break;
#ifdef CONFIG_NET
case BPF_PROG_TYPE_XDP:
ret = bpf_xdp_link_attach(attr, prog);
@@ -5075,9 +5396,11 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
case BPF_PROG_TYPE_KPROBE:
if (attr->link_create.attach_type == BPF_PERF_EVENT)
ret = bpf_perf_link_attach(attr, prog);
- else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI)
+ else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI ||
+ attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION)
ret = bpf_kprobe_multi_link_attach(attr, prog);
- else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI)
+ else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI ||
+ attr->link_create.attach_type == BPF_TRACE_UPROBE_SESSION)
ret = bpf_uprobe_multi_link_attach(attr, prog);
break;
default:
@@ -5200,10 +5523,11 @@ static int link_detach(union bpf_attr *attr)
return ret;
}
-static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
+struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
{
return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT);
}
+EXPORT_SYMBOL(bpf_link_inc_not_zero);
struct bpf_link *bpf_link_by_id(u32 id)
{
@@ -5394,7 +5718,7 @@ static int bpf_prog_bind_map(union bpf_attr *attr)
/* The bpf program will not access the bpf map, but for the sake of
* simplicity, increase sleepable_refcnt for sleepable program as well.
*/
- if (prog->aux->sleepable)
+ if (prog->sleepable)
atomic64_inc(&map->sleepable_refcnt);
memcpy(used_maps_new, used_maps_old,
sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
@@ -5415,7 +5739,21 @@ out_prog_put:
return ret;
}
-static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
+#define BPF_TOKEN_CREATE_LAST_FIELD token_create.bpffs_fd
+
+static int token_create(union bpf_attr *attr)
+{
+ if (CHECK_ATTR(BPF_TOKEN_CREATE))
+ return -EINVAL;
+
+ /* no flags are supported yet */
+ if (attr->token_create.flags)
+ return -EINVAL;
+
+ return bpf_token_create(attr);
+}
+
+static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
{
union bpf_attr attr;
int err;
@@ -5548,6 +5886,9 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
case BPF_PROG_BIND_MAP:
err = bpf_prog_bind_map(&attr);
break;
+ case BPF_TOKEN_CREATE:
+ err = token_create(&attr);
+ break;
default:
err = -EINVAL;
break;
@@ -5654,7 +5995,7 @@ static const struct bpf_func_proto bpf_sys_bpf_proto = {
const struct bpf_func_proto * __weak
tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
- return bpf_base_func_proto(func_id);
+ return bpf_base_func_proto(func_id, prog);
}
BPF_CALL_1(bpf_sys_close, u32, fd)
@@ -5676,6 +6017,7 @@ static const struct bpf_func_proto bpf_sys_close_proto = {
BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res)
{
+ *res = 0;
if (flags)
return -EINVAL;
@@ -5696,7 +6038,8 @@ static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
.arg1_type = ARG_PTR_TO_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_LONG,
+ .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
+ .arg4_size = sizeof(u64),
};
static const struct bpf_func_proto *
@@ -5704,7 +6047,8 @@ syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_sys_bpf:
- return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto;
+ return !bpf_token_capable(prog->aux->token, CAP_PERFMON)
+ ? NULL : &bpf_sys_bpf_proto;
case BPF_FUNC_btf_find_by_name_kind:
return &bpf_btf_find_by_name_kind_proto;
case BPF_FUNC_sys_close:
@@ -5726,7 +6070,7 @@ const struct bpf_prog_ops bpf_syscall_prog_ops = {
};
#ifdef CONFIG_SYSCTL
-static int bpf_stats_handler(struct ctl_table *table, int write,
+static int bpf_stats_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct static_key *key = (struct static_key *)table->data;
@@ -5761,7 +6105,7 @@ void __weak unpriv_ebpf_notify(int new_state)
{
}
-static int bpf_unpriv_handler(struct ctl_table *table, int write,
+static int bpf_unpriv_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret, unpriv_enable = *(int *)table->data;
@@ -5785,7 +6129,7 @@ static int bpf_unpriv_handler(struct ctl_table *table, int write,
return ret;
}
-static struct ctl_table bpf_syscall_table[] = {
+static const struct ctl_table bpf_syscall_table[] = {
{
.procname = "unprivileged_bpf_disabled",
.data = &sysctl_unprivileged_bpf_disabled,
@@ -5801,7 +6145,6 @@ static struct ctl_table bpf_syscall_table[] = {
.mode = 0644,
.proc_handler = bpf_stats_handler,
},
- { }
};
static int __init bpf_syscall_sysctl_init(void)
diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
index ef6911aee3bb..81d6cf90584a 100644
--- a/kernel/bpf/sysfs_btf.c
+++ b/kernel/bpf/sysfs_btf.c
@@ -9,30 +9,22 @@
#include <linux/sysfs.h>
/* See scripts/link-vmlinux.sh, gen_btf() func for details */
-extern char __weak __start_BTF[];
-extern char __weak __stop_BTF[];
-
-static ssize_t
-btf_vmlinux_read(struct file *file, struct kobject *kobj,
- struct bin_attribute *bin_attr,
- char *buf, loff_t off, size_t len)
-{
- memcpy(buf, __start_BTF + off, len);
- return len;
-}
+extern char __start_BTF[];
+extern char __stop_BTF[];
static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = {
.attr = { .name = "vmlinux", .mode = 0444, },
- .read = btf_vmlinux_read,
+ .read_new = sysfs_bin_attr_simple_read,
};
struct kobject *btf_kobj;
static int __init btf_vmlinux_init(void)
{
+ bin_attr_btf_vmlinux.private = __start_BTF;
bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF;
- if (!__start_BTF || bin_attr_btf_vmlinux.size == 0)
+ if (bin_attr_btf_vmlinux.size == 0)
return 0;
btf_kobj = kobject_create_and_add("btf", kernel_kobj);
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index ec4e97c61eef..98d9b4c0daff 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -5,7 +5,6 @@
#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/fs.h>
-#include <linux/fdtable.h>
#include <linux/filter.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/btf_ids.h>
@@ -99,7 +98,7 @@ static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *co
rcu_read_lock();
pid = find_pid_ns(common->pid, common->ns);
if (pid) {
- task = get_pid_task(pid, PIDTYPE_TGID);
+ task = get_pid_task(pid, PIDTYPE_PID);
*tid = common->pid;
}
rcu_read_unlock();
@@ -261,6 +260,7 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
u32 saved_tid = info->tid;
struct task_struct *curr_task;
unsigned int curr_fd = info->fd;
+ struct file *f;
/* If this function returns a non-NULL file object,
* it held a reference to the task/file.
@@ -285,21 +285,14 @@ again:
curr_fd = 0;
}
- rcu_read_lock();
- for (;; curr_fd++) {
- struct file *f;
- f = task_lookup_next_fdget_rcu(curr_task, &curr_fd);
- if (!f)
- break;
-
+ f = fget_task_next(curr_task, &curr_fd);
+ if (f) {
/* set info->fd */
info->fd = curr_fd;
- rcu_read_unlock();
return f;
}
/* the current task is done, go to the next task */
- rcu_read_unlock();
put_task_struct(curr_task);
if (info->common.type == BPF_TASK_ITER_TID) {
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
new file mode 100644
index 000000000000..26057aa13503
--- /dev/null
+++ b/kernel/bpf/token.c
@@ -0,0 +1,255 @@
+#include <linux/bpf.h>
+#include <linux/vmalloc.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/idr.h>
+#include <linux/namei.h>
+#include <linux/user_namespace.h>
+#include <linux/security.h>
+
+static bool bpf_ns_capable(struct user_namespace *ns, int cap)
+{
+ return ns_capable(ns, cap) || (cap != CAP_SYS_ADMIN && ns_capable(ns, CAP_SYS_ADMIN));
+}
+
+bool bpf_token_capable(const struct bpf_token *token, int cap)
+{
+ struct user_namespace *userns;
+
+ /* BPF token allows ns_capable() level of capabilities */
+ userns = token ? token->userns : &init_user_ns;
+ if (!bpf_ns_capable(userns, cap))
+ return false;
+ if (token && security_bpf_token_capable(token, cap) < 0)
+ return false;
+ return true;
+}
+
+void bpf_token_inc(struct bpf_token *token)
+{
+ atomic64_inc(&token->refcnt);
+}
+
+static void bpf_token_free(struct bpf_token *token)
+{
+ security_bpf_token_free(token);
+ put_user_ns(token->userns);
+ kfree(token);
+}
+
+static void bpf_token_put_deferred(struct work_struct *work)
+{
+ struct bpf_token *token = container_of(work, struct bpf_token, work);
+
+ bpf_token_free(token);
+}
+
+void bpf_token_put(struct bpf_token *token)
+{
+ if (!token)
+ return;
+
+ if (!atomic64_dec_and_test(&token->refcnt))
+ return;
+
+ INIT_WORK(&token->work, bpf_token_put_deferred);
+ schedule_work(&token->work);
+}
+
+static int bpf_token_release(struct inode *inode, struct file *filp)
+{
+ struct bpf_token *token = filp->private_data;
+
+ bpf_token_put(token);
+ return 0;
+}
+
+static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+ struct bpf_token *token = filp->private_data;
+ u64 mask;
+
+ BUILD_BUG_ON(__MAX_BPF_CMD >= 64);
+ mask = BIT_ULL(__MAX_BPF_CMD) - 1;
+ if ((token->allowed_cmds & mask) == mask)
+ seq_printf(m, "allowed_cmds:\tany\n");
+ else
+ seq_printf(m, "allowed_cmds:\t0x%llx\n", token->allowed_cmds);
+
+ BUILD_BUG_ON(__MAX_BPF_MAP_TYPE >= 64);
+ mask = BIT_ULL(__MAX_BPF_MAP_TYPE) - 1;
+ if ((token->allowed_maps & mask) == mask)
+ seq_printf(m, "allowed_maps:\tany\n");
+ else
+ seq_printf(m, "allowed_maps:\t0x%llx\n", token->allowed_maps);
+
+ BUILD_BUG_ON(__MAX_BPF_PROG_TYPE >= 64);
+ mask = BIT_ULL(__MAX_BPF_PROG_TYPE) - 1;
+ if ((token->allowed_progs & mask) == mask)
+ seq_printf(m, "allowed_progs:\tany\n");
+ else
+ seq_printf(m, "allowed_progs:\t0x%llx\n", token->allowed_progs);
+
+ BUILD_BUG_ON(__MAX_BPF_ATTACH_TYPE >= 64);
+ mask = BIT_ULL(__MAX_BPF_ATTACH_TYPE) - 1;
+ if ((token->allowed_attachs & mask) == mask)
+ seq_printf(m, "allowed_attachs:\tany\n");
+ else
+ seq_printf(m, "allowed_attachs:\t0x%llx\n", token->allowed_attachs);
+}
+
+#define BPF_TOKEN_INODE_NAME "bpf-token"
+
+static const struct inode_operations bpf_token_iops = { };
+
+static const struct file_operations bpf_token_fops = {
+ .release = bpf_token_release,
+ .show_fdinfo = bpf_token_show_fdinfo,
+};
+
+int bpf_token_create(union bpf_attr *attr)
+{
+ struct bpf_mount_opts *mnt_opts;
+ struct bpf_token *token = NULL;
+ struct user_namespace *userns;
+ struct inode *inode;
+ struct file *file;
+ CLASS(fd, f)(attr->token_create.bpffs_fd);
+ struct path path;
+ struct super_block *sb;
+ umode_t mode;
+ int err, fd;
+
+ if (fd_empty(f))
+ return -EBADF;
+
+ path = fd_file(f)->f_path;
+ sb = path.dentry->d_sb;
+
+ if (path.dentry != sb->s_root)
+ return -EINVAL;
+ if (sb->s_op != &bpf_super_ops)
+ return -EINVAL;
+ err = path_permission(&path, MAY_ACCESS);
+ if (err)
+ return err;
+
+ userns = sb->s_user_ns;
+ /*
+ * Enforce that creators of BPF tokens are in the same user
+ * namespace as the BPF FS instance. This makes reasoning about
+ * permissions a lot easier and we can always relax this later.
+ */
+ if (current_user_ns() != userns)
+ return -EPERM;
+ if (!ns_capable(userns, CAP_BPF))
+ return -EPERM;
+
+ /* Creating BPF token in init_user_ns doesn't make much sense. */
+ if (current_user_ns() == &init_user_ns)
+ return -EOPNOTSUPP;
+
+ mnt_opts = sb->s_fs_info;
+ if (mnt_opts->delegate_cmds == 0 &&
+ mnt_opts->delegate_maps == 0 &&
+ mnt_opts->delegate_progs == 0 &&
+ mnt_opts->delegate_attachs == 0)
+ return -ENOENT; /* no BPF token delegation is set up */
+
+ mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
+ inode = bpf_get_inode(sb, NULL, mode);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ inode->i_op = &bpf_token_iops;
+ inode->i_fop = &bpf_token_fops;
+ clear_nlink(inode); /* make sure it is unlinked */
+
+ file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops);
+ if (IS_ERR(file)) {
+ iput(inode);
+ return PTR_ERR(file);
+ }
+
+ token = kzalloc(sizeof(*token), GFP_USER);
+ if (!token) {
+ err = -ENOMEM;
+ goto out_file;
+ }
+
+ atomic64_set(&token->refcnt, 1);
+
+ /* remember bpffs owning userns for future ns_capable() checks */
+ token->userns = get_user_ns(userns);
+
+ token->allowed_cmds = mnt_opts->delegate_cmds;
+ token->allowed_maps = mnt_opts->delegate_maps;
+ token->allowed_progs = mnt_opts->delegate_progs;
+ token->allowed_attachs = mnt_opts->delegate_attachs;
+
+ err = security_bpf_token_create(token, attr, &path);
+ if (err)
+ goto out_token;
+
+ fd = get_unused_fd_flags(O_CLOEXEC);
+ if (fd < 0) {
+ err = fd;
+ goto out_token;
+ }
+
+ file->private_data = token;
+ fd_install(fd, file);
+
+ return fd;
+
+out_token:
+ bpf_token_free(token);
+out_file:
+ fput(file);
+ return err;
+}
+
+struct bpf_token *bpf_token_get_from_fd(u32 ufd)
+{
+ CLASS(fd, f)(ufd);
+ struct bpf_token *token;
+
+ if (fd_empty(f))
+ return ERR_PTR(-EBADF);
+ if (fd_file(f)->f_op != &bpf_token_fops)
+ return ERR_PTR(-EINVAL);
+
+ token = fd_file(f)->private_data;
+ bpf_token_inc(token);
+
+ return token;
+}
+
+bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
+{
+ if (!token)
+ return false;
+ if (!(token->allowed_cmds & BIT_ULL(cmd)))
+ return false;
+ return security_bpf_token_cmd(token, cmd) == 0;
+}
+
+bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type)
+{
+ if (!token || type >= __MAX_BPF_MAP_TYPE)
+ return false;
+
+ return token->allowed_maps & BIT_ULL(type);
+}
+
+bool bpf_token_allow_prog_type(const struct bpf_token *token,
+ enum bpf_prog_type prog_type,
+ enum bpf_attach_type attach_type)
+{
+ if (!token || prog_type >= __MAX_BPF_PROG_TYPE || attach_type >= __MAX_BPF_ATTACH_TYPE)
+ return false;
+
+ return (token->allowed_progs & BIT_ULL(prog_type)) &&
+ (token->allowed_attachs & BIT_ULL(attach_type));
+}
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index d382f5ebe06c..c4b1a98ff726 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -115,10 +115,14 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
(ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
}
-void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym)
+void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym)
{
ksym->start = (unsigned long) data;
ksym->end = ksym->start + size;
+}
+
+void bpf_image_ksym_add(struct bpf_ksym *ksym)
+{
bpf_ksym_add(ksym);
perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
PAGE_SIZE, false, ksym->name);
@@ -333,7 +337,7 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im)
int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP,
NULL, im->ip_epilogue);
WARN_ON(err);
- if (IS_ENABLED(CONFIG_PREEMPTION))
+ if (IS_ENABLED(CONFIG_TASKS_RCU))
call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
else
percpu_ref_kill(&im->pcref);
@@ -377,7 +381,8 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, int size)
ksym = &im->ksym;
INIT_LIST_HEAD_RCU(&ksym->lnode);
snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key);
- bpf_image_ksym_add(image, size, ksym);
+ bpf_image_ksym_init(image, size, ksym);
+ bpf_image_ksym_add(ksym);
return im;
out_free_image:
@@ -456,7 +461,9 @@ again:
if (err < 0)
goto out_free;
- arch_protect_bpf_trampoline(im->image, im->size);
+ err = arch_protect_bpf_trampoline(im->image, im->size);
+ if (err)
+ goto out_free;
WARN_ON(tr->cur_image && total == 0);
if (tr->cur_image)
@@ -521,7 +528,27 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
}
}
-static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+static int bpf_freplace_check_tgt_prog(struct bpf_prog *tgt_prog)
+{
+ struct bpf_prog_aux *aux = tgt_prog->aux;
+
+ guard(mutex)(&aux->ext_mutex);
+ if (aux->prog_array_member_cnt)
+ /* Program extensions can not extend target prog when the target
+ * prog has been updated to any prog_array map as tail callee.
+ * It's to prevent a potential infinite loop like:
+ * tgt prog entry -> tgt prog subprog -> freplace prog entry
+ * --tailcall-> tgt prog entry.
+ */
+ return -EBUSY;
+
+ aux->is_extended = true;
+ return 0;
+}
+
+static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link,
+ struct bpf_trampoline *tr,
+ struct bpf_prog *tgt_prog)
{
enum bpf_tramp_prog_type kind;
struct bpf_tramp_link *link_exiting;
@@ -542,6 +569,9 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_tr
/* Cannot attach extension if fentry/fexit are in use. */
if (cnt)
return -EBUSY;
+ err = bpf_freplace_check_tgt_prog(tgt_prog);
+ if (err)
+ return err;
tr->extension_prog = link->link.prog;
return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
link->link.prog->bpf_func);
@@ -568,17 +598,21 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_tr
return err;
}
-int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+int bpf_trampoline_link_prog(struct bpf_tramp_link *link,
+ struct bpf_trampoline *tr,
+ struct bpf_prog *tgt_prog)
{
int err;
mutex_lock(&tr->mutex);
- err = __bpf_trampoline_link_prog(link, tr);
+ err = __bpf_trampoline_link_prog(link, tr, tgt_prog);
mutex_unlock(&tr->mutex);
return err;
}
-static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
+ struct bpf_trampoline *tr,
+ struct bpf_prog *tgt_prog)
{
enum bpf_tramp_prog_type kind;
int err;
@@ -589,6 +623,8 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_
err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
tr->extension_prog->bpf_func, NULL);
tr->extension_prog = NULL;
+ guard(mutex)(&tgt_prog->aux->ext_mutex);
+ tgt_prog->aux->is_extended = false;
return err;
}
hlist_del_init(&link->tramp_hlist);
@@ -597,12 +633,14 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_
}
/* bpf_trampoline_unlink_prog() should never fail. */
-int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
+ struct bpf_trampoline *tr,
+ struct bpf_prog *tgt_prog)
{
int err;
mutex_lock(&tr->mutex);
- err = __bpf_trampoline_unlink_prog(link, tr);
+ err = __bpf_trampoline_unlink_prog(link, tr, tgt_prog);
mutex_unlock(&tr->mutex);
return err;
}
@@ -617,7 +655,7 @@ static void bpf_shim_tramp_link_release(struct bpf_link *link)
if (!shim_link->trampoline)
return;
- WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline));
+ WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline, NULL));
bpf_trampoline_put(shim_link->trampoline);
}
@@ -731,7 +769,7 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
goto err;
}
- err = __bpf_trampoline_link_prog(&shim_link->link, tr);
+ err = __bpf_trampoline_link_prog(&shim_link->link, tr, NULL);
if (err)
goto err;
@@ -866,6 +904,8 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram
if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
bpf_prog_inc_misses_counter(prog);
+ if (prog->aux->recursion_detected)
+ prog->aux->recursion_detected(prog);
return 0;
}
return bpf_prog_start_time();
@@ -883,12 +923,13 @@ static void notrace update_prog_stats(struct bpf_prog *prog,
* Hence check that 'start' is valid.
*/
start > NO_START_TIME) {
+ u64 duration = sched_clock() - start;
unsigned long flags;
stats = this_cpu_ptr(prog->stats);
flags = u64_stats_update_begin_irqsave(&stats->syncp);
u64_stats_inc(&stats->cnt);
- u64_stats_add(&stats->nsecs, sched_clock() - start);
+ u64_stats_add(&stats->nsecs, duration);
u64_stats_update_end_irqrestore(&stats->syncp, flags);
}
}
@@ -941,6 +982,8 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
bpf_prog_inc_misses_counter(prog);
+ if (prog->aux->recursion_detected)
+ prog->aux->recursion_detected(prog);
return 0;
}
return bpf_prog_start_time();
@@ -1014,7 +1057,7 @@ void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr)
bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog)
{
- bool sleepable = prog->aux->sleepable;
+ bool sleepable = prog->sleepable;
if (bpf_prog_check_recur(prog))
return sleepable ? __bpf_prog_enter_sleepable_recur :
@@ -1029,7 +1072,7 @@ bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog)
bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog)
{
- bool sleepable = prog->aux->sleepable;
+ bool sleepable = prog->sleepable;
if (bpf_prog_check_recur(prog))
return sleepable ? __bpf_prog_exit_sleepable_recur :
@@ -1072,17 +1115,10 @@ void __weak arch_free_bpf_trampoline(void *image, unsigned int size)
bpf_jit_free_exec(image);
}
-void __weak arch_protect_bpf_trampoline(void *image, unsigned int size)
-{
- WARN_ON_ONCE(size > PAGE_SIZE);
- set_memory_rox((long)image, 1);
-}
-
-void __weak arch_unprotect_bpf_trampoline(void *image, unsigned int size)
+int __weak arch_protect_bpf_trampoline(void *image, unsigned int size)
{
WARN_ON_ONCE(size > PAGE_SIZE);
- set_memory_nx((long)image, 1);
- set_memory_rw((long)image, 1);
+ return set_memory_rox((long)image, 1);
}
int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ddea9567f755..60611df77957 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -28,6 +28,8 @@
#include <linux/cpumask.h>
#include <linux/bpf_mem_alloc.h>
#include <net/xdp.h>
+#include <linux/trace_events.h>
+#include <linux/kallsyms.h>
#include "disasm.h"
@@ -172,7 +174,7 @@ static bool bpf_global_percpu_ma_set;
/* verifier_state + insn_idx are pushed to stack when branch is encountered */
struct bpf_verifier_stack_elem {
- /* verifer state is 'st'
+ /* verifier state is 'st'
* before processing instruction 'insn_idx'
* and after processing instruction 'prev_insn_idx'
*/
@@ -190,14 +192,12 @@ struct bpf_verifier_stack_elem {
#define BPF_MAP_KEY_POISON (1ULL << 63)
#define BPF_MAP_KEY_SEEN (1ULL << 62)
-#define BPF_MAP_PTR_UNPRIV 1UL
-#define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \
- POISON_POINTER_DELTA))
-#define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV))
-
#define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512
-static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
+#define BPF_PRIV_STACK_MIN_SIZE 64
+
+static int acquire_reference(struct bpf_verifier_env *env, int insn_idx);
+static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id);
static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env);
@@ -209,21 +209,22 @@ static bool is_trusted_reg(const struct bpf_reg_state *reg);
static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
{
- return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON;
+ return aux->map_ptr_state.poison;
}
static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux)
{
- return aux->map_ptr_state & BPF_MAP_PTR_UNPRIV;
+ return aux->map_ptr_state.unpriv;
}
static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
- const struct bpf_map *map, bool unpriv)
+ struct bpf_map *map,
+ bool unpriv, bool poison)
{
- BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV);
unpriv |= bpf_map_ptr_unpriv(aux);
- aux->map_ptr_state = (unsigned long)map |
- (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL);
+ aux->map_ptr_state.unpriv = unpriv;
+ aux->map_ptr_state.poison = poison;
+ aux->map_ptr_state.map_ptr = map;
}
static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux)
@@ -286,6 +287,7 @@ struct bpf_call_arg_meta {
u32 ret_btf_id;
u32 subprogno;
struct btf_field *kptr_field;
+ s64 const_map_key;
};
struct bpf_kfunc_call_arg_meta {
@@ -336,6 +338,10 @@ struct bpf_kfunc_call_arg_meta {
u8 spi;
u8 frameno;
} iter;
+ struct {
+ struct bpf_map *ptr;
+ int uid;
+ } map;
u64 mem_size;
};
@@ -383,11 +389,6 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env,
verbose(env, " should have been in [%d, %d]\n", range.minval, range.maxval);
}
-static bool type_may_be_null(u32 type)
-{
- return type & PTR_MAYBE_NULL;
-}
-
static bool reg_not_null(const struct bpf_reg_state *reg)
{
enum bpf_reg_type type;
@@ -501,8 +502,12 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id)
}
static bool is_sync_callback_calling_kfunc(u32 btf_id);
+static bool is_async_callback_calling_kfunc(u32 btf_id);
+static bool is_callback_calling_kfunc(u32 btf_id);
static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
+static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id);
+
static bool is_sync_callback_calling_function(enum bpf_func_id func_id)
{
return func_id == BPF_FUNC_for_each_map_elem ||
@@ -528,6 +533,22 @@ static bool is_sync_callback_calling_insn(struct bpf_insn *insn)
(bpf_pseudo_kfunc_call(insn) && is_sync_callback_calling_kfunc(insn->imm));
}
+static bool is_async_callback_calling_insn(struct bpf_insn *insn)
+{
+ return (bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm)) ||
+ (bpf_pseudo_kfunc_call(insn) && is_async_callback_calling_kfunc(insn->imm));
+}
+
+static bool is_may_goto_insn(struct bpf_insn *insn)
+{
+ return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO;
+}
+
+static bool is_may_goto_insn_at(struct bpf_verifier_env *env, int insn_idx)
+{
+ return is_may_goto_insn(&env->prog->insnsi[insn_idx]);
+}
+
static bool is_storage_get_function(enum bpf_func_id func_id)
{
return func_id == BPF_FUNC_sk_storage_get ||
@@ -622,6 +643,11 @@ static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
return stack_slot_obj_get_spi(env, reg, "iter", nr_slots);
}
+static int irq_flag_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ return stack_slot_obj_get_spi(env, reg, "irq_flag", 1);
+}
+
static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
{
switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
@@ -733,7 +759,7 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
if (clone_ref_obj_id)
id = clone_ref_obj_id;
else
- id = acquire_reference_state(env, insn_idx);
+ id = acquire_reference(env, insn_idx);
if (id < 0)
return id;
@@ -995,7 +1021,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,
if (spi < 0)
return spi;
- id = acquire_reference_state(env, insn_idx);
+ id = acquire_reference(env, insn_idx);
if (id < 0)
return id;
@@ -1117,10 +1143,136 @@ static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_s
return 0;
}
+static int acquire_irq_state(struct bpf_verifier_env *env, int insn_idx);
+static int release_irq_state(struct bpf_verifier_state *state, int id);
+
+static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env,
+ struct bpf_kfunc_call_arg_meta *meta,
+ struct bpf_reg_state *reg, int insn_idx)
+{
+ struct bpf_func_state *state = func(env, reg);
+ struct bpf_stack_state *slot;
+ struct bpf_reg_state *st;
+ int spi, i, id;
+
+ spi = irq_flag_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+
+ id = acquire_irq_state(env, insn_idx);
+ if (id < 0)
+ return id;
+
+ slot = &state->stack[spi];
+ st = &slot->spilled_ptr;
+
+ __mark_reg_known_zero(st);
+ st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
+ st->live |= REG_LIVE_WRITTEN;
+ st->ref_obj_id = id;
+
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ slot->slot_type[i] = STACK_IRQ_FLAG;
+
+ mark_stack_slot_scratched(env, spi);
+ return 0;
+}
+
+static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ struct bpf_stack_state *slot;
+ struct bpf_reg_state *st;
+ int spi, i, err;
+
+ spi = irq_flag_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+
+ slot = &state->stack[spi];
+ st = &slot->spilled_ptr;
+
+ err = release_irq_state(env->cur_state, st->ref_obj_id);
+ WARN_ON_ONCE(err && err != -EACCES);
+ if (err) {
+ int insn_idx = 0;
+
+ for (int i = 0; i < env->cur_state->acquired_refs; i++) {
+ if (env->cur_state->refs[i].id == env->cur_state->active_irq_id) {
+ insn_idx = env->cur_state->refs[i].insn_idx;
+ break;
+ }
+ }
+
+ verbose(env, "cannot restore irq state out of order, expected id=%d acquired at insn_idx=%d\n",
+ env->cur_state->active_irq_id, insn_idx);
+ return err;
+ }
+
+ __mark_reg_not_init(env, st);
+
+ /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */
+ st->live |= REG_LIVE_WRITTEN;
+
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ slot->slot_type[i] = STACK_INVALID;
+
+ mark_stack_slot_scratched(env, spi);
+ return 0;
+}
+
+static bool is_irq_flag_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ struct bpf_stack_state *slot;
+ int spi, i;
+
+ /* For -ERANGE (i.e. spi not falling into allocated stack slots), we
+ * will do check_mem_access to check and update stack bounds later, so
+ * return true for that case.
+ */
+ spi = irq_flag_get_spi(env, reg);
+ if (spi == -ERANGE)
+ return true;
+ if (spi < 0)
+ return false;
+
+ slot = &state->stack[spi];
+
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ if (slot->slot_type[i] == STACK_IRQ_FLAG)
+ return false;
+ return true;
+}
+
+static int is_irq_flag_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ struct bpf_stack_state *slot;
+ struct bpf_reg_state *st;
+ int spi, i;
+
+ spi = irq_flag_get_spi(env, reg);
+ if (spi < 0)
+ return -EINVAL;
+
+ slot = &state->stack[spi];
+ st = &slot->spilled_ptr;
+
+ if (!st->ref_obj_id)
+ return -EINVAL;
+
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ if (slot->slot_type[i] != STACK_IRQ_FLAG)
+ return -EINVAL;
+ return 0;
+}
+
/* Check if given stack slot is "special":
* - spilled register state (STACK_SPILL);
* - dynptr state (STACK_DYNPTR);
* - iter state (STACK_ITER).
+ * - irq flag state (STACK_IRQ_FLAG)
*/
static bool is_stack_slot_special(const struct bpf_stack_state *stack)
{
@@ -1130,6 +1282,7 @@ static bool is_stack_slot_special(const struct bpf_stack_state *stack)
case STACK_SPILL:
case STACK_DYNPTR:
case STACK_ITER:
+ case STACK_IRQ_FLAG:
return true;
case STACK_INVALID:
case STACK_MISC:
@@ -1155,17 +1308,26 @@ static bool is_spilled_scalar_reg(const struct bpf_stack_state *stack)
stack->spilled_ptr.type == SCALAR_VALUE;
}
+static bool is_spilled_scalar_reg64(const struct bpf_stack_state *stack)
+{
+ return stack->slot_type[0] == STACK_SPILL &&
+ stack->spilled_ptr.type == SCALAR_VALUE;
+}
+
/* Mark stack slot as STACK_MISC, unless it is already STACK_INVALID, in which
* case they are equivalent, or it's STACK_ZERO, in which case we preserve
* more precise STACK_ZERO.
- * Note, in uprivileged mode leaving STACK_INVALID is wrong, so we take
- * env->allow_ptr_leaks into account and force STACK_MISC, if necessary.
+ * Regardless of allow_ptr_leaks setting (i.e., privileged or unprivileged
+ * mode), we won't promote STACK_INVALID to STACK_MISC. In privileged case it is
+ * unnecessary as both are considered equivalent when loading data and pruning,
+ * in case of unprivileged mode it will be incorrect to allow reads of invalid
+ * slots.
*/
static void mark_stack_slot_misc(struct bpf_verifier_env *env, u8 *stype)
{
if (*stype == STACK_ZERO)
return;
- if (env->allow_ptr_leaks && *stype == STACK_INVALID)
+ if (*stype == STACK_INVALID)
return;
*stype = STACK_MISC;
}
@@ -1235,7 +1397,7 @@ out:
return arr ? arr : ZERO_SIZE_PTR;
}
-static int copy_reference_state(struct bpf_func_state *dst, const struct bpf_func_state *src)
+static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf_verifier_state *src)
{
dst->refs = copy_array(dst->refs, src->refs, src->acquired_refs,
sizeof(struct bpf_reference_state), GFP_KERNEL);
@@ -1243,6 +1405,10 @@ static int copy_reference_state(struct bpf_func_state *dst, const struct bpf_fun
return -ENOMEM;
dst->acquired_refs = src->acquired_refs;
+ dst->active_locks = src->active_locks;
+ dst->active_preempt_locks = src->active_preempt_locks;
+ dst->active_rcu_lock = src->active_rcu_lock;
+ dst->active_irq_id = src->active_irq_id;
return 0;
}
@@ -1259,7 +1425,7 @@ static int copy_stack_state(struct bpf_func_state *dst, const struct bpf_func_st
return 0;
}
-static int resize_reference_state(struct bpf_func_state *state, size_t n)
+static int resize_reference_state(struct bpf_verifier_state *state, size_t n)
{
state->refs = realloc_array(state->refs, state->acquired_refs, n,
sizeof(struct bpf_reference_state));
@@ -1302,61 +1468,146 @@ static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state
* On success, returns a valid pointer id to associate with the register
* On failure, returns a negative errno.
*/
-static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
+static struct bpf_reference_state *acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
{
- struct bpf_func_state *state = cur_func(env);
+ struct bpf_verifier_state *state = env->cur_state;
int new_ofs = state->acquired_refs;
- int id, err;
+ int err;
err = resize_reference_state(state, state->acquired_refs + 1);
if (err)
- return err;
- id = ++env->id_gen;
- state->refs[new_ofs].id = id;
+ return NULL;
state->refs[new_ofs].insn_idx = insn_idx;
- state->refs[new_ofs].callback_ref = state->in_callback_fn ? state->frameno : 0;
- return id;
+ return &state->refs[new_ofs];
+}
+
+static int acquire_reference(struct bpf_verifier_env *env, int insn_idx)
+{
+ struct bpf_reference_state *s;
+
+ s = acquire_reference_state(env, insn_idx);
+ if (!s)
+ return -ENOMEM;
+ s->type = REF_TYPE_PTR;
+ s->id = ++env->id_gen;
+ return s->id;
+}
+
+static int acquire_lock_state(struct bpf_verifier_env *env, int insn_idx, enum ref_state_type type,
+ int id, void *ptr)
+{
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_reference_state *s;
+
+ s = acquire_reference_state(env, insn_idx);
+ if (!s)
+ return -ENOMEM;
+ s->type = type;
+ s->id = id;
+ s->ptr = ptr;
+
+ state->active_locks++;
+ return 0;
+}
+
+static int acquire_irq_state(struct bpf_verifier_env *env, int insn_idx)
+{
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_reference_state *s;
+
+ s = acquire_reference_state(env, insn_idx);
+ if (!s)
+ return -ENOMEM;
+ s->type = REF_TYPE_IRQ;
+ s->id = ++env->id_gen;
+
+ state->active_irq_id = s->id;
+ return s->id;
}
-/* release function corresponding to acquire_reference_state(). Idempotent. */
-static int release_reference_state(struct bpf_func_state *state, int ptr_id)
+static void release_reference_state(struct bpf_verifier_state *state, int idx)
{
- int i, last_idx;
+ int last_idx;
+ size_t rem;
+ /* IRQ state requires the relative ordering of elements remaining the
+ * same, since it relies on the refs array to behave as a stack, so that
+ * it can detect out-of-order IRQ restore. Hence use memmove to shift
+ * the array instead of swapping the final element into the deleted idx.
+ */
last_idx = state->acquired_refs - 1;
+ rem = state->acquired_refs - idx - 1;
+ if (last_idx && idx != last_idx)
+ memmove(&state->refs[idx], &state->refs[idx + 1], sizeof(*state->refs) * rem);
+ memset(&state->refs[last_idx], 0, sizeof(*state->refs));
+ state->acquired_refs--;
+ return;
+}
+
+static int release_lock_state(struct bpf_verifier_state *state, int type, int id, void *ptr)
+{
+ int i;
+
for (i = 0; i < state->acquired_refs; i++) {
- if (state->refs[i].id == ptr_id) {
- /* Cannot release caller references in callbacks */
- if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
- return -EINVAL;
- if (last_idx && i != last_idx)
- memcpy(&state->refs[i], &state->refs[last_idx],
- sizeof(*state->refs));
- memset(&state->refs[last_idx], 0, sizeof(*state->refs));
- state->acquired_refs--;
+ if (state->refs[i].type != type)
+ continue;
+ if (state->refs[i].id == id && state->refs[i].ptr == ptr) {
+ release_reference_state(state, i);
+ state->active_locks--;
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
+static int release_irq_state(struct bpf_verifier_state *state, int id)
+{
+ u32 prev_id = 0;
+ int i;
+
+ if (id != state->active_irq_id)
+ return -EACCES;
+
+ for (i = 0; i < state->acquired_refs; i++) {
+ if (state->refs[i].type != REF_TYPE_IRQ)
+ continue;
+ if (state->refs[i].id == id) {
+ release_reference_state(state, i);
+ state->active_irq_id = prev_id;
return 0;
+ } else {
+ prev_id = state->refs[i].id;
}
}
return -EINVAL;
}
+static struct bpf_reference_state *find_lock_state(struct bpf_verifier_state *state, enum ref_state_type type,
+ int id, void *ptr)
+{
+ int i;
+
+ for (i = 0; i < state->acquired_refs; i++) {
+ struct bpf_reference_state *s = &state->refs[i];
+
+ if (s->type != type)
+ continue;
+
+ if (s->id == id && s->ptr == ptr)
+ return s;
+ }
+ return NULL;
+}
+
static void free_func_state(struct bpf_func_state *state)
{
if (!state)
return;
- kfree(state->refs);
kfree(state->stack);
kfree(state);
}
-static void clear_jmp_history(struct bpf_verifier_state *state)
-{
- kfree(state->jmp_history);
- state->jmp_history = NULL;
- state->jmp_history_cnt = 0;
-}
-
static void free_verifier_state(struct bpf_verifier_state *state,
bool free_self)
{
@@ -1366,7 +1617,7 @@ static void free_verifier_state(struct bpf_verifier_state *state,
free_func_state(state->frame[i]);
state->frame[i] = NULL;
}
- clear_jmp_history(state);
+ kfree(state->refs);
if (free_self)
kfree(state);
}
@@ -1377,12 +1628,7 @@ static void free_verifier_state(struct bpf_verifier_state *state,
static int copy_func_state(struct bpf_func_state *dst,
const struct bpf_func_state *src)
{
- int err;
-
- memcpy(dst, src, offsetof(struct bpf_func_state, acquired_refs));
- err = copy_reference_state(dst, src);
- if (err)
- return err;
+ memcpy(dst, src, offsetof(struct bpf_func_state, stack));
return copy_stack_state(dst, src);
}
@@ -1392,13 +1638,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
struct bpf_func_state *dst;
int i, err;
- dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history,
- src->jmp_history_cnt, sizeof(*dst_state->jmp_history),
- GFP_USER);
- if (!dst_state->jmp_history)
- return -ENOMEM;
- dst_state->jmp_history_cnt = src->jmp_history_cnt;
-
/* if dst has more stack frames then src frame, free them, this is also
* necessary in case of exceptional exits using bpf_throw.
*/
@@ -1406,18 +1645,22 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
free_func_state(dst_state->frame[i]);
dst_state->frame[i] = NULL;
}
+ err = copy_reference_state(dst_state, src);
+ if (err)
+ return err;
dst_state->speculative = src->speculative;
- dst_state->active_rcu_lock = src->active_rcu_lock;
+ dst_state->in_sleepable = src->in_sleepable;
dst_state->curframe = src->curframe;
- dst_state->active_lock.ptr = src->active_lock.ptr;
- dst_state->active_lock.id = src->active_lock.id;
dst_state->branches = src->branches;
dst_state->parent = src->parent;
dst_state->first_insn_idx = src->first_insn_idx;
dst_state->last_insn_idx = src->last_insn_idx;
+ dst_state->insn_hist_start = src->insn_hist_start;
+ dst_state->insn_hist_end = src->insn_hist_end;
dst_state->dfs_depth = src->dfs_depth;
dst_state->callback_unroll_depth = src->callback_unroll_depth;
dst_state->used_as_loop_entry = src->used_as_loop_entry;
+ dst_state->may_goto_depth = src->may_goto_depth;
for (i = 0; i <= src->curframe; i++) {
dst = dst_state->frame[i];
if (!dst) {
@@ -1820,6 +2063,8 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
*/
if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER))
reg->map_uid = reg->id;
+ if (btf_record_has_field(map->inner_map_meta->record, BPF_WORKQUEUE))
+ reg->map_uid = reg->id;
} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
reg->type = PTR_TO_XDP_SOCK;
} else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
@@ -2113,7 +2358,7 @@ static void __reg64_deduce_bounds(struct bpf_reg_state *reg)
static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg)
{
/* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit
- * values on both sides of 64-bit range in hope to have tigher range.
+ * values on both sides of 64-bit range in hope to have tighter range.
* E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from
* 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff].
* With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound
@@ -2121,7 +2366,7 @@ static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg)
* _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a
* better overall bounds for r1 as [0x1'000000001; 0x3'7fffffff].
* We just need to make sure that derived bounds we are intersecting
- * with are well-formed ranges in respecitve s64 or u64 domain, just
+ * with are well-formed ranges in respective s64 or u64 domain, just
* like we do with similar kinds of 32-to-64 or 64-to-32 adjustments.
*/
__u64 new_umin, new_umax;
@@ -2151,6 +2396,44 @@ static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg)
reg->smin_value = max_t(s64, reg->smin_value, new_smin);
reg->smax_value = min_t(s64, reg->smax_value, new_smax);
}
+
+ /* Here we would like to handle a special case after sign extending load,
+ * when upper bits for a 64-bit range are all 1s or all 0s.
+ *
+ * Upper bits are all 1s when register is in a range:
+ * [0xffff_ffff_0000_0000, 0xffff_ffff_ffff_ffff]
+ * Upper bits are all 0s when register is in a range:
+ * [0x0000_0000_0000_0000, 0x0000_0000_ffff_ffff]
+ * Together this forms are continuous range:
+ * [0xffff_ffff_0000_0000, 0x0000_0000_ffff_ffff]
+ *
+ * Now, suppose that register range is in fact tighter:
+ * [0xffff_ffff_8000_0000, 0x0000_0000_ffff_ffff] (R)
+ * Also suppose that it's 32-bit range is positive,
+ * meaning that lower 32-bits of the full 64-bit register
+ * are in the range:
+ * [0x0000_0000, 0x7fff_ffff] (W)
+ *
+ * If this happens, then any value in a range:
+ * [0xffff_ffff_0000_0000, 0xffff_ffff_7fff_ffff]
+ * is smaller than a lowest bound of the range (R):
+ * 0xffff_ffff_8000_0000
+ * which means that upper bits of the full 64-bit register
+ * can't be all 1s, when lower bits are in range (W).
+ *
+ * Note that:
+ * - 0xffff_ffff_8000_0000 == (s64)S32_MIN
+ * - 0x0000_0000_7fff_ffff == (s64)S32_MAX
+ * These relations are used in the conditions below.
+ */
+ if (reg->s32_min_value >= 0 && reg->smin_value >= S32_MIN && reg->smax_value <= S32_MAX) {
+ reg->smin_value = reg->s32_min_value;
+ reg->smax_value = reg->s32_max_value;
+ reg->umin_value = reg->s32_min_value;
+ reg->umax_value = reg->s32_max_value;
+ reg->var_off = tnum_intersect(reg->var_off,
+ tnum_range(reg->smin_value, reg->smax_value));
+ }
}
static void __reg_deduce_bounds(struct bpf_reg_state *reg)
@@ -2264,8 +2547,7 @@ static void __reg_assign_32_into_64(struct bpf_reg_state *reg)
}
/* Mark a register as having a completely unknown (scalar) value. */
-static void __mark_reg_unknown(const struct bpf_verifier_env *env,
- struct bpf_reg_state *reg)
+static void __mark_reg_unknown_imprecise(struct bpf_reg_state *reg)
{
/*
* Clear type, off, and union(map_ptr, range) and
@@ -2277,10 +2559,20 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env,
reg->ref_obj_id = 0;
reg->var_off = tnum_unknown;
reg->frameno = 0;
- reg->precise = !env->bpf_capable;
+ reg->precise = false;
__mark_reg_unbounded(reg);
}
+/* Mark a register as having a completely unknown (scalar) value,
+ * initialize .precise as true when not bpf capable.
+ */
+static void __mark_reg_unknown(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg)
+{
+ __mark_reg_unknown_imprecise(reg);
+ reg->precise = !env->bpf_capable;
+}
+
static void mark_reg_unknown(struct bpf_verifier_env *env,
struct bpf_reg_state *regs, u32 regno)
{
@@ -2294,6 +2586,25 @@ static void mark_reg_unknown(struct bpf_verifier_env *env,
__mark_reg_unknown(env, regs + regno);
}
+static int __mark_reg_s32_range(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs,
+ u32 regno,
+ s32 s32_min,
+ s32 s32_max)
+{
+ struct bpf_reg_state *reg = regs + regno;
+
+ reg->s32_min_value = max_t(s32, reg->s32_min_value, s32_min);
+ reg->s32_max_value = min_t(s32, reg->s32_max_value, s32_max);
+
+ reg->smin_value = max_t(s64, reg->smin_value, s32_min);
+ reg->smax_value = min_t(s64, reg->smax_value, s32_max);
+
+ reg_bounds_sync(reg);
+
+ return reg_bounds_sanity_check(env, reg, "s32_range");
+}
+
static void __mark_reg_not_init(const struct bpf_verifier_env *env,
struct bpf_reg_state *reg)
{
@@ -2328,6 +2639,8 @@ static void mark_btf_ld_reg(struct bpf_verifier_env *env,
regs[regno].type = PTR_TO_BTF_ID | flag;
regs[regno].btf = btf;
regs[regno].btf_id = btf_id;
+ if (type_may_be_null(flag))
+ regs[regno].id = ++env->id_gen;
}
#define DEF_NOT_SUBREG (0)
@@ -2371,7 +2684,7 @@ static void init_func_state(struct bpf_verifier_env *env,
/* Similar to push_stack(), but for async callbacks */
static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
int insn_idx, int prev_insn_idx,
- int subprog)
+ int subprog, bool is_sleepable)
{
struct bpf_verifier_stack_elem *elem;
struct bpf_func_state *frame;
@@ -2396,8 +2709,14 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
* The caller state doesn't matter.
* This is async callback. It starts in a fresh stack.
* Initialize it similar to do_check_common().
+ * But we do need to make sure to not clobber insn_hist, so we keep
+ * chaining insn_hist_start/insn_hist_end indices as for a normal
+ * child state.
*/
elem->st.branches = 1;
+ elem->st.in_sleepable = is_sleepable;
+ elem->st.insn_hist_start = env->cur_state->insn_hist_end;
+ elem->st.insn_hist_end = elem->st.insn_hist_start;
frame = kzalloc(sizeof(*frame), GFP_KERNEL);
if (!frame)
goto err;
@@ -2428,16 +2747,36 @@ static int cmp_subprogs(const void *a, const void *b)
((struct bpf_subprog_info *)b)->start;
}
+/* Find subprogram that contains instruction at 'off' */
+static struct bpf_subprog_info *find_containing_subprog(struct bpf_verifier_env *env, int off)
+{
+ struct bpf_subprog_info *vals = env->subprog_info;
+ int l, r, m;
+
+ if (off >= env->prog->len || off < 0 || env->subprog_cnt == 0)
+ return NULL;
+
+ l = 0;
+ r = env->subprog_cnt - 1;
+ while (l < r) {
+ m = l + (r - l + 1) / 2;
+ if (vals[m].start <= off)
+ l = m;
+ else
+ r = m - 1;
+ }
+ return &vals[l];
+}
+
+/* Find subprogram that starts exactly at 'off' */
static int find_subprog(struct bpf_verifier_env *env, int off)
{
struct bpf_subprog_info *p;
- p = bsearch(&off, env->subprog_info, env->subprog_cnt,
- sizeof(env->subprog_info[0]), cmp_subprogs);
- if (!p)
+ p = find_containing_subprog(env, off);
+ if (!p || p->start != off)
return -ENOENT;
return p - env->subprog_info;
-
}
static int add_subprog(struct bpf_verifier_env *env, int off)
@@ -2653,10 +2992,16 @@ static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env,
b->module = mod;
b->offset = offset;
+ /* sort() reorders entries by value, so b may no longer point
+ * to the right entry after this
+ */
sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
kfunc_btf_cmp_by_off, NULL);
+ } else {
+ btf = b->btf;
}
- return b->btf;
+
+ return btf;
}
void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab)
@@ -2939,8 +3284,10 @@ static int check_subprogs(struct bpf_verifier_env *env)
if (code == (BPF_JMP | BPF_CALL) &&
insn[i].src_reg == 0 &&
- insn[i].imm == BPF_FUNC_tail_call)
+ insn[i].imm == BPF_FUNC_tail_call) {
subprog[cur_subprog].has_tail_call = true;
+ subprog[cur_subprog].tail_call_reachable = true;
+ }
if (BPF_CLASS(code) == BPF_LD &&
(BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND))
subprog[cur_subprog].has_ld_abs = true;
@@ -3028,10 +3375,27 @@ static int mark_reg_read(struct bpf_verifier_env *env,
return 0;
}
-static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+static int mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ int spi, int nr_slots)
{
struct bpf_func_state *state = func(env, reg);
- int spi, ret;
+ int err, i;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr;
+
+ err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64);
+ if (err)
+ return err;
+
+ mark_stack_slot_scratched(env, spi - i);
+ }
+ return 0;
+}
+
+static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ int spi;
/* For CONST_PTR_TO_DYNPTR, it must have already been done by
* check_reg_arg in check_helper_call and mark_btf_func_reg_size in
@@ -3046,31 +3410,23 @@ static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *
* bounds and spi is the first dynptr slot. Simply mark stack slot as
* read.
*/
- ret = mark_reg_read(env, &state->stack[spi].spilled_ptr,
- state->stack[spi].spilled_ptr.parent, REG_LIVE_READ64);
- if (ret)
- return ret;
- return mark_reg_read(env, &state->stack[spi - 1].spilled_ptr,
- state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64);
+ return mark_stack_slot_obj_read(env, reg, spi, BPF_DYNPTR_NR_SLOTS);
}
static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
int spi, int nr_slots)
{
- struct bpf_func_state *state = func(env, reg);
- int err, i;
-
- for (i = 0; i < nr_slots; i++) {
- struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr;
-
- err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64);
- if (err)
- return err;
+ return mark_stack_slot_obj_read(env, reg, spi, nr_slots);
+}
- mark_stack_slot_scratched(env, spi - i);
- }
+static int mark_irq_flag_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ int spi;
- return 0;
+ spi = irq_flag_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+ return mark_stack_slot_obj_read(env, reg, spi, 1);
}
/* This function is supposed to be used by the following 32-bit optimization
@@ -3172,7 +3528,8 @@ static int insn_def_regno(const struct bpf_insn *insn)
case BPF_ST:
return -1;
case BPF_STX:
- if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ if ((BPF_MODE(insn->code) == BPF_ATOMIC ||
+ BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) &&
(insn->imm & BPF_FETCH)) {
if (insn->imm == BPF_CMPXCHG)
return BPF_REG_0;
@@ -3289,12 +3646,89 @@ static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx)
return env->insn_aux_data[insn_idx].jmp_point;
}
+#define LR_FRAMENO_BITS 3
+#define LR_SPI_BITS 6
+#define LR_ENTRY_BITS (LR_SPI_BITS + LR_FRAMENO_BITS + 1)
+#define LR_SIZE_BITS 4
+#define LR_FRAMENO_MASK ((1ull << LR_FRAMENO_BITS) - 1)
+#define LR_SPI_MASK ((1ull << LR_SPI_BITS) - 1)
+#define LR_SIZE_MASK ((1ull << LR_SIZE_BITS) - 1)
+#define LR_SPI_OFF LR_FRAMENO_BITS
+#define LR_IS_REG_OFF (LR_SPI_BITS + LR_FRAMENO_BITS)
+#define LINKED_REGS_MAX 6
+
+struct linked_reg {
+ u8 frameno;
+ union {
+ u8 spi;
+ u8 regno;
+ };
+ bool is_reg;
+};
+
+struct linked_regs {
+ int cnt;
+ struct linked_reg entries[LINKED_REGS_MAX];
+};
+
+static struct linked_reg *linked_regs_push(struct linked_regs *s)
+{
+ if (s->cnt < LINKED_REGS_MAX)
+ return &s->entries[s->cnt++];
+
+ return NULL;
+}
+
+/* Use u64 as a vector of 6 10-bit values, use first 4-bits to track
+ * number of elements currently in stack.
+ * Pack one history entry for linked registers as 10 bits in the following format:
+ * - 3-bits frameno
+ * - 6-bits spi_or_reg
+ * - 1-bit is_reg
+ */
+static u64 linked_regs_pack(struct linked_regs *s)
+{
+ u64 val = 0;
+ int i;
+
+ for (i = 0; i < s->cnt; ++i) {
+ struct linked_reg *e = &s->entries[i];
+ u64 tmp = 0;
+
+ tmp |= e->frameno;
+ tmp |= e->spi << LR_SPI_OFF;
+ tmp |= (e->is_reg ? 1 : 0) << LR_IS_REG_OFF;
+
+ val <<= LR_ENTRY_BITS;
+ val |= tmp;
+ }
+ val <<= LR_SIZE_BITS;
+ val |= s->cnt;
+ return val;
+}
+
+static void linked_regs_unpack(u64 val, struct linked_regs *s)
+{
+ int i;
+
+ s->cnt = val & LR_SIZE_MASK;
+ val >>= LR_SIZE_BITS;
+
+ for (i = 0; i < s->cnt; ++i) {
+ struct linked_reg *e = &s->entries[i];
+
+ e->frameno = val & LR_FRAMENO_MASK;
+ e->spi = (val >> LR_SPI_OFF) & LR_SPI_MASK;
+ e->is_reg = (val >> LR_IS_REG_OFF) & 0x1;
+ val >>= LR_ENTRY_BITS;
+ }
+}
+
/* for any branch, call, exit record the history of jmps in the given state */
-static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
- int insn_flags)
+static int push_insn_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
+ int insn_flags, u64 linked_regs)
{
- u32 cnt = cur->jmp_history_cnt;
- struct bpf_jmp_history_entry *p;
+ struct bpf_insn_hist_entry *p;
size_t alloc_size;
/* combine instruction flags if we already recorded this instruction */
@@ -3307,31 +3741,39 @@ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_st
"verifier insn history bug: insn_idx %d cur flags %x new flags %x\n",
env->insn_idx, env->cur_hist_ent->flags, insn_flags);
env->cur_hist_ent->flags |= insn_flags;
+ WARN_ONCE(env->cur_hist_ent->linked_regs != 0,
+ "verifier insn history bug: insn_idx %d linked_regs != 0: %#llx\n",
+ env->insn_idx, env->cur_hist_ent->linked_regs);
+ env->cur_hist_ent->linked_regs = linked_regs;
return 0;
}
- cnt++;
- alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
- p = krealloc(cur->jmp_history, alloc_size, GFP_USER);
- if (!p)
- return -ENOMEM;
- cur->jmp_history = p;
+ if (cur->insn_hist_end + 1 > env->insn_hist_cap) {
+ alloc_size = size_mul(cur->insn_hist_end + 1, sizeof(*p));
+ p = kvrealloc(env->insn_hist, alloc_size, GFP_USER);
+ if (!p)
+ return -ENOMEM;
+ env->insn_hist = p;
+ env->insn_hist_cap = alloc_size / sizeof(*p);
+ }
- p = &cur->jmp_history[cnt - 1];
+ p = &env->insn_hist[cur->insn_hist_end];
p->idx = env->insn_idx;
p->prev_idx = env->prev_insn_idx;
p->flags = insn_flags;
- cur->jmp_history_cnt = cnt;
+ p->linked_regs = linked_regs;
+
+ cur->insn_hist_end++;
env->cur_hist_ent = p;
return 0;
}
-static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st,
- u32 hist_end, int insn_idx)
+static struct bpf_insn_hist_entry *get_insn_hist_entry(struct bpf_verifier_env *env,
+ u32 hist_start, u32 hist_end, int insn_idx)
{
- if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx)
- return &st->jmp_history[hist_end - 1];
+ if (hist_end > hist_start && env->insn_hist[hist_end - 1].idx == insn_idx)
+ return &env->insn_hist[hist_end - 1];
return NULL;
}
@@ -3348,25 +3790,26 @@ static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_stat
* history entry recording a jump from last instruction of parent state and
* first instruction of given state.
*/
-static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
- u32 *history)
+static int get_prev_insn_idx(const struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st,
+ int insn_idx, u32 hist_start, u32 *hist_endp)
{
- u32 cnt = *history;
+ u32 hist_end = *hist_endp;
+ u32 cnt = hist_end - hist_start;
- if (i == st->first_insn_idx) {
+ if (insn_idx == st->first_insn_idx) {
if (cnt == 0)
return -ENOENT;
- if (cnt == 1 && st->jmp_history[0].idx == i)
+ if (cnt == 1 && env->insn_hist[hist_start].idx == insn_idx)
return -ENOENT;
}
- if (cnt && st->jmp_history[cnt - 1].idx == i) {
- i = st->jmp_history[cnt - 1].prev_idx;
- (*history)--;
+ if (cnt && env->insn_hist[hist_end - 1].idx == insn_idx) {
+ (*hist_endp)--;
+ return env->insn_hist[hist_end - 1].prev_idx;
} else {
- i--;
+ return insn_idx - 1;
}
- return i;
}
static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
@@ -3486,6 +3929,11 @@ static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg)
return bt->reg_masks[bt->frame] & (1 << reg);
}
+static inline bool bt_is_frame_reg_set(struct backtrack_state *bt, u32 frame, u32 reg)
+{
+ return bt->reg_masks[frame] & (1 << reg);
+}
+
static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u32 slot)
{
return bt->stack_masks[frame] & (1ull << slot);
@@ -3530,6 +3978,42 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
}
}
+/* If any register R in hist->linked_regs is marked as precise in bt,
+ * do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs.
+ */
+static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_insn_hist_entry *hist)
+{
+ struct linked_regs linked_regs;
+ bool some_precise = false;
+ int i;
+
+ if (!hist || hist->linked_regs == 0)
+ return;
+
+ linked_regs_unpack(hist->linked_regs, &linked_regs);
+ for (i = 0; i < linked_regs.cnt; ++i) {
+ struct linked_reg *e = &linked_regs.entries[i];
+
+ if ((e->is_reg && bt_is_frame_reg_set(bt, e->frameno, e->regno)) ||
+ (!e->is_reg && bt_is_frame_slot_set(bt, e->frameno, e->spi))) {
+ some_precise = true;
+ break;
+ }
+ }
+
+ if (!some_precise)
+ return;
+
+ for (i = 0; i < linked_regs.cnt; ++i) {
+ struct linked_reg *e = &linked_regs.entries[i];
+
+ if (e->is_reg)
+ bt_set_frame_reg(bt, e->frameno, e->regno);
+ else
+ bt_set_frame_slot(bt, e->frameno, e->spi);
+ }
+}
+
static bool calls_callback(struct bpf_verifier_env *env, int insn_idx);
/* For given verifier state backtrack_insn() is called from the last insn to
@@ -3542,7 +4026,7 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx);
* - *was* processed previously during backtracking.
*/
static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
- struct bpf_jmp_history_entry *hist, struct backtrack_state *bt)
+ struct bpf_insn_hist_entry *hist, struct backtrack_state *bt)
{
const struct bpf_insn_cbs cbs = {
.cb_call = disasm_kfunc_name,
@@ -3569,6 +4053,12 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
}
+ /* If there is a history record that some registers gained range at this insn,
+ * propagate precision marks to those registers, so that bt_is_reg_set()
+ * accounts for these registers.
+ */
+ bt_sync_linked_regs(bt, hist);
+
if (class == BPF_ALU || class == BPF_ALU64) {
if (!bt_is_reg_set(bt, dreg))
return 0;
@@ -3584,7 +4074,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* sreg needs precision before this insn
*/
bt_clear_reg(bt, dreg);
- bt_set_reg(bt, sreg);
+ if (sreg != BPF_REG_FP)
+ bt_set_reg(bt, sreg);
} else {
/* dreg = K
* dreg needs precision after this insn.
@@ -3600,7 +4091,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* both dreg and sreg need precision
* before this insn
*/
- bt_set_reg(bt, sreg);
+ if (sreg != BPF_REG_FP)
+ bt_set_reg(bt, sreg);
} /* else dreg += K
* dreg still needs precision before this insn
*/
@@ -3796,7 +4288,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
*/
bt_set_reg(bt, dreg);
bt_set_reg(bt, sreg);
- /* else dreg <cond> K
+ } else if (BPF_SRC(insn->code) == BPF_K) {
+ /* dreg <cond> K
* Only dreg still needs precision before
* this insn, so for the K-based conditional
* there is nothing new to be marked.
@@ -3814,6 +4307,10 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
/* to be analyzed */
return -ENOTSUPP;
}
+ /* Propagate precision marks to linked registers, to account for
+ * registers marked as precise in this function.
+ */
+ bt_sync_linked_regs(bt, hist);
return 0;
}
@@ -3941,96 +4438,6 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_
}
}
-static bool idset_contains(struct bpf_idset *s, u32 id)
-{
- u32 i;
-
- for (i = 0; i < s->count; ++i)
- if (s->ids[i] == id)
- return true;
-
- return false;
-}
-
-static int idset_push(struct bpf_idset *s, u32 id)
-{
- if (WARN_ON_ONCE(s->count >= ARRAY_SIZE(s->ids)))
- return -EFAULT;
- s->ids[s->count++] = id;
- return 0;
-}
-
-static void idset_reset(struct bpf_idset *s)
-{
- s->count = 0;
-}
-
-/* Collect a set of IDs for all registers currently marked as precise in env->bt.
- * Mark all registers with these IDs as precise.
- */
-static int mark_precise_scalar_ids(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
-{
- struct bpf_idset *precise_ids = &env->idset_scratch;
- struct backtrack_state *bt = &env->bt;
- struct bpf_func_state *func;
- struct bpf_reg_state *reg;
- DECLARE_BITMAP(mask, 64);
- int i, fr;
-
- idset_reset(precise_ids);
-
- for (fr = bt->frame; fr >= 0; fr--) {
- func = st->frame[fr];
-
- bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr));
- for_each_set_bit(i, mask, 32) {
- reg = &func->regs[i];
- if (!reg->id || reg->type != SCALAR_VALUE)
- continue;
- if (idset_push(precise_ids, reg->id))
- return -EFAULT;
- }
-
- bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
- for_each_set_bit(i, mask, 64) {
- if (i >= func->allocated_stack / BPF_REG_SIZE)
- break;
- if (!is_spilled_scalar_reg(&func->stack[i]))
- continue;
- reg = &func->stack[i].spilled_ptr;
- if (!reg->id)
- continue;
- if (idset_push(precise_ids, reg->id))
- return -EFAULT;
- }
- }
-
- for (fr = 0; fr <= st->curframe; ++fr) {
- func = st->frame[fr];
-
- for (i = BPF_REG_0; i < BPF_REG_10; ++i) {
- reg = &func->regs[i];
- if (!reg->id)
- continue;
- if (!idset_contains(precise_ids, reg->id))
- continue;
- bt_set_frame_reg(bt, fr, i);
- }
- for (i = 0; i < func->allocated_stack / BPF_REG_SIZE; ++i) {
- if (!is_spilled_scalar_reg(&func->stack[i]))
- continue;
- reg = &func->stack[i].spilled_ptr;
- if (!reg->id)
- continue;
- if (!idset_contains(precise_ids, reg->id))
- continue;
- bt_set_frame_slot(bt, fr, i);
- }
- }
-
- return 0;
-}
-
/*
* __mark_chain_precision() backtracks BPF program instruction sequence and
* chain of verifier states making sure that register *regno* (if regno >= 0)
@@ -4038,7 +4445,7 @@ static int mark_precise_scalar_ids(struct bpf_verifier_env *env, struct bpf_veri
* SCALARS, as well as any other registers and slots that contribute to
* a tracked state of given registers/stack slots, depending on specific BPF
* assembly instructions (see backtrack_insns() for exact instruction handling
- * logic). This backtracking relies on recorded jmp_history and is able to
+ * logic). This backtracking relies on recorded insn_hist and is able to
* traverse entire chain of parent states. This process ends only when all the
* necessary registers/slots and their transitive dependencies are marked as
* precise.
@@ -4155,39 +4562,15 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
for (;;) {
DECLARE_BITMAP(mask, 64);
- u32 history = st->jmp_history_cnt;
- struct bpf_jmp_history_entry *hist;
+ u32 hist_start = st->insn_hist_start;
+ u32 hist_end = st->insn_hist_end;
+ struct bpf_insn_hist_entry *hist;
if (env->log.level & BPF_LOG_LEVEL2) {
verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n",
bt->frame, last_idx, first_idx, subseq_idx);
}
- /* If some register with scalar ID is marked as precise,
- * make sure that all registers sharing this ID are also precise.
- * This is needed to estimate effect of find_equal_scalars().
- * Do this at the last instruction of each state,
- * bpf_reg_state::id fields are valid for these instructions.
- *
- * Allows to track precision in situation like below:
- *
- * r2 = unknown value
- * ...
- * --- state #0 ---
- * ...
- * r1 = r2 // r1 and r2 now share the same ID
- * ...
- * --- state #1 {r1.id = A, r2.id = A} ---
- * ...
- * if (r2 > 10) goto exit; // find_equal_scalars() assigns range to r1
- * ...
- * --- state #2 {r1.id = A, r2.id = A} ---
- * r3 = r10
- * r3 += r1 // need to mark both r1 and r2
- */
- if (mark_precise_scalar_ids(env, st))
- return -EFAULT;
-
if (last_idx < 0) {
/* we are at the entry into subprog, which
* is expected for global funcs, but only if
@@ -4220,7 +4603,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
err = 0;
skip_first = false;
} else {
- hist = get_jmp_hist_entry(st, history, i);
+ hist = get_insn_hist_entry(env, hist_start, hist_end, i);
err = backtrack_insn(env, i, subseq_idx, hist, bt);
}
if (err == -ENOTSUPP) {
@@ -4237,7 +4620,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
*/
return 0;
subseq_idx = i;
- i = get_prev_insn_idx(st, i, &history);
+ i = get_prev_insn_idx(env, st, i, hist_start, &hist_end);
if (i == -ENOENT)
break;
if (i >= env->prog->len) {
@@ -4298,7 +4681,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
bt_frame_stack_mask(bt, fr));
verbose(env, "stack=%s: ", env->tmp_str_buf);
- print_verifier_state(env, func, true);
+ print_verifier_state(env, st, fr, true);
}
}
@@ -4355,6 +4738,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_MEM:
case PTR_TO_FUNC:
case PTR_TO_MAP_KEY:
+ case PTR_TO_ARENA:
return true;
default:
return false;
@@ -4380,20 +4764,6 @@ static u64 reg_const_value(struct bpf_reg_state *reg, bool subreg32)
return subreg32 ? tnum_subreg(reg->var_off).value : reg->var_off.value;
}
-static bool __is_scalar_unbounded(struct bpf_reg_state *reg)
-{
- return tnum_is_unknown(reg->var_off) &&
- reg->smin_value == S64_MIN && reg->smax_value == S64_MAX &&
- reg->umin_value == 0 && reg->umax_value == U64_MAX &&
- reg->s32_min_value == S32_MIN && reg->s32_max_value == S32_MAX &&
- reg->u32_min_value == 0 && reg->u32_max_value == U32_MAX;
-}
-
-static bool register_is_bounded(struct bpf_reg_state *reg)
-{
- return reg->type == SCALAR_VALUE && !__is_scalar_unbounded(reg);
-}
-
static bool __is_pointer_value(bool allow_ptr_leaks,
const struct bpf_reg_state *reg)
{
@@ -4403,6 +4773,30 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
return reg->type != SCALAR_VALUE;
}
+static void assign_scalar_id_before_mov(struct bpf_verifier_env *env,
+ struct bpf_reg_state *src_reg)
+{
+ if (src_reg->type != SCALAR_VALUE)
+ return;
+
+ if (src_reg->id & BPF_ADD_CONST) {
+ /*
+ * The verifier is processing rX = rY insn and
+ * rY->id has special linked register already.
+ * Cleared it, since multiple rX += const are not supported.
+ */
+ src_reg->id = 0;
+ src_reg->off = 0;
+ }
+
+ if (!src_reg->id && !tnum_is_const(src_reg->var_off))
+ /* Ensure that src_reg has a valid ID that will be copied to
+ * dst_reg and then will be used by sync_linked_regs() to
+ * propagate min/max range.
+ */
+ src_reg->id = ++env->id_gen;
+}
+
/* Copy src state preserving dst->parent and dst->live fields */
static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src)
{
@@ -4438,6 +4832,36 @@ static bool is_bpf_st_mem(struct bpf_insn *insn)
return BPF_CLASS(insn->code) == BPF_ST && BPF_MODE(insn->code) == BPF_MEM;
}
+static int get_reg_width(struct bpf_reg_state *reg)
+{
+ return fls64(reg->umax_value);
+}
+
+/* See comment for mark_fastcall_pattern_for_call() */
+static void check_fastcall_stack_contract(struct bpf_verifier_env *env,
+ struct bpf_func_state *state, int insn_idx, int off)
+{
+ struct bpf_subprog_info *subprog = &env->subprog_info[state->subprogno];
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
+ int i;
+
+ if (subprog->fastcall_stack_off <= off || aux[insn_idx].fastcall_pattern)
+ return;
+ /* access to the region [max_stack_depth .. fastcall_stack_off)
+ * from something that is not a part of the fastcall pattern,
+ * disable fastcall rewrites for current subprogram by setting
+ * fastcall_stack_off to a value smaller than any possible offset.
+ */
+ subprog->fastcall_stack_off = S16_MIN;
+ /* reset fastcall aux flags within subprogram,
+ * happens at most once per subprogram
+ */
+ for (i = subprog->start; i < (subprog + 1)->start; ++i) {
+ aux[i].fastcall_spills_num = 0;
+ aux[i].fastcall_pattern = 0;
+ }
+}
+
/* check_stack_{read,write}_fixed_off functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
@@ -4458,6 +4882,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
*/
if (!env->allow_ptr_leaks &&
is_spilled_reg(&state->stack[spi]) &&
+ !is_spilled_scalar_reg(&state->stack[spi]) &&
size != BPF_REG_SIZE) {
verbose(env, "attempt to corrupt spilled pointer on stack\n");
return -EACCES;
@@ -4486,19 +4911,27 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
if (err)
return err;
+ check_fastcall_stack_contract(env, state, insn_idx, off);
mark_stack_slot_scratched(env, spi);
- if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) && env->bpf_capable) {
+ if (reg && !(off % BPF_REG_SIZE) && reg->type == SCALAR_VALUE && env->bpf_capable) {
+ bool reg_value_fits;
+
+ reg_value_fits = get_reg_width(reg) <= BITS_PER_BYTE * size;
+ /* Make sure that reg had an ID to build a relation on spill. */
+ if (reg_value_fits)
+ assign_scalar_id_before_mov(env, reg);
save_register_state(env, state, spi, reg, size);
/* Break the relation on a narrowing spill. */
- if (fls64(reg->umax_value) > BITS_PER_BYTE * size)
+ if (!reg_value_fits)
state->stack[spi].spilled_ptr.id = 0;
} else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) &&
- insn->imm != 0 && env->bpf_capable) {
- struct bpf_reg_state fake_reg = {};
+ env->bpf_capable) {
+ struct bpf_reg_state *tmp_reg = &env->fake_reg[0];
- __mark_reg_known(&fake_reg, insn->imm);
- fake_reg.type = SCALAR_VALUE;
- save_register_state(env, state, spi, &fake_reg, size);
+ memset(tmp_reg, 0, sizeof(*tmp_reg));
+ __mark_reg_known(tmp_reg, insn->imm);
+ tmp_reg->type = SCALAR_VALUE;
+ save_register_state(env, state, spi, tmp_reg, size);
} else if (reg && is_spillable_regtype(reg->type)) {
/* register containing pointer is being spilled into stack */
if (size != BPF_REG_SIZE) {
@@ -4554,7 +4987,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
}
if (insn_flags)
- return push_jmp_history(env, env->cur_state, insn_flags);
+ return push_insn_history(env, env->cur_state, insn_flags, 0);
return 0;
}
@@ -4613,6 +5046,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
return err;
}
+ check_fastcall_stack_contract(env, state, insn_idx, min_off);
/* Variable offset writes destroy any spilled pointers in range. */
for (i = min_off; i < max_off; i++) {
u8 new_type, *stype;
@@ -4640,7 +5074,20 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
return -EINVAL;
}
- /* Erase all spilled pointers. */
+ /* If writing_zero and the spi slot contains a spill of value 0,
+ * maintain the spill type.
+ */
+ if (writing_zero && *stype == STACK_SPILL &&
+ is_spilled_scalar_reg(&state->stack[spi])) {
+ struct bpf_reg_state *spill_reg = &state->stack[spi].spilled_ptr;
+
+ if (tnum_is_const(spill_reg->var_off) && spill_reg->var_off.value == 0) {
+ zero_used = true;
+ continue;
+ }
+ }
+
+ /* Erase all other spilled pointers. */
state->stack[spi].spilled_ptr.type = NOT_INIT;
/* Update the slot type. */
@@ -4738,6 +5185,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
reg = &reg_state->stack[spi].spilled_ptr;
mark_stack_slot_scratched(env, spi);
+ check_fastcall_stack_contract(env, state, env->insn_idx, off);
if (is_spilled_reg(&reg_state->stack[spi])) {
u8 spill_size = 1;
@@ -4756,7 +5204,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
if (dst_regno < 0)
return 0;
- if (!(off % BPF_REG_SIZE) && size == spill_size) {
+ if (size <= spill_size &&
+ bpf_stack_narrow_access_ok(off, size, spill_size)) {
/* The earlier check_reg_arg() has decided the
* subreg_def for this insn. Save it first.
*/
@@ -4764,6 +5213,12 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
copy_register_state(&state->regs[dst_regno], reg);
state->regs[dst_regno].subreg_def = subreg_def;
+
+ /* Break the relation on a narrowing fill.
+ * coerce_reg_to_size will adjust the boundaries.
+ */
+ if (get_reg_width(reg) > size * BITS_PER_BYTE)
+ state->regs[dst_regno].id = 0;
} else {
int spill_cnt = 0, zero_cnt = 0;
@@ -4839,7 +5294,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
insn_flags = 0; /* we are not restoring spilled register */
}
if (insn_flags)
- return push_jmp_history(env, env->cur_state, insn_flags);
+ return push_insn_history(env, env->cur_state, insn_flags, 0);
return 0;
}
@@ -4851,7 +5306,7 @@ enum bpf_access_src {
static int check_stack_range_initialized(struct bpf_verifier_env *env,
int regno, int off, int access_size,
bool zero_size_allowed,
- enum bpf_access_src type,
+ enum bpf_access_type type,
struct bpf_call_arg_meta *meta);
static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
@@ -4884,13 +5339,14 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env,
/* Note that we pass a NULL meta, so raw access will not be permitted.
*/
err = check_stack_range_initialized(env, ptr_regno, off, size,
- false, ACCESS_DIRECT, NULL);
+ false, BPF_READ, NULL);
if (err)
return err;
min_off = reg->smin_value + off;
max_off = reg->smax_value + off;
mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno);
+ check_fastcall_stack_contract(env, ptr_state, env->insn_idx, min_off);
return 0;
}
@@ -5211,19 +5667,27 @@ bad_type:
return -EINVAL;
}
+static bool in_sleepable(struct bpf_verifier_env *env)
+{
+ return env->prog->sleepable ||
+ (env->cur_state && env->cur_state->in_sleepable);
+}
+
/* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock()
* can dereference RCU protected pointers and result is PTR_TRUSTED.
*/
static bool in_rcu_cs(struct bpf_verifier_env *env)
{
return env->cur_state->active_rcu_lock ||
- env->cur_state->active_lock.ptr ||
- !env->prog->aux->sleepable;
+ env->cur_state->active_locks ||
+ !in_sleepable(env);
}
/* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */
BTF_SET_START(rcu_protected_types)
+#ifdef CONFIG_NET
BTF_ID(struct, prog_test_ref_kfunc)
+#endif
#ifdef CONFIG_CGROUPS
BTF_ID(struct, cgroup)
#endif
@@ -5231,6 +5695,9 @@ BTF_ID(struct, cgroup)
BTF_ID(struct, bpf_cpumask)
#endif
BTF_ID(struct, task_struct)
+#ifdef CONFIG_CRYPTO
+BTF_ID(struct, bpf_crypto_ctx)
+#endif
BTF_SET_END(rcu_protected_types)
static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
@@ -5284,6 +5751,22 @@ static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr
return ret;
}
+static int mark_uptr_ld_reg(struct bpf_verifier_env *env, u32 regno,
+ struct btf_field *field)
+{
+ struct bpf_reg_state *reg;
+ const struct btf_type *t;
+
+ t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id);
+ mark_reg_known_zero(env, cur_regs(env), regno);
+ reg = reg_state(env, regno);
+ reg->type = PTR_TO_MEM | PTR_MAYBE_NULL;
+ reg->mem_size = t->size;
+ reg->id = ++env->id_gen;
+
+ return 0;
+}
+
static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
int value_regno, int insn_idx,
struct btf_field *kptr_field)
@@ -5312,16 +5795,20 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
verbose(env, "store to referenced kptr disallowed\n");
return -EACCES;
}
+ if (class != BPF_LDX && kptr_field->type == BPF_UPTR) {
+ verbose(env, "store to uptr disallowed\n");
+ return -EACCES;
+ }
if (class == BPF_LDX) {
- val_reg = reg_state(env, value_regno);
+ if (kptr_field->type == BPF_UPTR)
+ return mark_uptr_ld_reg(env, value_regno, kptr_field);
+
/* We can simply mark the value_regno receiving the pointer
* value from map as PTR_TO_BTF_ID, with the correct type.
*/
mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf,
kptr_field->kptr.btf_id, btf_ld_kptr_type(env, kptr_field));
- /* For mark_ptr_or_null_reg */
- val_reg->id = ++env->id_gen;
} else if (class == BPF_STX) {
val_reg = reg_state(env, value_regno);
if (!register_is_null(val_reg) &&
@@ -5368,27 +5855,32 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
* this program. To check that [x1, x2) overlaps with [y1, y2),
* it is sufficient to check x1 < y2 && y1 < x2.
*/
- if (reg->smin_value + off < p + btf_field_type_size(field->type) &&
+ if (reg->smin_value + off < p + field->size &&
p < reg->umax_value + off + size) {
switch (field->type) {
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
if (src != ACCESS_DIRECT) {
- verbose(env, "kptr cannot be accessed indirectly by helper\n");
+ verbose(env, "%s cannot be accessed indirectly by helper\n",
+ btf_field_type_name(field->type));
return -EACCES;
}
if (!tnum_is_const(reg->var_off)) {
- verbose(env, "kptr access cannot have variable offset\n");
+ verbose(env, "%s access cannot have variable offset\n",
+ btf_field_type_name(field->type));
return -EACCES;
}
if (p != off + reg->var_off.value) {
- verbose(env, "kptr access misaligned expected=%u off=%llu\n",
+ verbose(env, "%s access misaligned expected=%u off=%llu\n",
+ btf_field_type_name(field->type),
p, off + reg->var_off.value);
return -EACCES;
}
if (size != bpf_size_to_bytes(BPF_DW)) {
- verbose(env, "kptr access size must be BPF_DW\n");
+ verbose(env, "%s access size must be BPF_DW\n",
+ btf_field_type_name(field->type));
return -EACCES;
}
break;
@@ -5491,11 +5983,13 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
/* check access to 'struct bpf_context' fields. Supports fixed offsets only */
static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
enum bpf_access_type t, enum bpf_reg_type *reg_type,
- struct btf **btf, u32 *btf_id)
+ struct btf **btf, u32 *btf_id, bool *is_retval, bool is_ldsx)
{
struct bpf_insn_access_aux info = {
.reg_type = *reg_type,
.log = &env->log,
+ .is_retval = false,
+ .is_ldsx = is_ldsx,
};
if (env->ops->is_valid_access &&
@@ -5508,6 +6002,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
* type of narrower access.
*/
*reg_type = info.reg_type;
+ *is_retval = info.is_retval;
if (base_type(*reg_type) == PTR_TO_BTF_ID) {
*btf = info.btf;
@@ -5616,6 +6111,13 @@ static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno)
return reg->type == PTR_TO_FLOW_KEYS;
}
+static bool is_arena_reg(struct bpf_verifier_env *env, int regno)
+{
+ const struct bpf_reg_state *reg = reg_state(env, regno);
+
+ return reg->type == PTR_TO_ARENA;
+}
+
static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
#ifdef CONFIG_NET
[PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
@@ -5632,7 +6134,8 @@ static bool is_trusted_reg(const struct bpf_reg_state *reg)
return true;
/* Types listed in the reg2btf_ids are always trusted */
- if (reg2btf_ids[base_type(reg->type)])
+ if (reg2btf_ids[base_type(reg->type)] &&
+ !bpf_type_has_unsafe_modifiers(reg->type))
return true;
/* If a register is not referenced, it is trusted if it has the
@@ -5763,6 +6266,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
case PTR_TO_XDP_SOCK:
pointer_desc = "xdp_sock ";
break;
+ case PTR_TO_ARENA:
+ return 0;
default:
break;
}
@@ -5770,23 +6275,65 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
strict);
}
+static enum priv_stack_mode bpf_enable_priv_stack(struct bpf_prog *prog)
+{
+ if (!bpf_jit_supports_private_stack())
+ return NO_PRIV_STACK;
+
+ /* bpf_prog_check_recur() checks all prog types that use bpf trampoline
+ * while kprobe/tp/perf_event/raw_tp don't use trampoline hence checked
+ * explicitly.
+ */
+ switch (prog->type) {
+ case BPF_PROG_TYPE_KPROBE:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ return PRIV_STACK_ADAPTIVE;
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_LSM:
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ if (prog->aux->priv_stack_requested || bpf_prog_check_recur(prog))
+ return PRIV_STACK_ADAPTIVE;
+ fallthrough;
+ default:
+ break;
+ }
+
+ return NO_PRIV_STACK;
+}
+
+static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
+{
+ if (env->prog->jit_requested)
+ return round_up(stack_depth, 16);
+
+ /* round up to 32-bytes, since this is granularity
+ * of interpreter stack size
+ */
+ return round_up(max_t(u32, stack_depth, 1), 32);
+}
+
/* starting from main bpf function walk all instructions of the function
* and recursively walk all callees that given function can call.
* Ignore jump and exit insns.
* Since recursion is prevented by check_cfg() this algorithm
* only needs a local stack of MAX_CALL_FRAMES to remember callsites
*/
-static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
+static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
+ bool priv_stack_supported)
{
struct bpf_subprog_info *subprog = env->subprog_info;
struct bpf_insn *insn = env->prog->insnsi;
- int depth = 0, frame = 0, i, subprog_end;
+ int depth = 0, frame = 0, i, subprog_end, subprog_depth;
bool tail_call_reachable = false;
int ret_insn[MAX_CALL_FRAMES];
int ret_prog[MAX_CALL_FRAMES];
int j;
i = subprog[idx].start;
+ if (!priv_stack_supported)
+ subprog[idx].priv_stack_mode = NO_PRIV_STACK;
process_func:
/* protect against potential stack overflow that might happen when
* bpf2bpf calls get combined with tailcalls. Limit the caller's stack
@@ -5813,14 +6360,31 @@ process_func:
depth);
return -EACCES;
}
- /* round up to 32-bytes, since this is granularity
- * of interpreter stack size
- */
- depth += round_up(max_t(u32, subprog[idx].stack_depth, 1), 32);
- if (depth > MAX_BPF_STACK) {
- verbose(env, "combined stack size of %d calls is %d. Too large\n",
- frame + 1, depth);
- return -EACCES;
+
+ subprog_depth = round_up_stack_depth(env, subprog[idx].stack_depth);
+ if (priv_stack_supported) {
+ /* Request private stack support only if the subprog stack
+ * depth is no less than BPF_PRIV_STACK_MIN_SIZE. This is to
+ * avoid jit penalty if the stack usage is small.
+ */
+ if (subprog[idx].priv_stack_mode == PRIV_STACK_UNKNOWN &&
+ subprog_depth >= BPF_PRIV_STACK_MIN_SIZE)
+ subprog[idx].priv_stack_mode = PRIV_STACK_ADAPTIVE;
+ }
+
+ if (subprog[idx].priv_stack_mode == PRIV_STACK_ADAPTIVE) {
+ if (subprog_depth > MAX_BPF_STACK) {
+ verbose(env, "stack size of subprog %d is %d. Too large\n",
+ idx, subprog_depth);
+ return -EACCES;
+ }
+ } else {
+ depth += subprog_depth;
+ if (depth > MAX_BPF_STACK) {
+ verbose(env, "combined stack size of %d calls is %d. Too large\n",
+ frame + 1, depth);
+ return -EACCES;
+ }
}
continue_func:
subprog_end = subprog[idx + 1].start;
@@ -5877,6 +6441,8 @@ continue_func:
}
i = next_insn;
idx = sidx;
+ if (!priv_stack_supported)
+ subprog[idx].priv_stack_mode = NO_PRIV_STACK;
if (subprog[idx].has_tail_call)
tail_call_reachable = true;
@@ -5910,7 +6476,8 @@ continue_func:
*/
if (frame == 0)
return 0;
- depth -= round_up(max_t(u32, subprog[idx].stack_depth, 1), 32);
+ if (subprog[idx].priv_stack_mode != PRIV_STACK_ADAPTIVE)
+ depth -= round_up_stack_depth(env, subprog[idx].stack_depth);
frame--;
i = ret_insn[frame];
idx = ret_prog[frame];
@@ -5919,17 +6486,45 @@ continue_func:
static int check_max_stack_depth(struct bpf_verifier_env *env)
{
+ enum priv_stack_mode priv_stack_mode = PRIV_STACK_UNKNOWN;
struct bpf_subprog_info *si = env->subprog_info;
+ bool priv_stack_supported;
int ret;
for (int i = 0; i < env->subprog_cnt; i++) {
+ if (si[i].has_tail_call) {
+ priv_stack_mode = NO_PRIV_STACK;
+ break;
+ }
+ }
+
+ if (priv_stack_mode == PRIV_STACK_UNKNOWN)
+ priv_stack_mode = bpf_enable_priv_stack(env->prog);
+
+ /* All async_cb subprogs use normal kernel stack. If a particular
+ * subprog appears in both main prog and async_cb subtree, that
+ * subprog will use normal kernel stack to avoid potential nesting.
+ * The reverse subprog traversal ensures when main prog subtree is
+ * checked, the subprogs appearing in async_cb subtrees are already
+ * marked as using normal kernel stack, so stack size checking can
+ * be done properly.
+ */
+ for (int i = env->subprog_cnt - 1; i >= 0; i--) {
if (!i || si[i].is_async_cb) {
- ret = check_max_stack_depth_subprog(env, i);
+ priv_stack_supported = !i && priv_stack_mode == PRIV_STACK_ADAPTIVE;
+ ret = check_max_stack_depth_subprog(env, i, priv_stack_supported);
if (ret < 0)
return ret;
}
- continue;
}
+
+ for (int i = 0; i < env->subprog_cnt; i++) {
+ if (si[i].priv_stack_mode == PRIV_STACK_ADAPTIVE) {
+ env->prog->aux->jits_use_priv_stack = true;
+ break;
+ }
+ }
+
return 0;
}
@@ -6041,10 +6636,10 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
* values are also truncated so we push 64-bit bounds into
* 32-bit bounds. Above were truncated < 32-bits already.
*/
- if (size < 4) {
+ if (size < 4)
__mark_reg32_unbounded(reg);
- reg_bounds_sync(reg);
- }
+
+ reg_bounds_sync(reg);
}
static void set_sext64_default_val(struct bpf_reg_state *reg, int size)
@@ -6113,10 +6708,10 @@ static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size)
/* both of s64_max/s64_min positive or negative */
if ((s64_max >= 0) == (s64_min >= 0)) {
- reg->smin_value = reg->s32_min_value = s64_min;
- reg->smax_value = reg->s32_max_value = s64_max;
- reg->umin_value = reg->u32_min_value = s64_min;
- reg->umax_value = reg->u32_max_value = s64_max;
+ reg->s32_min_value = reg->smin_value = s64_min;
+ reg->s32_max_value = reg->smax_value = s64_max;
+ reg->u32_min_value = reg->umin_value = s64_min;
+ reg->u32_max_value = reg->umax_value = s64_max;
reg->var_off = tnum_range(s64_min, s64_max);
return;
}
@@ -6137,6 +6732,7 @@ static void set_sext32_default_val(struct bpf_reg_state *reg, int size)
}
reg->u32_min_value = 0;
reg->u32_max_value = U32_MAX;
+ reg->var_off = tnum_subreg(tnum_unknown);
}
static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size)
@@ -6181,6 +6777,7 @@ static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size)
reg->s32_max_value = s32_max;
reg->u32_min_value = (u32)s32_min;
reg->u32_max_value = (u32)s32_max;
+ reg->var_off = tnum_subreg(tnum_range(s32_min, s32_max));
return;
}
@@ -6242,6 +6839,7 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val,
#define BTF_TYPE_SAFE_RCU(__type) __PASTE(__type, __safe_rcu)
#define BTF_TYPE_SAFE_RCU_OR_NULL(__type) __PASTE(__type, __safe_rcu_or_null)
#define BTF_TYPE_SAFE_TRUSTED(__type) __PASTE(__type, __safe_trusted)
+#define BTF_TYPE_SAFE_TRUSTED_OR_NULL(__type) __PASTE(__type, __safe_trusted_or_null)
/*
* Allow list few fields as RCU trusted or full trusted.
@@ -6305,7 +6903,7 @@ BTF_TYPE_SAFE_TRUSTED(struct dentry) {
struct inode *d_inode;
};
-BTF_TYPE_SAFE_TRUSTED(struct socket) {
+BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket) {
struct sock *sk;
};
@@ -6340,11 +6938,20 @@ static bool type_is_trusted(struct bpf_verifier_env *env,
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm));
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file));
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry));
- BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket));
return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted");
}
+static bool type_is_trusted_or_null(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id)
+{
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket));
+
+ return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id,
+ "__safe_trusted_or_null");
+}
+
static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
struct bpf_reg_state *regs,
int regno, int off, int size,
@@ -6453,6 +7060,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
*/
if (type_is_trusted(env, reg, field_name, btf_id)) {
flag |= PTR_TRUSTED;
+ } else if (type_is_trusted_or_null(env, reg, field_name, btf_id)) {
+ flag |= PTR_TRUSTED | PTR_MAYBE_NULL;
} else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) {
if (type_is_rcu(env, reg, field_name, btf_id)) {
/* ignore __rcu tag and mark it MEM_RCU */
@@ -6584,7 +7193,7 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env,
static int check_stack_access_within_bounds(
struct bpf_verifier_env *env,
int regno, int off, int access_size,
- enum bpf_access_src src, enum bpf_access_type type)
+ enum bpf_access_type type)
{
struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = regs + regno;
@@ -6593,10 +7202,7 @@ static int check_stack_access_within_bounds(
int err;
char *err_extra;
- if (src == ACCESS_HELPER)
- /* We don't know if helpers are reading or writing (or both). */
- err_extra = " indirect access to";
- else if (type == BPF_READ)
+ if (type == BPF_READ)
err_extra = " read from";
else
err_extra = " write to";
@@ -6618,6 +7224,11 @@ static int check_stack_access_within_bounds(
err = check_stack_slot_within_bounds(env, min_off, state, type);
if (!err && max_off > 0)
err = -EINVAL; /* out of stack access into non-negative offsets */
+ if (!err && access_size < 0)
+ /* access_size should not be negative (or overflow an int); others checks
+ * along the way should have prevented such an access.
+ */
+ err = -EFAULT; /* invalid negative access size; integer overflow? */
if (err) {
if (tnum_is_const(reg->var_off)) {
@@ -6639,6 +7250,17 @@ static int check_stack_access_within_bounds(
return grow_stack_state(env, state, -min_off /* size */);
}
+static bool get_func_retval_range(struct bpf_prog *prog,
+ struct bpf_retval_range *range)
+{
+ if (prog->type == BPF_PROG_TYPE_LSM &&
+ prog->expected_attach_type == BPF_LSM_MAC &&
+ !bpf_lsm_get_retval_range(prog, range)) {
+ return true;
+ }
+ return false;
+}
+
/* check whether memory at (regno + off) is accessible for t = (read | write)
* if t==write, value_regno is a register which value is stored into memory
* if t==read, value_regno is a register which will receive the value from memory
@@ -6693,7 +7315,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
return err;
if (tnum_is_const(reg->var_off))
kptr_field = btf_record_find(reg->map_ptr->record,
- off + reg->var_off.value, BPF_KPTR);
+ off + reg->var_off.value, BPF_KPTR | BPF_UPTR);
if (kptr_field) {
err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field);
} else if (t == BPF_READ && value_regno >= 0) {
@@ -6743,6 +7365,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem))
mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
+ bool is_retval = false;
+ struct bpf_retval_range range;
enum bpf_reg_type reg_type = SCALAR_VALUE;
struct btf *btf = NULL;
u32 btf_id = 0;
@@ -6758,7 +7382,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
return err;
err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf,
- &btf_id);
+ &btf_id, &is_retval, is_ldsx);
if (err)
verbose_linfo(env, insn_idx, "; ");
if (!err && t == BPF_READ && value_regno >= 0) {
@@ -6767,7 +7391,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
* case, we know the offset is zero.
*/
if (reg_type == SCALAR_VALUE) {
- mark_reg_unknown(env, regs, value_regno);
+ if (is_retval && get_func_retval_range(env->prog, &range)) {
+ err = __mark_reg_s32_range(env, regs, value_regno,
+ range.minval, range.maxval);
+ if (err)
+ return err;
+ } else {
+ mark_reg_unknown(env, regs, value_regno);
+ }
} else {
mark_reg_known_zero(env, regs,
value_regno);
@@ -6789,7 +7420,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
} else if (reg->type == PTR_TO_STACK) {
/* Basic bounds checks. */
- err = check_stack_access_within_bounds(env, regno, off, size, ACCESS_DIRECT, t);
+ err = check_stack_access_within_bounds(env, regno, off, size, t);
if (err)
return err;
@@ -6864,6 +7495,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ))
mark_reg_unknown(env, regs, value_regno);
+ } else if (reg->type == PTR_TO_ARENA) {
+ if (t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown(env, regs, value_regno);
} else {
verbose(env, "R%d invalid mem access '%s'\n", regno,
reg_type_str(env, reg->type));
@@ -6881,6 +7515,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
return err;
}
+static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type,
+ bool allow_trust_mismatch);
+
static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
{
int load_reg;
@@ -6940,7 +7577,8 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
if (is_ctx_reg(env, insn->dst_reg) ||
is_pkt_reg(env, insn->dst_reg) ||
is_flow_key_reg(env, insn->dst_reg) ||
- is_sk_reg(env, insn->dst_reg)) {
+ is_sk_reg(env, insn->dst_reg) ||
+ (is_arena_reg(env, insn->dst_reg) && !bpf_jit_supports_insn(insn, true))) {
verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
insn->dst_reg,
reg_type_str(env, reg_state(env, insn->dst_reg)->type));
@@ -6976,6 +7614,11 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
if (err)
return err;
+ if (is_arena_reg(env, insn->dst_reg)) {
+ err = save_aux_ptr_type(env, PTR_TO_ARENA, false);
+ if (err)
+ return err;
+ }
/* Check whether we can write into the same memory. */
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE, -1, true, false);
@@ -6997,13 +7640,11 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
static int check_stack_range_initialized(
struct bpf_verifier_env *env, int regno, int off,
int access_size, bool zero_size_allowed,
- enum bpf_access_src type, struct bpf_call_arg_meta *meta)
+ enum bpf_access_type type, struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *reg = reg_state(env, regno);
struct bpf_func_state *state = func(env, reg);
int err, min_off, max_off, i, j, slot, spi;
- char *err_extra = type == ACCESS_HELPER ? " indirect" : "";
- enum bpf_access_type bounds_check_type;
/* Some accesses can write anything into the stack, others are
* read-only.
*/
@@ -7014,18 +7655,10 @@ static int check_stack_range_initialized(
return -EACCES;
}
- if (type == ACCESS_HELPER) {
- /* The bounds checks for writes are more permissive than for
- * reads. However, if raw_mode is not set, we'll do extra
- * checks below.
- */
- bounds_check_type = BPF_WRITE;
+ if (type == BPF_WRITE)
clobber = true;
- } else {
- bounds_check_type = BPF_READ;
- }
- err = check_stack_access_within_bounds(env, regno, off, access_size,
- type, bounds_check_type);
+
+ err = check_stack_access_within_bounds(env, regno, off, access_size, type);
if (err)
return err;
@@ -7042,8 +7675,8 @@ static int check_stack_range_initialized(
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "R%d%s variable offset stack access prohibited for !root, var_off=%s\n",
- regno, err_extra, tn_buf);
+ verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n",
+ regno, tn_buf);
return -EACCES;
}
/* Only initialized buffer on stack is allowed to be accessed
@@ -7096,7 +7729,7 @@ static int check_stack_range_initialized(
slot = -i - 1;
spi = slot / BPF_REG_SIZE;
if (state->allocated_stack <= slot) {
- verbose(env, "verifier bug: allocated_stack too small");
+ verbose(env, "verifier bug: allocated_stack too small\n");
return -EFAULT;
}
@@ -7124,14 +7757,14 @@ static int check_stack_range_initialized(
}
if (tnum_is_const(reg->var_off)) {
- verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n",
- err_extra, regno, min_off, i - min_off, access_size);
+ verbose(env, "invalid read from stack R%d off %d+%d size %d\n",
+ regno, min_off, i - min_off, access_size);
} else {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "invalid%s read from stack R%d var_off %s+%d size %d\n",
- err_extra, regno, tn_buf, i - min_off, access_size);
+ verbose(env, "invalid read from stack R%d var_off %s+%d size %d\n",
+ regno, tn_buf, i - min_off, access_size);
}
return -EACCES;
mark:
@@ -7151,7 +7784,8 @@ mark:
}
static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
- int access_size, bool zero_size_allowed,
+ int access_size, enum bpf_access_type access_type,
+ bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
@@ -7163,7 +7797,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
return check_packet_access(env, regno, reg->off, access_size,
zero_size_allowed);
case PTR_TO_MAP_KEY:
- if (meta && meta->raw_mode) {
+ if (access_type == BPF_WRITE) {
verbose(env, "R%d cannot write into %s\n", regno,
reg_type_str(env, reg->type));
return -EACCES;
@@ -7171,15 +7805,13 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
return check_mem_region_access(env, regno, reg->off, access_size,
reg->map_ptr->key_size, false);
case PTR_TO_MAP_VALUE:
- if (check_map_access_type(env, regno, reg->off, access_size,
- meta && meta->raw_mode ? BPF_WRITE :
- BPF_READ))
+ if (check_map_access_type(env, regno, reg->off, access_size, access_type))
return -EACCES;
return check_map_access(env, regno, reg->off, access_size,
zero_size_allowed, ACCESS_HELPER);
case PTR_TO_MEM:
if (type_is_rdonly_mem(reg->type)) {
- if (meta && meta->raw_mode) {
+ if (access_type == BPF_WRITE) {
verbose(env, "R%d cannot write into %s\n", regno,
reg_type_str(env, reg->type));
return -EACCES;
@@ -7190,7 +7822,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
zero_size_allowed);
case PTR_TO_BUF:
if (type_is_rdonly_mem(reg->type)) {
- if (meta && meta->raw_mode) {
+ if (access_type == BPF_WRITE) {
verbose(env, "R%d cannot write into %s\n", regno,
reg_type_str(env, reg->type));
return -EACCES;
@@ -7207,7 +7839,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
return check_stack_range_initialized(
env,
regno, reg->off, access_size,
- zero_size_allowed, ACCESS_HELPER, meta);
+ zero_size_allowed, access_type, meta);
case PTR_TO_BTF_ID:
return check_ptr_to_btf_access(env, regs, regno, reg->off,
access_size, BPF_READ, -1);
@@ -7218,7 +7850,6 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
* Dynamically check it now.
*/
if (!env->ops->convert_ctx_access) {
- enum bpf_access_type atype = meta && meta->raw_mode ? BPF_WRITE : BPF_READ;
int offset = access_size - 1;
/* Allow zero-byte read from PTR_TO_CTX */
@@ -7226,7 +7857,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
return zero_size_allowed ? 0 : -EACCES;
return check_mem_access(env, env->insn_idx, regno, offset, BPF_B,
- atype, -1, false, false);
+ access_type, -1, false, false);
}
fallthrough;
@@ -7251,6 +7882,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
*/
static int check_mem_size_reg(struct bpf_verifier_env *env,
struct bpf_reg_state *reg, u32 regno,
+ enum bpf_access_type access_type,
bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
{
@@ -7266,15 +7898,12 @@ static int check_mem_size_reg(struct bpf_verifier_env *env,
*/
meta->msize_max_value = reg->umax_value;
- /* The register is SCALAR_VALUE; the access check
- * happens using its boundaries.
+ /* The register is SCALAR_VALUE; the access check happens using
+ * its boundaries. For unprivileged variable accesses, disable
+ * raw mode so that the program is required to initialize all
+ * the memory that the helper could just partially fill up.
*/
if (!tnum_is_const(reg->var_off))
- /* For unprivileged variable accesses, disable raw
- * mode so that the program is required to
- * initialize all the memory that the helper could
- * just partially fill up.
- */
meta = NULL;
if (reg->smin_value < 0) {
@@ -7294,9 +7923,8 @@ static int check_mem_size_reg(struct bpf_verifier_env *env,
regno);
return -EACCES;
}
- err = check_helper_mem_access(env, regno - 1,
- reg->umax_value,
- zero_size_allowed, meta);
+ err = check_helper_mem_access(env, regno - 1, reg->umax_value,
+ access_type, zero_size_allowed, meta);
if (!err)
err = mark_chain_precision(env, regno);
return err;
@@ -7307,13 +7935,11 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg
{
bool may_be_null = type_may_be_null(reg->type);
struct bpf_reg_state saved_reg;
- struct bpf_call_arg_meta meta;
int err;
if (register_is_null(reg))
return 0;
- memset(&meta, 0, sizeof(meta));
/* Assuming that the register contains a value check if the memory
* access is safe. Temporarily save and restore the register's state as
* the conversion shouldn't be visible to a caller.
@@ -7323,10 +7949,8 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg
mark_ptr_not_null_reg(reg);
}
- err = check_helper_mem_access(env, regno, mem_size, true, &meta);
- /* Check access for BPF_WRITE */
- meta.raw_mode = true;
- err = err ?: check_helper_mem_access(env, regno, mem_size, true, &meta);
+ err = check_helper_mem_access(env, regno, mem_size, BPF_READ, true, NULL);
+ err = err ?: check_helper_mem_access(env, regno, mem_size, BPF_WRITE, true, NULL);
if (may_be_null)
*reg = saved_reg;
@@ -7352,13 +7976,12 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg
mark_ptr_not_null_reg(mem_reg);
}
- err = check_mem_size_reg(env, reg, regno, true, &meta);
- /* Check access for BPF_WRITE */
- meta.raw_mode = true;
- err = err ?: check_mem_size_reg(env, reg, regno, true, &meta);
+ err = check_mem_size_reg(env, reg, regno, BPF_READ, true, &meta);
+ err = err ?: check_mem_size_reg(env, reg, regno, BPF_WRITE, true, &meta);
if (may_be_null)
*mem_reg = saved_reg;
+
return err;
}
@@ -7381,7 +8004,7 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg
* Since only one bpf_spin_lock is allowed the checks are simpler than
* reg_is_refcounted() logic. The verifier needs to remember only
* one spin_lock instead of array of acquired_refs.
- * cur_state->active_lock remembers which map value element or allocated
+ * env->cur_state->active_locks remembers which map value element or allocated
* object got locked and clears it after bpf_spin_unlock.
*/
static int process_spin_lock(struct bpf_verifier_env *env, int regno,
@@ -7394,6 +8017,7 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
struct bpf_map *map = NULL;
struct btf *btf = NULL;
struct btf_record *rec;
+ int err;
if (!is_const) {
verbose(env,
@@ -7425,16 +8049,23 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
return -EINVAL;
}
if (is_lock) {
- if (cur->active_lock.ptr) {
+ void *ptr;
+
+ if (map)
+ ptr = map;
+ else
+ ptr = btf;
+
+ if (cur->active_locks) {
verbose(env,
"Locking two bpf_spin_locks are not allowed\n");
return -EINVAL;
}
- if (map)
- cur->active_lock.ptr = map;
- else
- cur->active_lock.ptr = btf;
- cur->active_lock.id = reg->id;
+ err = acquire_lock_state(env, env->insn_idx, REF_TYPE_LOCK, reg->id, ptr);
+ if (err < 0) {
+ verbose(env, "Failed to acquire lock state\n");
+ return err;
+ }
} else {
void *ptr;
@@ -7443,20 +8074,17 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
else
ptr = btf;
- if (!cur->active_lock.ptr) {
+ if (!cur->active_locks) {
verbose(env, "bpf_spin_unlock without taking a lock\n");
return -EINVAL;
}
- if (cur->active_lock.ptr != ptr ||
- cur->active_lock.id != reg->id) {
+
+ if (release_lock_state(env->cur_state, REF_TYPE_LOCK, reg->id, ptr)) {
verbose(env, "bpf_spin_unlock of different lock\n");
return -EINVAL;
}
invalidate_non_owning_refs(env);
-
- cur->active_lock.ptr = NULL;
- cur->active_lock.id = 0;
}
return 0;
}
@@ -7498,33 +8126,59 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
return 0;
}
+static int process_wq_func(struct bpf_verifier_env *env, int regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_map *map = reg->map_ptr;
+ u64 val = reg->var_off.value;
+
+ if (map->record->wq_off != val + reg->off) {
+ verbose(env, "off %lld doesn't point to 'struct bpf_wq' that is at %d\n",
+ val + reg->off, map->record->wq_off);
+ return -EINVAL;
+ }
+ meta->map.uid = reg->map_uid;
+ meta->map.ptr = map;
+ return 0;
+}
+
static int process_kptr_func(struct bpf_verifier_env *env, int regno,
struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
- struct bpf_map *map_ptr = reg->map_ptr;
struct btf_field *kptr_field;
+ struct bpf_map *map_ptr;
+ struct btf_record *rec;
u32 kptr_off;
+ if (type_is_ptr_alloc_obj(reg->type)) {
+ rec = reg_btf_record(reg);
+ } else { /* PTR_TO_MAP_VALUE */
+ map_ptr = reg->map_ptr;
+ if (!map_ptr->btf) {
+ verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n",
+ map_ptr->name);
+ return -EINVAL;
+ }
+ rec = map_ptr->record;
+ meta->map_ptr = map_ptr;
+ }
+
if (!tnum_is_const(reg->var_off)) {
verbose(env,
"R%d doesn't have constant offset. kptr has to be at the constant offset\n",
regno);
return -EINVAL;
}
- if (!map_ptr->btf) {
- verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n",
- map_ptr->name);
- return -EINVAL;
- }
- if (!btf_record_has_field(map_ptr->record, BPF_KPTR)) {
- verbose(env, "map '%s' has no valid kptr\n", map_ptr->name);
+
+ if (!btf_record_has_field(rec, BPF_KPTR)) {
+ verbose(env, "R%d has no valid kptr\n", regno);
return -EINVAL;
}
- meta->map_ptr = map_ptr;
kptr_off = reg->off + reg->var_off.value;
- kptr_field = btf_record_find(map_ptr->record, kptr_off, BPF_KPTR);
+ kptr_field = btf_record_find(rec, kptr_off, BPF_KPTR);
if (!kptr_field) {
verbose(env, "off=%d doesn't point to kptr\n", kptr_off);
return -EACCES;
@@ -7568,6 +8222,13 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
int err;
+ if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) {
+ verbose(env,
+ "arg#%d expected pointer to stack or const struct bpf_dynptr\n",
+ regno - 1);
+ return -EINVAL;
+ }
+
/* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an
* ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*):
*/
@@ -7618,7 +8279,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
if (!is_dynptr_reg_valid_init(env, reg)) {
verbose(env,
"Expected an initialized dynptr as arg #%d\n",
- regno);
+ regno - 1);
return -EINVAL;
}
@@ -7626,7 +8287,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) {
verbose(env,
"Expected a dynptr of type %s as arg #%d\n",
- dynptr_type_str(arg_to_dynptr_type(arg_type)), regno);
+ dynptr_type_str(arg_to_dynptr_type(arg_type)), regno - 1);
return -EINVAL;
}
@@ -7662,12 +8323,17 @@ static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta)
return meta->kfunc_flags & KF_ITER_DESTROY;
}
-static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg)
+static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg_idx,
+ const struct btf_param *arg)
{
/* btf_check_iter_kfuncs() guarantees that first argument of any iter
* kfunc is iter state pointer
*/
- return arg == 0 && is_iter_kfunc(meta);
+ if (is_iter_kfunc(meta))
+ return arg_idx == 0;
+
+ /* iter passed as an argument to a generic kfunc */
+ return btf_param_match_suffix(meta->btf, arg, "__iter");
}
static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx,
@@ -7675,21 +8341,32 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
const struct btf_type *t;
- const struct btf_param *arg;
- int spi, err, i, nr_slots;
- u32 btf_id;
+ int spi, err, i, nr_slots, btf_id;
- /* btf_check_iter_kfuncs() ensures we don't need to validate anything here */
- arg = &btf_params(meta->func_proto)[0];
- t = btf_type_skip_modifiers(meta->btf, arg->type, NULL); /* PTR */
- t = btf_type_skip_modifiers(meta->btf, t->type, &btf_id); /* STRUCT */
+ if (reg->type != PTR_TO_STACK) {
+ verbose(env, "arg#%d expected pointer to an iterator on stack\n", regno - 1);
+ return -EINVAL;
+ }
+
+ /* For iter_{new,next,destroy} functions, btf_check_iter_kfuncs()
+ * ensures struct convention, so we wouldn't need to do any BTF
+ * validation here. But given iter state can be passed as a parameter
+ * to any kfunc, if arg has "__iter" suffix, we need to be a bit more
+ * conservative here.
+ */
+ btf_id = btf_check_iter_arg(meta->btf, meta->func_proto, regno - 1);
+ if (btf_id < 0) {
+ verbose(env, "expected valid iter pointer as arg #%d\n", regno - 1);
+ return -EINVAL;
+ }
+ t = btf_type_by_id(meta->btf, btf_id);
nr_slots = t->size / BPF_REG_SIZE;
if (is_iter_new_kfunc(meta)) {
/* bpf_iter_<type>_new() expects pointer to uninit iter state */
if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) {
verbose(env, "expected uninitialized iter_%s as arg #%d\n",
- iter_type_str(meta->btf, btf_id), regno);
+ iter_type_str(meta->btf, btf_id), regno - 1);
return -EINVAL;
}
@@ -7704,14 +8381,16 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
if (err)
return err;
} else {
- /* iter_next() or iter_destroy() expect initialized iter state*/
+ /* iter_next() or iter_destroy(), as well as any kfunc
+ * accepting iter argument, expect initialized iter state
+ */
err = is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots);
switch (err) {
case 0:
break;
case -EINVAL:
verbose(env, "expected an initialized iter_%s as arg #%d\n",
- iter_type_str(meta->btf, btf_id), regno);
+ iter_type_str(meta->btf, btf_id), regno - 1);
return err;
case -EPROTO:
verbose(env, "expected an RCU CS when using %s\n", meta->func_name);
@@ -7818,6 +8497,15 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env,
return 0;
}
+static struct bpf_reg_state *get_iter_from_state(struct bpf_verifier_state *cur_st,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ int iter_frameno = meta->iter.frameno;
+ int iter_spi = meta->iter.spi;
+
+ return &cur_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+}
+
/* process_iter_next_call() is called when verifier gets to iterator's next
* "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer
* to it as just "iter_next()" in comments below.
@@ -7902,12 +8590,10 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st;
struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr;
struct bpf_reg_state *cur_iter, *queued_iter;
- int iter_frameno = meta->iter.frameno;
- int iter_spi = meta->iter.spi;
BTF_TYPE_EMIT(struct bpf_iter);
- cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+ cur_iter = get_iter_from_state(cur_st, meta);
if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE &&
cur_iter->iter.state != BPF_ITER_STATE_DRAINED) {
@@ -7935,7 +8621,7 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
if (!queued_st)
return -ENOMEM;
- queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+ queued_iter = get_iter_from_state(queued_st, meta);
queued_iter->iter.state = BPF_ITER_STATE_ACTIVE;
queued_iter->iter.depth++;
if (prev_st)
@@ -7959,6 +8645,12 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type)
type == ARG_CONST_SIZE_OR_ZERO;
}
+static bool arg_type_is_raw_mem(enum bpf_arg_type type)
+{
+ return base_type(type) == ARG_PTR_TO_MEM &&
+ type & MEM_UNINIT;
+}
+
static bool arg_type_is_release(enum bpf_arg_type type)
{
return type & OBJ_RELEASE;
@@ -7969,16 +8661,6 @@ static bool arg_type_is_dynptr(enum bpf_arg_type type)
return base_type(type) == ARG_PTR_TO_DYNPTR;
}
-static int int_ptr_type_to_size(enum bpf_arg_type type)
-{
- if (type == ARG_PTR_TO_INT)
- return sizeof(u32);
- else if (type == ARG_PTR_TO_LONG)
- return sizeof(u64);
-
- return -EINVAL;
-}
-
static int resolve_map_arg_type(struct bpf_verifier_env *env,
const struct bpf_call_arg_meta *meta,
enum bpf_arg_type *arg_type)
@@ -8051,16 +8733,6 @@ static const struct bpf_reg_types mem_types = {
},
};
-static const struct bpf_reg_types int_ptr_types = {
- .types = {
- PTR_TO_STACK,
- PTR_TO_PACKET,
- PTR_TO_PACKET_META,
- PTR_TO_MAP_KEY,
- PTR_TO_MAP_VALUE,
- },
-};
-
static const struct bpf_reg_types spin_lock_types = {
.types = {
PTR_TO_MAP_VALUE,
@@ -8091,7 +8763,12 @@ static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
-static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types kptr_xchg_dest_types = {
+ .types = {
+ PTR_TO_MAP_VALUE,
+ PTR_TO_BTF_ID | MEM_ALLOC
+ }
+};
static const struct bpf_reg_types dynptr_types = {
.types = {
PTR_TO_STACK,
@@ -8116,14 +8793,12 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types,
[ARG_PTR_TO_MEM] = &mem_types,
[ARG_PTR_TO_RINGBUF_MEM] = &ringbuf_mem_types,
- [ARG_PTR_TO_INT] = &int_ptr_types,
- [ARG_PTR_TO_LONG] = &int_ptr_types,
[ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types,
[ARG_PTR_TO_FUNC] = &func_ptr_types,
[ARG_PTR_TO_STACK] = &stack_ptr_types,
[ARG_PTR_TO_CONST_STR] = &const_str_ptr_types,
[ARG_PTR_TO_TIMER] = &timer_types,
- [ARG_PTR_TO_KPTR] = &kptr_types,
+ [ARG_KPTR_XCHG_DEST] = &kptr_xchg_dest_types,
[ARG_PTR_TO_DYNPTR] = &dynptr_types,
};
@@ -8162,7 +8837,8 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
if (base_type(arg_type) == ARG_PTR_TO_MEM)
type &= ~DYNPTR_TYPE_FLAG_MASK;
- if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type)) {
+ /* Local kptr types are allowed as the source argument of bpf_kptr_xchg */
+ if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type) && regno == BPF_REG_2) {
type &= ~MEM_ALLOC;
type &= ~MEM_PERCPU;
}
@@ -8200,6 +8876,7 @@ found:
switch ((int)reg->type) {
case PTR_TO_BTF_ID:
case PTR_TO_BTF_ID | PTR_TRUSTED:
+ case PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL:
case PTR_TO_BTF_ID | MEM_RCU:
case PTR_TO_BTF_ID | PTR_MAYBE_NULL:
case PTR_TO_BTF_ID | PTR_MAYBE_NULL | MEM_RCU:
@@ -8254,7 +8931,8 @@ found:
verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
return -EFAULT;
}
- if (meta->func_id == BPF_FUNC_kptr_xchg) {
+ /* Check if local kptr in src arg matches kptr in dst arg */
+ if (meta->func_id == BPF_FUNC_kptr_xchg && regno == BPF_REG_2) {
if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
return -EACCES;
}
@@ -8334,6 +9012,7 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env,
case PTR_TO_MEM | MEM_RINGBUF:
case PTR_TO_BUF:
case PTR_TO_BUF | MEM_RDONLY:
+ case PTR_TO_ARENA:
case SCALAR_VALUE:
return 0;
/* All the rest must be rejected, except PTR_TO_BTF_ID which allows
@@ -8472,6 +9151,69 @@ static int check_reg_const_str(struct bpf_verifier_env *env,
return 0;
}
+/* Returns constant key value in `value` if possible, else negative error */
+static int get_constant_map_key(struct bpf_verifier_env *env,
+ struct bpf_reg_state *key,
+ u32 key_size,
+ s64 *value)
+{
+ struct bpf_func_state *state = func(env, key);
+ struct bpf_reg_state *reg;
+ int slot, spi, off;
+ int spill_size = 0;
+ int zero_size = 0;
+ int stack_off;
+ int i, err;
+ u8 *stype;
+
+ if (!env->bpf_capable)
+ return -EOPNOTSUPP;
+ if (key->type != PTR_TO_STACK)
+ return -EOPNOTSUPP;
+ if (!tnum_is_const(key->var_off))
+ return -EOPNOTSUPP;
+
+ stack_off = key->off + key->var_off.value;
+ slot = -stack_off - 1;
+ spi = slot / BPF_REG_SIZE;
+ off = slot % BPF_REG_SIZE;
+ stype = state->stack[spi].slot_type;
+
+ /* First handle precisely tracked STACK_ZERO */
+ for (i = off; i >= 0 && stype[i] == STACK_ZERO; i--)
+ zero_size++;
+ if (zero_size >= key_size) {
+ *value = 0;
+ return 0;
+ }
+
+ /* Check that stack contains a scalar spill of expected size */
+ if (!is_spilled_scalar_reg(&state->stack[spi]))
+ return -EOPNOTSUPP;
+ for (i = off; i >= 0 && stype[i] == STACK_SPILL; i--)
+ spill_size++;
+ if (spill_size != key_size)
+ return -EOPNOTSUPP;
+
+ reg = &state->stack[spi].spilled_ptr;
+ if (!tnum_is_const(reg->var_off))
+ /* Stack value not statically known */
+ return -EOPNOTSUPP;
+
+ /* We are relying on a constant value. So mark as precise
+ * to prevent pruning on it.
+ */
+ bt_set_frame_slot(&env->bt, key->frameno, spi);
+ err = mark_chain_precision_batch(env);
+ if (err < 0)
+ return err;
+
+ *value = reg->var_off.value;
+ return 0;
+}
+
+static bool can_elide_value_nullness(enum bpf_map_type type);
+
static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
struct bpf_call_arg_meta *meta,
const struct bpf_func_proto *fn,
@@ -8482,6 +9224,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
enum bpf_arg_type arg_type = fn->arg_type[arg];
enum bpf_reg_type type = reg->type;
u32 *arg_btf_id = NULL;
+ u32 key_size;
int err = 0;
if (arg_type == ARG_DONTCARE)
@@ -8564,7 +9307,7 @@ skip_type_check:
meta->release_regno = regno;
}
- if (reg->ref_obj_id) {
+ if (reg->ref_obj_id && base_type(arg_type) != ARG_KPTR_XCHG_DEST) {
if (meta->ref_obj_id) {
verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
regno, reg->ref_obj_id,
@@ -8615,9 +9358,20 @@ skip_type_check:
verbose(env, "invalid map_ptr to access map->key\n");
return -EACCES;
}
- err = check_helper_mem_access(env, regno,
- meta->map_ptr->key_size, false,
- NULL);
+ key_size = meta->map_ptr->key_size;
+ err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL);
+ if (err)
+ return err;
+ if (can_elide_value_nullness(meta->map_ptr->map_type)) {
+ err = get_constant_map_key(env, reg, key_size, &meta->const_map_key);
+ if (err < 0) {
+ meta->const_map_key = -1;
+ if (err == -EOPNOTSUPP)
+ err = 0;
+ else
+ return err;
+ }
+ }
break;
case ARG_PTR_TO_MAP_VALUE:
if (type_may_be_null(arg_type) && register_is_null(reg))
@@ -8632,9 +9386,9 @@ skip_type_check:
return -EACCES;
}
meta->raw_mode = arg_type & MEM_UNINIT;
- err = check_helper_mem_access(env, regno,
- meta->map_ptr->value_size, false,
- meta);
+ err = check_helper_mem_access(env, regno, meta->map_ptr->value_size,
+ arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ,
+ false, meta);
break;
case ARG_PTR_TO_PERCPU_BTF_ID:
if (!reg->btf_id) {
@@ -8676,16 +9430,26 @@ skip_type_check:
*/
meta->raw_mode = arg_type & MEM_UNINIT;
if (arg_type & MEM_FIXED_SIZE) {
- err = check_helper_mem_access(env, regno,
- fn->arg_size[arg], false,
- meta);
+ err = check_helper_mem_access(env, regno, fn->arg_size[arg],
+ arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ,
+ false, meta);
+ if (err)
+ return err;
+ if (arg_type & MEM_ALIGNED)
+ err = check_ptr_alignment(env, reg, 0, fn->arg_size[arg], true);
}
break;
case ARG_CONST_SIZE:
- err = check_mem_size_reg(env, reg, regno, false, meta);
+ err = check_mem_size_reg(env, reg, regno,
+ fn->arg_type[arg - 1] & MEM_WRITE ?
+ BPF_WRITE : BPF_READ,
+ false, meta);
break;
case ARG_CONST_SIZE_OR_ZERO:
- err = check_mem_size_reg(env, reg, regno, true, meta);
+ err = check_mem_size_reg(env, reg, regno,
+ fn->arg_type[arg - 1] & MEM_WRITE ?
+ BPF_WRITE : BPF_READ,
+ true, meta);
break;
case ARG_PTR_TO_DYNPTR:
err = process_dynptr_func(env, regno, insn_idx, arg_type, 0);
@@ -8703,17 +9467,6 @@ skip_type_check:
if (err)
return err;
break;
- case ARG_PTR_TO_INT:
- case ARG_PTR_TO_LONG:
- {
- int size = int_ptr_type_to_size(arg_type);
-
- err = check_helper_mem_access(env, regno, size, false, meta);
- if (err)
- return err;
- err = check_ptr_alignment(env, reg, 0, size, true);
- break;
- }
case ARG_PTR_TO_CONST_STR:
{
err = check_reg_const_str(env, reg, regno);
@@ -8721,7 +9474,7 @@ skip_type_check:
return err;
break;
}
- case ARG_PTR_TO_KPTR:
+ case ARG_KPTR_XCHG_DEST:
err = process_kptr_func(env, regno, meta);
if (err)
return err;
@@ -8736,7 +9489,8 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id)
enum bpf_attach_type eatype = env->prog->expected_attach_type;
enum bpf_prog_type type = resolve_prog_type(env->prog);
- if (func_id != BPF_FUNC_map_update_elem)
+ if (func_id != BPF_FUNC_map_update_elem &&
+ func_id != BPF_FUNC_map_delete_elem)
return false;
/* It's not possible to get access to a locked struct sock in these
@@ -8747,6 +9501,11 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id)
if (eatype == BPF_TRACE_ITER)
return true;
break;
+ case BPF_PROG_TYPE_SOCK_OPS:
+ /* map_update allowed only via dedicated helpers with event type checks */
+ if (func_id == BPF_FUNC_map_delete_elem)
+ return true;
+ break;
case BPF_PROG_TYPE_SOCKET_FILTER:
case BPF_PROG_TYPE_SCHED_CLS:
case BPF_PROG_TYPE_SCHED_ACT:
@@ -8842,7 +9601,6 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_SOCKMAP:
if (func_id != BPF_FUNC_sk_redirect_map &&
func_id != BPF_FUNC_sock_map_update &&
- func_id != BPF_FUNC_map_delete_elem &&
func_id != BPF_FUNC_msg_redirect_map &&
func_id != BPF_FUNC_sk_select_reuseport &&
func_id != BPF_FUNC_map_lookup_elem &&
@@ -8852,7 +9610,6 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_SOCKHASH:
if (func_id != BPF_FUNC_sk_redirect_hash &&
func_id != BPF_FUNC_sock_hash_update &&
- func_id != BPF_FUNC_map_delete_elem &&
func_id != BPF_FUNC_msg_redirect_hash &&
func_id != BPF_FUNC_sk_select_reuseport &&
func_id != BPF_FUNC_map_lookup_elem &&
@@ -9026,15 +9783,15 @@ static bool check_raw_mode_ok(const struct bpf_func_proto *fn)
{
int count = 0;
- if (fn->arg1_type == ARG_PTR_TO_UNINIT_MEM)
+ if (arg_type_is_raw_mem(fn->arg1_type))
count++;
- if (fn->arg2_type == ARG_PTR_TO_UNINIT_MEM)
+ if (arg_type_is_raw_mem(fn->arg2_type))
count++;
- if (fn->arg3_type == ARG_PTR_TO_UNINIT_MEM)
+ if (arg_type_is_raw_mem(fn->arg3_type))
count++;
- if (fn->arg4_type == ARG_PTR_TO_UNINIT_MEM)
+ if (arg_type_is_raw_mem(fn->arg4_type))
count++;
- if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM)
+ if (arg_type_is_raw_mem(fn->arg5_type))
count++;
/* We only support one arg being in raw mode at the moment,
@@ -9146,21 +9903,38 @@ static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range
reg->range = AT_PKT_END;
}
+static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id)
+{
+ int i;
+
+ for (i = 0; i < state->acquired_refs; i++) {
+ if (state->refs[i].type != REF_TYPE_PTR)
+ continue;
+ if (state->refs[i].id == ref_obj_id) {
+ release_reference_state(state, i);
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
/* The pointer with the specified id has released its reference to kernel
* resources. Identify all copies of the same pointer and clear the reference.
+ *
+ * This is the release function corresponding to acquire_reference(). Idempotent.
*/
-static int release_reference(struct bpf_verifier_env *env,
- int ref_obj_id)
+static int release_reference(struct bpf_verifier_env *env, int ref_obj_id)
{
+ struct bpf_verifier_state *vstate = env->cur_state;
struct bpf_func_state *state;
struct bpf_reg_state *reg;
int err;
- err = release_reference_state(cur_func(env), ref_obj_id);
+ err = release_reference_nomark(vstate, ref_obj_id);
if (err)
return err;
- bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
if (reg->ref_obj_id == ref_obj_id)
mark_reg_invalid(env, reg);
}));
@@ -9234,9 +10008,7 @@ static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int calls
callsite,
state->curframe + 1 /* frameno within this callchain */,
subprog /* subprog number within this prog */);
- /* Transfer references to the callee */
- err = copy_reference_state(callee, caller);
- err = err ?: set_callee_state_cb(env, caller, callee, callsite);
+ err = set_callee_state_cb(env, caller, callee, callsite);
if (err)
goto err_out;
@@ -9298,10 +10070,38 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
bpf_log(log, "arg#%d is expected to be non-NULL\n", i);
return -EINVAL;
}
+ } else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) {
+ /*
+ * Can pass any value and the kernel won't crash, but
+ * only PTR_TO_ARENA or SCALAR make sense. Everything
+ * else is a bug in the bpf program. Point it out to
+ * the user at the verification time instead of
+ * run-time debug nightmare.
+ */
+ if (reg->type != PTR_TO_ARENA && reg->type != SCALAR_VALUE) {
+ bpf_log(log, "R%d is not a pointer to arena or scalar.\n", regno);
+ return -EINVAL;
+ }
} else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
+ ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_DYNPTR);
+ if (ret)
+ return ret;
+
ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0);
if (ret)
return ret;
+ } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) {
+ struct bpf_call_arg_meta meta;
+ int err;
+
+ if (register_is_null(reg) && type_may_be_null(arg->arg_type))
+ continue;
+
+ memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */
+ err = check_reg_type(env, regno, arg->arg_type, &arg->btf_id, &meta);
+ err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type);
+ if (err)
+ return err;
} else {
bpf_log(log, "verifier bug: unrecognized arg#%d type %d\n",
i, arg->arg_type);
@@ -9366,7 +10166,7 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins
*/
env->subprog_info[subprog].is_cb = true;
if (bpf_pseudo_kfunc_call(insn) &&
- !is_sync_callback_calling_kfunc(insn->imm)) {
+ !is_callback_calling_kfunc(insn->imm)) {
verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
func_id_name(insn->imm), insn->imm);
return -EFAULT;
@@ -9377,15 +10177,14 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins
return -EFAULT;
}
- if (insn->code == (BPF_JMP | BPF_CALL) &&
- insn->src_reg == 0 &&
- insn->imm == BPF_FUNC_timer_set_callback) {
+ if (is_async_callback_calling_insn(insn)) {
struct bpf_verifier_state *async_cb;
- /* there is no real recursion here. timer callbacks are async */
+ /* there is no real recursion here. timer and workqueue callbacks are async */
env->subprog_info[subprog].is_async_cb = true;
async_cb = push_async_cb(env, env->subprog_info[subprog].start,
- insn_idx, subprog);
+ insn_idx, subprog,
+ is_bpf_wq_set_callback_impl_kfunc(insn->imm));
if (!async_cb)
return -EFAULT;
callee = async_cb->frame[0];
@@ -9438,6 +10237,26 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (subprog_is_global(env, subprog)) {
const char *sub_name = subprog_name(env, subprog);
+ /* Only global subprogs cannot be called with a lock held. */
+ if (env->cur_state->active_locks) {
+ verbose(env, "global function calls are not allowed while holding a lock,\n"
+ "use static function instead\n");
+ return -EINVAL;
+ }
+
+ /* Only global subprogs cannot be called with preemption disabled. */
+ if (env->cur_state->active_preempt_locks) {
+ verbose(env, "global function calls are not allowed with preemption disabled,\n"
+ "use static function instead\n");
+ return -EINVAL;
+ }
+
+ if (env->cur_state->active_irq_id) {
+ verbose(env, "global function calls are not allowed with IRQs disabled,\n"
+ "use static function instead\n");
+ return -EINVAL;
+ }
+
if (err) {
verbose(env, "Caller passes invalid args into func#%d ('%s')\n",
subprog, sub_name);
@@ -9446,6 +10265,8 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
verbose(env, "Func#%d ('%s') is global and assumed valid.\n",
subprog, sub_name);
+ if (env->subprog_info[subprog].changes_pkt_data)
+ clear_all_pkt_pointers(env);
/* mark global subprog for verifying after main prog */
subprog_aux(env, subprog)->called = true;
clear_caller_saved_regs(env, caller->regs);
@@ -9472,9 +10293,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "caller:\n");
- print_verifier_state(env, caller, true);
+ print_verifier_state(env, state, caller->frameno, true);
verbose(env, "callee:\n");
- print_verifier_state(env, state->frame[state->curframe], true);
+ print_verifier_state(env, state, state->curframe, true);
}
return 0;
@@ -9530,12 +10351,8 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
struct bpf_map *map;
int err;
- if (bpf_map_ptr_poisoned(insn_aux)) {
- verbose(env, "tail_call abusing map_ptr\n");
- return -EINVAL;
- }
-
- map = BPF_MAP_PTR(insn_aux->map_ptr_state);
+ /* valid map_ptr and poison value does not matter */
+ map = insn_aux->map_ptr_state.map_ptr;
if (!map->ops->map_set_for_each_callback_args ||
!map->ops->map_for_each_callback) {
verbose(env, "callback function not allowed for map\n");
@@ -9558,7 +10375,7 @@ static int set_loop_callback_state(struct bpf_verifier_env *env,
{
/* bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx,
* u64 flags);
- * callback_fn(u32 index, void *callback_ctx);
+ * callback_fn(u64 index, void *callback_ctx);
*/
callee->regs[BPF_REG_1].type = SCALAR_VALUE;
callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
@@ -9711,9 +10528,13 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env)
return is_rbtree_lock_required_kfunc(kfunc_btf_id);
}
-static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg)
+static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg,
+ bool return_32bit)
{
- return range.minval <= reg->smin_value && reg->smax_value <= range.maxval;
+ if (return_32bit)
+ return range.minval <= reg->s32_min_value && reg->s32_max_value <= range.maxval;
+ else
+ return range.minval <= reg->smin_value && reg->smax_value <= range.maxval;
}
static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
@@ -9750,8 +10571,8 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
if (err)
return err;
- /* enforce R0 return value range */
- if (!retval_range_within(callee->callback_ret_range, r0)) {
+ /* enforce R0 return value range, and bpf_callback_t returns 64bit */
+ if (!retval_range_within(callee->callback_ret_range, r0, false)) {
verbose_invalid_scalar(env, r0, callee->callback_ret_range,
"At callback return", "R0");
return -EINVAL;
@@ -9766,18 +10587,6 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
caller->regs[BPF_REG_0] = *r0;
}
- /* callback_fn frame should have released its own additions to parent's
- * reference state at this point, or check_reference_leak would
- * complain, hence it must be the same as the caller. There is no need
- * to copy it back.
- */
- if (!callee->in_callback_fn) {
- /* Transfer references to the caller */
- err = copy_reference_state(caller, callee);
- if (err)
- return err;
- }
-
/* for callbacks like bpf_loop or bpf_for_each_map_elem go back to callsite,
* there function call logic would reschedule callback visit. If iteration
* converges is_state_visited() would prune that visit eventually.
@@ -9790,9 +10599,9 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "returning from callee:\n");
- print_verifier_state(env, callee, true);
+ print_verifier_state(env, state, callee->frameno, true);
verbose(env, "to caller at %d:\n", *insn_idx);
- print_verifier_state(env, caller, true);
+ print_verifier_state(env, state, caller->frameno, true);
}
/* clear everything in the callee. In case of exceptional exits using
* bpf_throw, this will be done by copy_verifier_state for extra frames. */
@@ -9894,12 +10703,12 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
return -EACCES;
}
- if (!BPF_MAP_PTR(aux->map_ptr_state))
+ if (!aux->map_ptr_state.map_ptr)
+ bpf_map_ptr_store(aux, meta->map_ptr,
+ !meta->map_ptr->bypass_spec_v1, false);
+ else if (aux->map_ptr_state.map_ptr != meta->map_ptr)
bpf_map_ptr_store(aux, meta->map_ptr,
- !meta->map_ptr->bypass_spec_v1);
- else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr)
- bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON,
- !meta->map_ptr->bypass_spec_v1);
+ !meta->map_ptr->bypass_spec_v1, true);
return 0;
}
@@ -9942,15 +10751,15 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exit)
{
- struct bpf_func_state *state = cur_func(env);
+ struct bpf_verifier_state *state = env->cur_state;
bool refs_lingering = false;
int i;
- if (!exception_exit && state->frameno && !state->in_callback_fn)
+ if (!exception_exit && cur_func(env)->frameno)
return 0;
for (i = 0; i < state->acquired_refs; i++) {
- if (!exception_exit && state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
+ if (state->refs[i].type != REF_TYPE_PTR)
continue;
verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
state->refs[i].id, state->refs[i].insn_idx);
@@ -9959,6 +10768,39 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi
return refs_lingering ? -EINVAL : 0;
}
+static int check_resource_leak(struct bpf_verifier_env *env, bool exception_exit, bool check_lock, const char *prefix)
+{
+ int err;
+
+ if (check_lock && env->cur_state->active_locks) {
+ verbose(env, "%s cannot be used inside bpf_spin_lock-ed region\n", prefix);
+ return -EINVAL;
+ }
+
+ err = check_reference_leak(env, exception_exit);
+ if (err) {
+ verbose(env, "%s would lead to reference leak\n", prefix);
+ return err;
+ }
+
+ if (check_lock && env->cur_state->active_irq_id) {
+ verbose(env, "%s cannot be used inside bpf_local_irq_save-ed region\n", prefix);
+ return -EINVAL;
+ }
+
+ if (check_lock && env->cur_state->active_rcu_lock) {
+ verbose(env, "%s cannot be used inside bpf_rcu_read_lock-ed region\n", prefix);
+ return -EINVAL;
+ }
+
+ if (check_lock && env->cur_state->active_preempt_locks) {
+ verbose(env, "%s cannot be used inside bpf_preempt_disable-ed region\n", prefix);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
struct bpf_reg_state *regs)
{
@@ -10053,6 +10895,34 @@ static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno
state->callback_subprogno == subprogno);
}
+/* Returns whether or not the given map type can potentially elide
+ * lookup return value nullness check. This is possible if the key
+ * is statically known.
+ */
+static bool can_elide_value_nullness(enum bpf_map_type type)
+{
+ switch (type) {
+ case BPF_MAP_TYPE_ARRAY:
+ case BPF_MAP_TYPE_PERCPU_ARRAY:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int get_helper_proto(struct bpf_verifier_env *env, int func_id,
+ const struct bpf_func_proto **ptr)
+{
+ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID)
+ return -ERANGE;
+
+ if (!env->ops->get_func_proto)
+ return -EINVAL;
+
+ *ptr = env->ops->get_func_proto(func_id, env->prog);
+ return *ptr ? 0 : -EINVAL;
+}
+
static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
{
@@ -10069,18 +10939,16 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* find function prototype */
func_id = insn->imm;
- if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
- verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
- func_id);
+ err = get_helper_proto(env, insn->imm, &fn);
+ if (err == -ERANGE) {
+ verbose(env, "invalid func %s#%d\n", func_id_name(func_id), func_id);
return -EINVAL;
}
- if (env->ops->get_func_proto)
- fn = env->ops->get_func_proto(func_id, env->prog);
- if (!fn) {
- verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
- func_id);
- return -EINVAL;
+ if (err) {
+ verbose(env, "program of this type cannot use helper %s#%d\n",
+ func_id_name(func_id), func_id);
+ return err;
}
/* eBPF programs must be GPL compatible to use GPL-ed functions */
@@ -10094,13 +10962,13 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EINVAL;
}
- if (!env->prog->aux->sleepable && fn->might_sleep) {
+ if (!in_sleepable(env) && fn->might_sleep) {
verbose(env, "helper call might sleep in a non-sleepable prog\n");
return -EINVAL;
}
/* With LD_ABS/IND some JITs save/restore skb from r1. */
- changes_data = bpf_helper_changes_pkt_data(fn->func);
+ changes_data = bpf_helper_changes_pkt_data(func_id);
if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n",
func_id_name(func_id), func_id);
@@ -10124,7 +10992,29 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EINVAL;
}
- if (env->prog->aux->sleepable && is_storage_get_function(func_id))
+ if (in_sleepable(env) && is_storage_get_function(func_id))
+ env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
+ }
+
+ if (env->cur_state->active_preempt_locks) {
+ if (fn->might_sleep) {
+ verbose(env, "sleepable helper %s#%d in non-preemptible region\n",
+ func_id_name(func_id), func_id);
+ return -EINVAL;
+ }
+
+ if (in_sleepable(env) && is_storage_get_function(func_id))
+ env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
+ }
+
+ if (env->cur_state->active_irq_id) {
+ if (fn->might_sleep) {
+ verbose(env, "sleepable helper %s#%d in IRQ-disabled region\n",
+ func_id_name(func_id), func_id);
+ return -EINVAL;
+ }
+
+ if (in_sleepable(env) && is_storage_get_function(func_id))
env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
}
@@ -10174,7 +11064,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
struct bpf_func_state *state;
struct bpf_reg_state *reg;
- err = release_reference_state(cur_func(env), ref_obj_id);
+ err = release_reference_nomark(env->cur_state, ref_obj_id);
if (!err) {
bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
if (reg->ref_obj_id == ref_obj_id) {
@@ -10205,11 +11095,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
switch (func_id) {
case BPF_FUNC_tail_call:
- err = check_reference_leak(env, false);
- if (err) {
- verbose(env, "tail_call would lead to reference leak\n");
+ err = check_resource_leak(env, false, true, "tail_call");
+ if (err)
return err;
- }
break;
case BPF_FUNC_get_local_storage:
/* check that flags argument in get_local_storage(map, flags) is 0,
@@ -10388,10 +11276,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
"kernel subsystem misconfigured verifier\n");
return -EINVAL;
}
+
+ if (func_id == BPF_FUNC_map_lookup_elem &&
+ can_elide_value_nullness(meta.map_ptr->map_type) &&
+ meta.const_map_key >= 0 &&
+ meta.const_map_key < meta.map_ptr->max_entries)
+ ret_flag &= ~PTR_MAYBE_NULL;
+
regs[BPF_REG_0].map_ptr = meta.map_ptr;
regs[BPF_REG_0].map_uid = meta.map_uid;
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag;
- if (!type_may_be_null(ret_type) &&
+ if (!type_may_be_null(ret_flag) &&
btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) {
regs[BPF_REG_0].id = ++env->id_gen;
}
@@ -10509,7 +11404,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* For release_reference() */
regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
} else if (is_acquire_function(func_id, meta.map_ptr)) {
- int id = acquire_reference_state(env, insn_idx);
+ int id = acquire_reference(env, insn_idx);
if (id < 0)
return id;
@@ -10620,24 +11515,6 @@ static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta)
return meta->kfunc_flags & KF_RCU_PROTECTED;
}
-static bool __kfunc_param_match_suffix(const struct btf *btf,
- const struct btf_param *arg,
- const char *suffix)
-{
- int suffix_len = strlen(suffix), len;
- const char *param_name;
-
- /* In the future, this can be ported to use BTF tagging */
- param_name = btf_name_by_offset(btf, arg->name_off);
- if (str_is_empty(param_name))
- return false;
- len = strlen(param_name);
- if (len < suffix_len)
- return false;
- param_name += len - suffix_len;
- return !strncmp(param_name, suffix, suffix_len);
-}
-
static bool is_kfunc_arg_mem_size(const struct btf *btf,
const struct btf_param *arg,
const struct bpf_reg_state *reg)
@@ -10648,7 +11525,7 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf,
if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
return false;
- return __kfunc_param_match_suffix(btf, arg, "__sz");
+ return btf_param_match_suffix(btf, arg, "__sz");
}
static bool is_kfunc_arg_const_mem_size(const struct btf *btf,
@@ -10661,47 +11538,57 @@ static bool is_kfunc_arg_const_mem_size(const struct btf *btf,
if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
return false;
- return __kfunc_param_match_suffix(btf, arg, "__szk");
+ return btf_param_match_suffix(btf, arg, "__szk");
}
static bool is_kfunc_arg_optional(const struct btf *btf, const struct btf_param *arg)
{
- return __kfunc_param_match_suffix(btf, arg, "__opt");
+ return btf_param_match_suffix(btf, arg, "__opt");
}
static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg)
{
- return __kfunc_param_match_suffix(btf, arg, "__k");
+ return btf_param_match_suffix(btf, arg, "__k");
}
static bool is_kfunc_arg_ignore(const struct btf *btf, const struct btf_param *arg)
{
- return __kfunc_param_match_suffix(btf, arg, "__ign");
+ return btf_param_match_suffix(btf, arg, "__ign");
+}
+
+static bool is_kfunc_arg_map(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__map");
}
static bool is_kfunc_arg_alloc_obj(const struct btf *btf, const struct btf_param *arg)
{
- return __kfunc_param_match_suffix(btf, arg, "__alloc");
+ return btf_param_match_suffix(btf, arg, "__alloc");
}
static bool is_kfunc_arg_uninit(const struct btf *btf, const struct btf_param *arg)
{
- return __kfunc_param_match_suffix(btf, arg, "__uninit");
+ return btf_param_match_suffix(btf, arg, "__uninit");
}
static bool is_kfunc_arg_refcounted_kptr(const struct btf *btf, const struct btf_param *arg)
{
- return __kfunc_param_match_suffix(btf, arg, "__refcounted_kptr");
+ return btf_param_match_suffix(btf, arg, "__refcounted_kptr");
}
static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param *arg)
{
- return __kfunc_param_match_suffix(btf, arg, "__nullable");
+ return btf_param_match_suffix(btf, arg, "__nullable");
}
static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param *arg)
{
- return __kfunc_param_match_suffix(btf, arg, "__str");
+ return btf_param_match_suffix(btf, arg, "__str");
+}
+
+static bool is_kfunc_arg_irq_flag(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__irq_flag");
}
static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,
@@ -10729,14 +11616,16 @@ enum {
KF_ARG_LIST_NODE_ID,
KF_ARG_RB_ROOT_ID,
KF_ARG_RB_NODE_ID,
+ KF_ARG_WORKQUEUE_ID,
};
BTF_ID_LIST(kf_arg_btf_ids)
-BTF_ID(struct, bpf_dynptr_kern)
+BTF_ID(struct, bpf_dynptr)
BTF_ID(struct, bpf_list_head)
BTF_ID(struct, bpf_list_node)
BTF_ID(struct, bpf_rb_root)
BTF_ID(struct, bpf_rb_node)
+BTF_ID(struct, bpf_wq)
static bool __is_kfunc_ptr_arg_type(const struct btf *btf,
const struct btf_param *arg, int type)
@@ -10780,6 +11669,11 @@ static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_par
return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID);
}
+static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID);
+}
+
static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf,
const struct btf_param *arg)
{
@@ -10848,6 +11742,9 @@ enum kfunc_ptr_arg_type {
KF_ARG_PTR_TO_RB_NODE,
KF_ARG_PTR_TO_NULL,
KF_ARG_PTR_TO_CONST_STR,
+ KF_ARG_PTR_TO_MAP,
+ KF_ARG_PTR_TO_WORKQUEUE,
+ KF_ARG_PTR_TO_IRQ_FLAG,
};
enum special_kfunc_type {
@@ -10873,7 +11770,17 @@ enum special_kfunc_type {
KF_bpf_percpu_obj_new_impl,
KF_bpf_percpu_obj_drop_impl,
KF_bpf_throw,
+ KF_bpf_wq_set_callback_impl,
+ KF_bpf_preempt_disable,
+ KF_bpf_preempt_enable,
KF_bpf_iter_css_task_new,
+ KF_bpf_session_cookie,
+ KF_bpf_get_kmem_cache,
+ KF_bpf_local_irq_save,
+ KF_bpf_local_irq_restore,
+ KF_bpf_iter_num_new,
+ KF_bpf_iter_num_next,
+ KF_bpf_iter_num_destroy,
};
BTF_SET_START(special_kfunc_set)
@@ -10889,14 +11796,17 @@ BTF_ID(func, bpf_rdonly_cast)
BTF_ID(func, bpf_rbtree_remove)
BTF_ID(func, bpf_rbtree_add_impl)
BTF_ID(func, bpf_rbtree_first)
+#ifdef CONFIG_NET
BTF_ID(func, bpf_dynptr_from_skb)
BTF_ID(func, bpf_dynptr_from_xdp)
+#endif
BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
BTF_ID(func, bpf_dynptr_clone)
BTF_ID(func, bpf_percpu_obj_new_impl)
BTF_ID(func, bpf_percpu_obj_drop_impl)
BTF_ID(func, bpf_throw)
+BTF_ID(func, bpf_wq_set_callback_impl)
#ifdef CONFIG_CGROUPS
BTF_ID(func, bpf_iter_css_task_new)
#endif
@@ -10917,19 +11827,38 @@ BTF_ID(func, bpf_rcu_read_unlock)
BTF_ID(func, bpf_rbtree_remove)
BTF_ID(func, bpf_rbtree_add_impl)
BTF_ID(func, bpf_rbtree_first)
+#ifdef CONFIG_NET
BTF_ID(func, bpf_dynptr_from_skb)
BTF_ID(func, bpf_dynptr_from_xdp)
+#else
+BTF_ID_UNUSED
+BTF_ID_UNUSED
+#endif
BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
BTF_ID(func, bpf_dynptr_clone)
BTF_ID(func, bpf_percpu_obj_new_impl)
BTF_ID(func, bpf_percpu_obj_drop_impl)
BTF_ID(func, bpf_throw)
+BTF_ID(func, bpf_wq_set_callback_impl)
+BTF_ID(func, bpf_preempt_disable)
+BTF_ID(func, bpf_preempt_enable)
#ifdef CONFIG_CGROUPS
BTF_ID(func, bpf_iter_css_task_new)
#else
BTF_ID_UNUSED
#endif
+#ifdef CONFIG_BPF_EVENTS
+BTF_ID(func, bpf_session_cookie)
+#else
+BTF_ID_UNUSED
+#endif
+BTF_ID(func, bpf_get_kmem_cache)
+BTF_ID(func, bpf_local_irq_save)
+BTF_ID(func, bpf_local_irq_restore)
+BTF_ID(func, bpf_iter_num_new)
+BTF_ID(func, bpf_iter_num_next)
+BTF_ID(func, bpf_iter_num_destroy)
static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -10951,6 +11880,16 @@ static bool is_kfunc_bpf_rcu_read_unlock(struct bpf_kfunc_call_arg_meta *meta)
return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_unlock];
}
+static bool is_kfunc_bpf_preempt_disable(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_preempt_disable];
+}
+
+static bool is_kfunc_bpf_preempt_enable(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_preempt_enable];
+}
+
static enum kfunc_ptr_arg_type
get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
struct bpf_kfunc_call_arg_meta *meta,
@@ -10971,9 +11910,12 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
* type to our caller. When a set of conditions hold in the BTF type of
* arguments, we resolve it to a known kfunc_ptr_arg_type.
*/
- if (btf_get_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno))
+ if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno))
return KF_ARG_PTR_TO_CTX;
+ if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg))
+ return KF_ARG_PTR_TO_NULL;
+
if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_ALLOC_BTF_ID;
@@ -10983,7 +11925,7 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_dynptr(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_DYNPTR;
- if (is_kfunc_arg_iter(meta, argno))
+ if (is_kfunc_arg_iter(meta, argno, &args[argno]))
return KF_ARG_PTR_TO_ITER;
if (is_kfunc_arg_list_head(meta->btf, &args[argno]))
@@ -11001,6 +11943,15 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_const_str(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_CONST_STR;
+ if (is_kfunc_arg_map(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_MAP;
+
+ if (is_kfunc_arg_wq(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_WORKQUEUE;
+
+ if (is_kfunc_arg_irq_flag(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_IRQ_FLAG;
+
if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) {
if (!btf_type_is_struct(ref_t)) {
verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n",
@@ -11013,9 +11964,6 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))
return KF_ARG_PTR_TO_CALLBACK;
- if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg))
- return KF_ARG_PTR_TO_NULL;
-
if (argno + 1 < nargs &&
(is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) ||
is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1])))
@@ -11046,6 +11994,8 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
bool strict_type_match = false;
const struct btf *reg_btf;
const char *reg_ref_tname;
+ bool taking_projection;
+ bool struct_same;
u32 reg_ref_id;
if (base_type(reg->type) == PTR_TO_BTF_ID) {
@@ -11080,16 +12030,23 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
* btf_struct_ids_match() to walk the struct at the 0th offset, and
* resolve types.
*/
- if (is_kfunc_acquire(meta) ||
- (is_kfunc_release(meta) && reg->ref_obj_id) ||
+ if ((is_kfunc_release(meta) && reg->ref_obj_id) ||
btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id))
strict_type_match = true;
- WARN_ON_ONCE(is_kfunc_trusted_args(meta) && reg->off);
+ WARN_ON_ONCE(is_kfunc_release(meta) &&
+ (reg->off || !tnum_is_const(reg->var_off) ||
+ reg->var_off.value));
reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, &reg_ref_id);
reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off);
- if (!btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match)) {
+ struct_same = btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match);
+ /* If kfunc is accepting a projection type (ie. __sk_buff), it cannot
+ * actually use it -- it must cast to the underlying type. So we allow
+ * caller to pass in the underlying type.
+ */
+ taking_projection = btf_is_projection_of(ref_tname, reg_ref_tname);
+ if (!taking_projection && !struct_same) {
verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1,
btf_type_str(reg_ref_t), reg_ref_tname);
@@ -11098,12 +12055,59 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
return 0;
}
+static int process_irq_flag(struct bpf_verifier_env *env, int regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ bool irq_save;
+ int err;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save]) {
+ irq_save = true;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_restore]) {
+ irq_save = false;
+ } else {
+ verbose(env, "verifier internal error: unknown irq flags kfunc\n");
+ return -EFAULT;
+ }
+
+ if (irq_save) {
+ if (!is_irq_flag_reg_valid_uninit(env, reg)) {
+ verbose(env, "expected uninitialized irq flag as arg#%d\n", regno - 1);
+ return -EINVAL;
+ }
+
+ err = check_mem_access(env, env->insn_idx, regno, 0, BPF_DW, BPF_WRITE, -1, false, false);
+ if (err)
+ return err;
+
+ err = mark_stack_slot_irq_flag(env, meta, reg, env->insn_idx);
+ if (err)
+ return err;
+ } else {
+ err = is_irq_flag_reg_valid_init(env, reg);
+ if (err) {
+ verbose(env, "expected an initialized irq flag as arg#%d\n", regno - 1);
+ return err;
+ }
+
+ err = mark_irq_flag_read(env, reg);
+ if (err)
+ return err;
+
+ err = unmark_stack_slot_irq_flag(env, reg);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+
static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- struct bpf_verifier_state *state = env->cur_state;
struct btf_record *rec = reg_btf_record(reg);
- if (!state->active_lock.ptr) {
+ if (!env->cur_state->active_locks) {
verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
return -EFAULT;
}
@@ -11122,12 +12126,11 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state
static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_obj_id)
{
- struct bpf_func_state *state, *unused;
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_func_state *unused;
struct bpf_reg_state *reg;
int i;
- state = cur_func(env);
-
if (!ref_obj_id) {
verbose(env, "verifier internal error: ref_obj_id is zero for "
"owning -> non-owning conversion\n");
@@ -11200,6 +12203,7 @@ static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_o
*/
static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
+ struct bpf_reference_state *s;
void *ptr;
u32 id;
@@ -11216,10 +12220,10 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_
}
id = reg->id;
- if (!env->cur_state->active_lock.ptr)
+ if (!env->cur_state->active_locks)
return -EINVAL;
- if (env->cur_state->active_lock.ptr != ptr ||
- env->cur_state->active_lock.id != id) {
+ s = find_lock_state(env->cur_state, REF_TYPE_LOCK, id, ptr);
+ if (!s) {
verbose(env, "held lock and object are not in the same allocation\n");
return -EINVAL;
}
@@ -11241,23 +12245,51 @@ static bool is_bpf_rbtree_api_kfunc(u32 btf_id)
btf_id == special_kfunc_list[KF_bpf_rbtree_first];
}
+static bool is_bpf_iter_num_api_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_iter_num_new] ||
+ btf_id == special_kfunc_list[KF_bpf_iter_num_next] ||
+ btf_id == special_kfunc_list[KF_bpf_iter_num_destroy];
+}
+
static bool is_bpf_graph_api_kfunc(u32 btf_id)
{
return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id) ||
btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl];
}
+static bool kfunc_spin_allowed(u32 btf_id)
+{
+ return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id);
+}
+
static bool is_sync_callback_calling_kfunc(u32 btf_id)
{
return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
}
+static bool is_async_callback_calling_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl];
+}
+
static bool is_bpf_throw_kfunc(struct bpf_insn *insn)
{
return bpf_pseudo_kfunc_call(insn) && insn->off == 0 &&
insn->imm == special_kfunc_list[KF_bpf_throw];
}
+static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl];
+}
+
+static bool is_callback_calling_kfunc(u32 btf_id)
+{
+ return is_sync_callback_calling_kfunc(btf_id) ||
+ is_async_callback_calling_kfunc(btf_id);
+}
+
static bool is_rbtree_lock_required_kfunc(u32 btf_id)
{
return is_bpf_rbtree_api_kfunc(btf_id);
@@ -11413,7 +12445,7 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env,
node_off = reg->off + reg->var_off.value;
field = reg_find_field_offset(reg, node_off, node_field_type);
- if (!field || field->offset != node_off) {
+ if (!field) {
verbose(env, "%s not found at offset=%u\n", node_type_name, node_off);
return -EINVAL;
}
@@ -11483,7 +12515,7 @@ static bool check_css_task_iter_allowlist(struct bpf_verifier_env *env)
return true;
fallthrough;
default:
- return env->prog->aux->sleepable;
+ return in_sleepable(env);
}
}
@@ -11601,6 +12633,35 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
switch (kf_arg_type) {
case KF_ARG_PTR_TO_NULL:
continue;
+ case KF_ARG_PTR_TO_MAP:
+ if (!reg->map_ptr) {
+ verbose(env, "pointer in R%d isn't map pointer\n", regno);
+ return -EINVAL;
+ }
+ if (meta->map.ptr && reg->map_ptr->record->wq_off >= 0) {
+ /* Use map_uid (which is unique id of inner map) to reject:
+ * inner_map1 = bpf_map_lookup_elem(outer_map, key1)
+ * inner_map2 = bpf_map_lookup_elem(outer_map, key2)
+ * if (inner_map1 && inner_map2) {
+ * wq = bpf_map_lookup_elem(inner_map1);
+ * if (wq)
+ * // mismatch would have been allowed
+ * bpf_wq_init(wq, inner_map2);
+ * }
+ *
+ * Comparing map_ptr is enough to distinguish normal and outer maps.
+ */
+ if (meta->map.ptr != reg->map_ptr ||
+ meta->map.uid != reg->map_uid) {
+ verbose(env,
+ "workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n",
+ meta->map.uid, reg->map_uid);
+ return -EINVAL;
+ }
+ }
+ meta->map.ptr = reg->map_ptr;
+ meta->map.uid = reg->map_uid;
+ fallthrough;
case KF_ARG_PTR_TO_ALLOC_BTF_ID:
case KF_ARG_PTR_TO_BTF_ID:
if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta))
@@ -11616,12 +12677,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return -EINVAL;
}
}
-
fallthrough;
case KF_ARG_PTR_TO_CTX:
- /* Trusted arguments have the same offset checks as release arguments */
- arg_type |= OBJ_RELEASE;
- break;
case KF_ARG_PTR_TO_DYNPTR:
case KF_ARG_PTR_TO_ITER:
case KF_ARG_PTR_TO_LIST_HEAD:
@@ -11633,7 +12690,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
case KF_ARG_PTR_TO_CALLBACK:
case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
case KF_ARG_PTR_TO_CONST_STR:
- /* Trusted by default */
+ case KF_ARG_PTR_TO_WORKQUEUE:
+ case KF_ARG_PTR_TO_IRQ_FLAG:
break;
default:
WARN_ON_ONCE(1);
@@ -11649,7 +12707,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
switch (kf_arg_type) {
case KF_ARG_PTR_TO_CTX:
if (reg->type != PTR_TO_CTX) {
- verbose(env, "arg#%d expected pointer to ctx, but got %s\n", i, btf_type_str(t));
+ verbose(env, "arg#%d expected pointer to ctx, but got %s\n",
+ i, reg_type_str(env, reg->type));
return -EINVAL;
}
@@ -11689,12 +12748,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR;
int clone_ref_obj_id = 0;
- if (reg->type != PTR_TO_STACK &&
- reg->type != CONST_PTR_TO_DYNPTR) {
- verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i);
- return -EINVAL;
- }
-
if (reg->type == CONST_PTR_TO_DYNPTR)
dynptr_arg_type |= MEM_RDONLY;
@@ -11817,6 +12870,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if (ret < 0)
return ret;
break;
+ case KF_ARG_PTR_TO_MAP:
+ /* If argument has '__map' suffix expect 'struct bpf_map *' */
+ ref_id = *reg2btf_ids[CONST_PTR_TO_MAP];
+ ref_t = btf_type_by_id(btf_vmlinux, ref_id);
+ ref_tname = btf_name_by_offset(btf, ref_t->name_off);
+ fallthrough;
case KF_ARG_PTR_TO_BTF_ID:
/* Only base_type is checked, further checks are done here */
if ((base_type(reg->type) != PTR_TO_BTF_ID ||
@@ -11913,6 +12972,24 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if (ret)
return ret;
break;
+ case KF_ARG_PTR_TO_WORKQUEUE:
+ if (reg->type != PTR_TO_MAP_VALUE) {
+ verbose(env, "arg#%d doesn't point to a map value\n", i);
+ return -EINVAL;
+ }
+ ret = process_wq_func(env, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_IRQ_FLAG:
+ if (reg->type != PTR_TO_STACK) {
+ verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i);
+ return -EINVAL;
+ }
+ ret = process_irq_flag(env, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
}
}
@@ -11972,11 +13049,11 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
{
- const struct btf_type *t, *ptr_type;
+ bool sleepable, rcu_lock, rcu_unlock, preempt_disable, preempt_enable;
u32 i, nargs, ptr_type_id, release_ref_obj_id;
struct bpf_reg_state *regs = cur_regs(env);
const char *func_name, *ptr_type_name;
- bool sleepable, rcu_lock, rcu_unlock;
+ const struct btf_type *t, *ptr_type;
struct bpf_kfunc_call_arg_meta meta;
struct bpf_insn_aux_data *insn_aux;
int err, insn_idx = *insn_idx_p;
@@ -12004,7 +13081,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
sleepable = is_kfunc_sleepable(&meta);
- if (sleepable && !env->prog->aux->sleepable) {
+ if (sleepable && !in_sleepable(env)) {
verbose(env, "program must be sleepable to call sleepable kfunc %s\n", func_name);
return -EACCES;
}
@@ -12024,9 +13101,27 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
}
+ if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie]) {
+ meta.r0_size = sizeof(u64);
+ meta.r0_rdonly = false;
+ }
+
+ if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) {
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_timer_callback_state);
+ if (err) {
+ verbose(env, "kfunc %s#%d failed callback verification\n",
+ func_name, meta.func_id);
+ return err;
+ }
+ }
+
rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
+ preempt_disable = is_kfunc_bpf_preempt_disable(&meta);
+ preempt_enable = is_kfunc_bpf_preempt_enable(&meta);
+
if (env->cur_state->active_rcu_lock) {
struct bpf_func_state *state;
struct bpf_reg_state *reg;
@@ -12059,6 +13154,27 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EINVAL;
}
+ if (env->cur_state->active_preempt_locks) {
+ if (preempt_disable) {
+ env->cur_state->active_preempt_locks++;
+ } else if (preempt_enable) {
+ env->cur_state->active_preempt_locks--;
+ } else if (sleepable) {
+ verbose(env, "kernel func %s is sleepable within non-preemptible region\n", func_name);
+ return -EACCES;
+ }
+ } else if (preempt_disable) {
+ env->cur_state->active_preempt_locks++;
+ } else if (preempt_enable) {
+ verbose(env, "unmatched attempt to enable preemption (kernel function %s)\n", func_name);
+ return -EINVAL;
+ }
+
+ if (env->cur_state->active_irq_id && sleepable) {
+ verbose(env, "kernel func %s is sleepable within IRQ-disabled region\n", func_name);
+ return -EACCES;
+ }
+
/* In case of release function, we get register number of refcounted
* PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
*/
@@ -12291,6 +13407,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
meta.func_name);
return -EFAULT;
}
+ } else if (btf_type_is_void(ptr_type)) {
+ /* kfunc returning 'void *' is equivalent to returning scalar */
+ mark_reg_unknown(env, regs, BPF_REG_0);
} else if (!__btf_type_is_struct(ptr_type)) {
if (!meta.r0_size) {
__u32 sz;
@@ -12326,6 +13445,20 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
regs[BPF_REG_0].btf = desc_btf;
regs[BPF_REG_0].type = PTR_TO_BTF_ID;
regs[BPF_REG_0].btf_id = ptr_type_id;
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache])
+ regs[BPF_REG_0].type |= PTR_UNTRUSTED;
+
+ if (is_iter_next_kfunc(&meta)) {
+ struct bpf_reg_state *cur_iter;
+
+ cur_iter = get_iter_from_state(env->cur_state, &meta);
+
+ if (cur_iter->type & MEM_RCU) /* KF_RCU_PROTECTED */
+ regs[BPF_REG_0].type |= MEM_RCU;
+ else
+ regs[BPF_REG_0].type |= PTR_TRUSTED;
+ }
}
if (is_kfunc_ret_null(&meta)) {
@@ -12335,7 +13468,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
if (is_kfunc_acquire(&meta)) {
- int id = acquire_reference_state(env, insn_idx);
+ int id = acquire_reference(env, insn_idx);
if (id < 0)
return id;
@@ -12381,46 +13514,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return 0;
}
-static bool signed_add_overflows(s64 a, s64 b)
-{
- /* Do the add in u64, where overflow is well-defined */
- s64 res = (s64)((u64)a + (u64)b);
-
- if (b < 0)
- return res > a;
- return res < a;
-}
-
-static bool signed_add32_overflows(s32 a, s32 b)
-{
- /* Do the add in u32, where overflow is well-defined */
- s32 res = (s32)((u32)a + (u32)b);
-
- if (b < 0)
- return res > a;
- return res < a;
-}
-
-static bool signed_sub_overflows(s64 a, s64 b)
-{
- /* Do the sub in u64, where overflow is well-defined */
- s64 res = (s64)((u64)a - (u64)b);
-
- if (b < 0)
- return res < a;
- return res > a;
-}
-
-static bool signed_sub32_overflows(s32 a, s32 b)
-{
- /* Do the sub in u32, where overflow is well-defined */
- s32 res = (s32)((u32)a - (u32)b);
-
- if (b < 0)
- return res < a;
- return res > a;
-}
-
static bool check_reg_sane_offset(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
enum bpf_reg_type type)
@@ -12828,6 +13921,19 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
}
switch (base_type(ptr_reg->type)) {
+ case PTR_TO_CTX:
+ case PTR_TO_MAP_VALUE:
+ case PTR_TO_MAP_KEY:
+ case PTR_TO_STACK:
+ case PTR_TO_PACKET_META:
+ case PTR_TO_PACKET:
+ case PTR_TO_TP_BUFFER:
+ case PTR_TO_BTF_ID:
+ case PTR_TO_MEM:
+ case PTR_TO_BUF:
+ case PTR_TO_FUNC:
+ case CONST_PTR_TO_DYNPTR:
+ break;
case PTR_TO_FLOW_KEYS:
if (known)
break;
@@ -12837,16 +13943,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
if (known && smin_val == 0 && opcode == BPF_ADD)
break;
fallthrough;
- case PTR_TO_PACKET_END:
- case PTR_TO_SOCKET:
- case PTR_TO_SOCK_COMMON:
- case PTR_TO_TCP_SOCK:
- case PTR_TO_XDP_SOCK:
+ default:
verbose(env, "R%d pointer arithmetic on %s prohibited\n",
dst, reg_type_str(env, ptr_reg->type));
return -EACCES;
- default:
- break;
}
/* In case of 'scalar += pointer', dst_reg inherits pointer type and id.
@@ -12895,21 +13995,15 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
* added into the variable offset, and we copy the fixed offset
* from ptr_reg.
*/
- if (signed_add_overflows(smin_ptr, smin_val) ||
- signed_add_overflows(smax_ptr, smax_val)) {
+ if (check_add_overflow(smin_ptr, smin_val, &dst_reg->smin_value) ||
+ check_add_overflow(smax_ptr, smax_val, &dst_reg->smax_value)) {
dst_reg->smin_value = S64_MIN;
dst_reg->smax_value = S64_MAX;
- } else {
- dst_reg->smin_value = smin_ptr + smin_val;
- dst_reg->smax_value = smax_ptr + smax_val;
}
- if (umin_ptr + umin_val < umin_ptr ||
- umax_ptr + umax_val < umax_ptr) {
+ if (check_add_overflow(umin_ptr, umin_val, &dst_reg->umin_value) ||
+ check_add_overflow(umax_ptr, umax_val, &dst_reg->umax_value)) {
dst_reg->umin_value = 0;
dst_reg->umax_value = U64_MAX;
- } else {
- dst_reg->umin_value = umin_ptr + umin_val;
- dst_reg->umax_value = umax_ptr + umax_val;
}
dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
dst_reg->off = ptr_reg->off;
@@ -12952,14 +14046,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
/* A new variable offset is created. If the subtrahend is known
* nonnegative, then any reg->range we had before is still good.
*/
- if (signed_sub_overflows(smin_ptr, smax_val) ||
- signed_sub_overflows(smax_ptr, smin_val)) {
+ if (check_sub_overflow(smin_ptr, smax_val, &dst_reg->smin_value) ||
+ check_sub_overflow(smax_ptr, smin_val, &dst_reg->smax_value)) {
/* Overflow possible, we know nothing */
dst_reg->smin_value = S64_MIN;
dst_reg->smax_value = S64_MAX;
- } else {
- dst_reg->smin_value = smin_ptr - smax_val;
- dst_reg->smax_value = smax_ptr - smin_val;
}
if (umin_ptr < umax_val) {
/* Overflow possible, we know nothing */
@@ -13012,71 +14103,56 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
static void scalar32_min_max_add(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s32 smin_val = src_reg->s32_min_value;
- s32 smax_val = src_reg->s32_max_value;
- u32 umin_val = src_reg->u32_min_value;
- u32 umax_val = src_reg->u32_max_value;
+ s32 *dst_smin = &dst_reg->s32_min_value;
+ s32 *dst_smax = &dst_reg->s32_max_value;
+ u32 *dst_umin = &dst_reg->u32_min_value;
+ u32 *dst_umax = &dst_reg->u32_max_value;
- if (signed_add32_overflows(dst_reg->s32_min_value, smin_val) ||
- signed_add32_overflows(dst_reg->s32_max_value, smax_val)) {
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
- } else {
- dst_reg->s32_min_value += smin_val;
- dst_reg->s32_max_value += smax_val;
+ if (check_add_overflow(*dst_smin, src_reg->s32_min_value, dst_smin) ||
+ check_add_overflow(*dst_smax, src_reg->s32_max_value, dst_smax)) {
+ *dst_smin = S32_MIN;
+ *dst_smax = S32_MAX;
}
- if (dst_reg->u32_min_value + umin_val < umin_val ||
- dst_reg->u32_max_value + umax_val < umax_val) {
- dst_reg->u32_min_value = 0;
- dst_reg->u32_max_value = U32_MAX;
- } else {
- dst_reg->u32_min_value += umin_val;
- dst_reg->u32_max_value += umax_val;
+ if (check_add_overflow(*dst_umin, src_reg->u32_min_value, dst_umin) ||
+ check_add_overflow(*dst_umax, src_reg->u32_max_value, dst_umax)) {
+ *dst_umin = 0;
+ *dst_umax = U32_MAX;
}
}
static void scalar_min_max_add(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s64 smin_val = src_reg->smin_value;
- s64 smax_val = src_reg->smax_value;
- u64 umin_val = src_reg->umin_value;
- u64 umax_val = src_reg->umax_value;
+ s64 *dst_smin = &dst_reg->smin_value;
+ s64 *dst_smax = &dst_reg->smax_value;
+ u64 *dst_umin = &dst_reg->umin_value;
+ u64 *dst_umax = &dst_reg->umax_value;
- if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
- signed_add_overflows(dst_reg->smax_value, smax_val)) {
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
- } else {
- dst_reg->smin_value += smin_val;
- dst_reg->smax_value += smax_val;
+ if (check_add_overflow(*dst_smin, src_reg->smin_value, dst_smin) ||
+ check_add_overflow(*dst_smax, src_reg->smax_value, dst_smax)) {
+ *dst_smin = S64_MIN;
+ *dst_smax = S64_MAX;
}
- if (dst_reg->umin_value + umin_val < umin_val ||
- dst_reg->umax_value + umax_val < umax_val) {
- dst_reg->umin_value = 0;
- dst_reg->umax_value = U64_MAX;
- } else {
- dst_reg->umin_value += umin_val;
- dst_reg->umax_value += umax_val;
+ if (check_add_overflow(*dst_umin, src_reg->umin_value, dst_umin) ||
+ check_add_overflow(*dst_umax, src_reg->umax_value, dst_umax)) {
+ *dst_umin = 0;
+ *dst_umax = U64_MAX;
}
}
static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s32 smin_val = src_reg->s32_min_value;
- s32 smax_val = src_reg->s32_max_value;
+ s32 *dst_smin = &dst_reg->s32_min_value;
+ s32 *dst_smax = &dst_reg->s32_max_value;
u32 umin_val = src_reg->u32_min_value;
u32 umax_val = src_reg->u32_max_value;
- if (signed_sub32_overflows(dst_reg->s32_min_value, smax_val) ||
- signed_sub32_overflows(dst_reg->s32_max_value, smin_val)) {
+ if (check_sub_overflow(*dst_smin, src_reg->s32_max_value, dst_smin) ||
+ check_sub_overflow(*dst_smax, src_reg->s32_min_value, dst_smax)) {
/* Overflow possible, we know nothing */
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
- } else {
- dst_reg->s32_min_value -= smax_val;
- dst_reg->s32_max_value -= smin_val;
+ *dst_smin = S32_MIN;
+ *dst_smax = S32_MAX;
}
if (dst_reg->u32_min_value < umax_val) {
/* Overflow possible, we know nothing */
@@ -13092,19 +14168,16 @@ static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg,
static void scalar_min_max_sub(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s64 smin_val = src_reg->smin_value;
- s64 smax_val = src_reg->smax_value;
+ s64 *dst_smin = &dst_reg->smin_value;
+ s64 *dst_smax = &dst_reg->smax_value;
u64 umin_val = src_reg->umin_value;
u64 umax_val = src_reg->umax_value;
- if (signed_sub_overflows(dst_reg->smin_value, smax_val) ||
- signed_sub_overflows(dst_reg->smax_value, smin_val)) {
+ if (check_sub_overflow(*dst_smin, src_reg->smax_value, dst_smin) ||
+ check_sub_overflow(*dst_smax, src_reg->smin_value, dst_smax)) {
/* Overflow possible, we know nothing */
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
- } else {
- dst_reg->smin_value -= smax_val;
- dst_reg->smax_value -= smin_val;
+ *dst_smin = S64_MIN;
+ *dst_smax = S64_MAX;
}
if (dst_reg->umin_value < umax_val) {
/* Overflow possible, we know nothing */
@@ -13120,64 +14193,56 @@ static void scalar_min_max_sub(struct bpf_reg_state *dst_reg,
static void scalar32_min_max_mul(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s32 smin_val = src_reg->s32_min_value;
- u32 umin_val = src_reg->u32_min_value;
- u32 umax_val = src_reg->u32_max_value;
+ s32 *dst_smin = &dst_reg->s32_min_value;
+ s32 *dst_smax = &dst_reg->s32_max_value;
+ u32 *dst_umin = &dst_reg->u32_min_value;
+ u32 *dst_umax = &dst_reg->u32_max_value;
+ s32 tmp_prod[4];
- if (smin_val < 0 || dst_reg->s32_min_value < 0) {
- /* Ain't nobody got time to multiply that sign */
- __mark_reg32_unbounded(dst_reg);
- return;
- }
- /* Both values are positive, so we can work with unsigned and
- * copy the result to signed (unless it exceeds S32_MAX).
- */
- if (umax_val > U16_MAX || dst_reg->u32_max_value > U16_MAX) {
- /* Potential overflow, we know nothing */
- __mark_reg32_unbounded(dst_reg);
- return;
+ if (check_mul_overflow(*dst_umax, src_reg->u32_max_value, dst_umax) ||
+ check_mul_overflow(*dst_umin, src_reg->u32_min_value, dst_umin)) {
+ /* Overflow possible, we know nothing */
+ *dst_umin = 0;
+ *dst_umax = U32_MAX;
}
- dst_reg->u32_min_value *= umin_val;
- dst_reg->u32_max_value *= umax_val;
- if (dst_reg->u32_max_value > S32_MAX) {
+ if (check_mul_overflow(*dst_smin, src_reg->s32_min_value, &tmp_prod[0]) ||
+ check_mul_overflow(*dst_smin, src_reg->s32_max_value, &tmp_prod[1]) ||
+ check_mul_overflow(*dst_smax, src_reg->s32_min_value, &tmp_prod[2]) ||
+ check_mul_overflow(*dst_smax, src_reg->s32_max_value, &tmp_prod[3])) {
/* Overflow possible, we know nothing */
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
+ *dst_smin = S32_MIN;
+ *dst_smax = S32_MAX;
} else {
- dst_reg->s32_min_value = dst_reg->u32_min_value;
- dst_reg->s32_max_value = dst_reg->u32_max_value;
+ *dst_smin = min_array(tmp_prod, 4);
+ *dst_smax = max_array(tmp_prod, 4);
}
}
static void scalar_min_max_mul(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s64 smin_val = src_reg->smin_value;
- u64 umin_val = src_reg->umin_value;
- u64 umax_val = src_reg->umax_value;
+ s64 *dst_smin = &dst_reg->smin_value;
+ s64 *dst_smax = &dst_reg->smax_value;
+ u64 *dst_umin = &dst_reg->umin_value;
+ u64 *dst_umax = &dst_reg->umax_value;
+ s64 tmp_prod[4];
- if (smin_val < 0 || dst_reg->smin_value < 0) {
- /* Ain't nobody got time to multiply that sign */
- __mark_reg64_unbounded(dst_reg);
- return;
- }
- /* Both values are positive, so we can work with unsigned and
- * copy the result to signed (unless it exceeds S64_MAX).
- */
- if (umax_val > U32_MAX || dst_reg->umax_value > U32_MAX) {
- /* Potential overflow, we know nothing */
- __mark_reg64_unbounded(dst_reg);
- return;
+ if (check_mul_overflow(*dst_umax, src_reg->umax_value, dst_umax) ||
+ check_mul_overflow(*dst_umin, src_reg->umin_value, dst_umin)) {
+ /* Overflow possible, we know nothing */
+ *dst_umin = 0;
+ *dst_umax = U64_MAX;
}
- dst_reg->umin_value *= umin_val;
- dst_reg->umax_value *= umax_val;
- if (dst_reg->umax_value > S64_MAX) {
+ if (check_mul_overflow(*dst_smin, src_reg->smin_value, &tmp_prod[0]) ||
+ check_mul_overflow(*dst_smin, src_reg->smax_value, &tmp_prod[1]) ||
+ check_mul_overflow(*dst_smax, src_reg->smin_value, &tmp_prod[2]) ||
+ check_mul_overflow(*dst_smax, src_reg->smax_value, &tmp_prod[3])) {
/* Overflow possible, we know nothing */
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
+ *dst_smin = S64_MIN;
+ *dst_smax = S64_MAX;
} else {
- dst_reg->smin_value = dst_reg->umin_value;
- dst_reg->smax_value = dst_reg->umax_value;
+ *dst_smin = min_array(tmp_prod, 4);
+ *dst_smax = max_array(tmp_prod, 4);
}
}
@@ -13187,7 +14252,6 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
bool src_known = tnum_subreg_is_const(src_reg->var_off);
bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
struct tnum var32_off = tnum_subreg(dst_reg->var_off);
- s32 smin_val = src_reg->s32_min_value;
u32 umax_val = src_reg->u32_max_value;
if (src_known && dst_known) {
@@ -13200,18 +14264,16 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
*/
dst_reg->u32_min_value = var32_off.value;
dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val);
- if (dst_reg->s32_min_value < 0 || smin_val < 0) {
- /* Lose signed bounds when ANDing negative numbers,
- * ain't nobody got time for that.
- */
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
- } else {
- /* ANDing two positives gives a positive, so safe to
- * cast result into s64.
- */
+
+ /* Safe to set s32 bounds by casting u32 result into s32 when u32
+ * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
+ */
+ if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
dst_reg->s32_min_value = dst_reg->u32_min_value;
dst_reg->s32_max_value = dst_reg->u32_max_value;
+ } else {
+ dst_reg->s32_min_value = S32_MIN;
+ dst_reg->s32_max_value = S32_MAX;
}
}
@@ -13220,7 +14282,6 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
{
bool src_known = tnum_is_const(src_reg->var_off);
bool dst_known = tnum_is_const(dst_reg->var_off);
- s64 smin_val = src_reg->smin_value;
u64 umax_val = src_reg->umax_value;
if (src_known && dst_known) {
@@ -13233,18 +14294,16 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
*/
dst_reg->umin_value = dst_reg->var_off.value;
dst_reg->umax_value = min(dst_reg->umax_value, umax_val);
- if (dst_reg->smin_value < 0 || smin_val < 0) {
- /* Lose signed bounds when ANDing negative numbers,
- * ain't nobody got time for that.
- */
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
- } else {
- /* ANDing two positives gives a positive, so safe to
- * cast result into s64.
- */
+
+ /* Safe to set s64 bounds by casting u64 result into s64 when u64
+ * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
+ */
+ if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
dst_reg->smin_value = dst_reg->umin_value;
dst_reg->smax_value = dst_reg->umax_value;
+ } else {
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
}
/* We may learn something more from the var_off */
__update_reg_bounds(dst_reg);
@@ -13256,7 +14315,6 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
bool src_known = tnum_subreg_is_const(src_reg->var_off);
bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
struct tnum var32_off = tnum_subreg(dst_reg->var_off);
- s32 smin_val = src_reg->s32_min_value;
u32 umin_val = src_reg->u32_min_value;
if (src_known && dst_known) {
@@ -13269,18 +14327,16 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
*/
dst_reg->u32_min_value = max(dst_reg->u32_min_value, umin_val);
dst_reg->u32_max_value = var32_off.value | var32_off.mask;
- if (dst_reg->s32_min_value < 0 || smin_val < 0) {
- /* Lose signed bounds when ORing negative numbers,
- * ain't nobody got time for that.
- */
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
- } else {
- /* ORing two positives gives a positive, so safe to
- * cast result into s64.
- */
+
+ /* Safe to set s32 bounds by casting u32 result into s32 when u32
+ * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
+ */
+ if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
dst_reg->s32_min_value = dst_reg->u32_min_value;
dst_reg->s32_max_value = dst_reg->u32_max_value;
+ } else {
+ dst_reg->s32_min_value = S32_MIN;
+ dst_reg->s32_max_value = S32_MAX;
}
}
@@ -13289,7 +14345,6 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg,
{
bool src_known = tnum_is_const(src_reg->var_off);
bool dst_known = tnum_is_const(dst_reg->var_off);
- s64 smin_val = src_reg->smin_value;
u64 umin_val = src_reg->umin_value;
if (src_known && dst_known) {
@@ -13302,18 +14357,16 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg,
*/
dst_reg->umin_value = max(dst_reg->umin_value, umin_val);
dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;
- if (dst_reg->smin_value < 0 || smin_val < 0) {
- /* Lose signed bounds when ORing negative numbers,
- * ain't nobody got time for that.
- */
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
- } else {
- /* ORing two positives gives a positive, so safe to
- * cast result into s64.
- */
+
+ /* Safe to set s64 bounds by casting u64 result into s64 when u64
+ * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
+ */
+ if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
dst_reg->smin_value = dst_reg->umin_value;
dst_reg->smax_value = dst_reg->umax_value;
+ } else {
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
}
/* We may learn something more from the var_off */
__update_reg_bounds(dst_reg);
@@ -13325,7 +14378,6 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg,
bool src_known = tnum_subreg_is_const(src_reg->var_off);
bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
struct tnum var32_off = tnum_subreg(dst_reg->var_off);
- s32 smin_val = src_reg->s32_min_value;
if (src_known && dst_known) {
__mark_reg32_known(dst_reg, var32_off.value);
@@ -13336,10 +14388,10 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg,
dst_reg->u32_min_value = var32_off.value;
dst_reg->u32_max_value = var32_off.value | var32_off.mask;
- if (dst_reg->s32_min_value >= 0 && smin_val >= 0) {
- /* XORing two positive sign numbers gives a positive,
- * so safe to cast u32 result into s32.
- */
+ /* Safe to set s32 bounds by casting u32 result into s32 when u32
+ * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
+ */
+ if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
dst_reg->s32_min_value = dst_reg->u32_min_value;
dst_reg->s32_max_value = dst_reg->u32_max_value;
} else {
@@ -13353,7 +14405,6 @@ static void scalar_min_max_xor(struct bpf_reg_state *dst_reg,
{
bool src_known = tnum_is_const(src_reg->var_off);
bool dst_known = tnum_is_const(dst_reg->var_off);
- s64 smin_val = src_reg->smin_value;
if (src_known && dst_known) {
/* dst_reg->var_off.value has been updated earlier */
@@ -13365,10 +14416,10 @@ static void scalar_min_max_xor(struct bpf_reg_state *dst_reg,
dst_reg->umin_value = dst_reg->var_off.value;
dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;
- if (dst_reg->smin_value >= 0 && smin_val >= 0) {
- /* XORing two positive sign numbers gives a positive,
- * so safe to cast u64 result into s64.
- */
+ /* Safe to set s64 bounds by casting u64 result into s64 when u64
+ * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
+ */
+ if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
dst_reg->smin_value = dst_reg->umin_value;
dst_reg->smax_value = dst_reg->umax_value;
} else {
@@ -13576,6 +14627,46 @@ static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg,
__update_reg_bounds(dst_reg);
}
+static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn,
+ const struct bpf_reg_state *src_reg)
+{
+ bool src_is_const = false;
+ u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
+
+ if (insn_bitness == 32) {
+ if (tnum_subreg_is_const(src_reg->var_off)
+ && src_reg->s32_min_value == src_reg->s32_max_value
+ && src_reg->u32_min_value == src_reg->u32_max_value)
+ src_is_const = true;
+ } else {
+ if (tnum_is_const(src_reg->var_off)
+ && src_reg->smin_value == src_reg->smax_value
+ && src_reg->umin_value == src_reg->umax_value)
+ src_is_const = true;
+ }
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_ADD:
+ case BPF_SUB:
+ case BPF_AND:
+ case BPF_XOR:
+ case BPF_OR:
+ case BPF_MUL:
+ return true;
+
+ /* Shift operators range is only computable if shift dimension operand
+ * is a constant. Shifts greater than 31 or 63 are undefined. This
+ * includes shifts by a negative number.
+ */
+ case BPF_LSH:
+ case BPF_RSH:
+ case BPF_ARSH:
+ return (src_is_const && src_reg->umax_value < insn_bitness);
+ default:
+ return false;
+ }
+}
+
/* WARNING: This function does calculations on 64-bit values, but the actual
* execution may occur on 32-bit values. Therefore, things like bitshifts
* need extra checks in the 32-bit case.
@@ -13585,53 +14676,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
struct bpf_reg_state *dst_reg,
struct bpf_reg_state src_reg)
{
- struct bpf_reg_state *regs = cur_regs(env);
u8 opcode = BPF_OP(insn->code);
- bool src_known;
- s64 smin_val, smax_val;
- u64 umin_val, umax_val;
- s32 s32_min_val, s32_max_val;
- u32 u32_min_val, u32_max_val;
- u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64);
int ret;
- smin_val = src_reg.smin_value;
- smax_val = src_reg.smax_value;
- umin_val = src_reg.umin_value;
- umax_val = src_reg.umax_value;
-
- s32_min_val = src_reg.s32_min_value;
- s32_max_val = src_reg.s32_max_value;
- u32_min_val = src_reg.u32_min_value;
- u32_max_val = src_reg.u32_max_value;
-
- if (alu32) {
- src_known = tnum_subreg_is_const(src_reg.var_off);
- if ((src_known &&
- (s32_min_val != s32_max_val || u32_min_val != u32_max_val)) ||
- s32_min_val > s32_max_val || u32_min_val > u32_max_val) {
- /* Taint dst register if offset had invalid bounds
- * derived from e.g. dead branches.
- */
- __mark_reg_unknown(env, dst_reg);
- return 0;
- }
- } else {
- src_known = tnum_is_const(src_reg.var_off);
- if ((src_known &&
- (smin_val != smax_val || umin_val != umax_val)) ||
- smin_val > smax_val || umin_val > umax_val) {
- /* Taint dst register if offset had invalid bounds
- * derived from e.g. dead branches.
- */
- __mark_reg_unknown(env, dst_reg);
- return 0;
- }
- }
-
- if (!src_known &&
- opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) {
+ if (!is_safe_to_compute_dst_reg_range(insn, &src_reg)) {
__mark_reg_unknown(env, dst_reg);
return 0;
}
@@ -13688,46 +14737,24 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
scalar_min_max_xor(dst_reg, &src_reg);
break;
case BPF_LSH:
- if (umax_val >= insn_bitness) {
- /* Shifts greater than 31 or 63 are undefined.
- * This includes shifts by a negative number.
- */
- mark_reg_unknown(env, regs, insn->dst_reg);
- break;
- }
if (alu32)
scalar32_min_max_lsh(dst_reg, &src_reg);
else
scalar_min_max_lsh(dst_reg, &src_reg);
break;
case BPF_RSH:
- if (umax_val >= insn_bitness) {
- /* Shifts greater than 31 or 63 are undefined.
- * This includes shifts by a negative number.
- */
- mark_reg_unknown(env, regs, insn->dst_reg);
- break;
- }
if (alu32)
scalar32_min_max_rsh(dst_reg, &src_reg);
else
scalar_min_max_rsh(dst_reg, &src_reg);
break;
case BPF_ARSH:
- if (umax_val >= insn_bitness) {
- /* Shifts greater than 31 or 63 are undefined.
- * This includes shifts by a negative number.
- */
- mark_reg_unknown(env, regs, insn->dst_reg);
- break;
- }
if (alu32)
scalar32_min_max_arsh(dst_reg, &src_reg);
else
scalar_min_max_arsh(dst_reg, &src_reg);
break;
default:
- mark_reg_unknown(env, regs, insn->dst_reg);
break;
}
@@ -13748,18 +14775,30 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg;
struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
+ bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64);
u8 opcode = BPF_OP(insn->code);
int err;
dst_reg = &regs[insn->dst_reg];
src_reg = NULL;
+
+ if (dst_reg->type == PTR_TO_ARENA) {
+ struct bpf_insn_aux_data *aux = cur_aux(env);
+
+ if (BPF_CLASS(insn->code) == BPF_ALU64)
+ /*
+ * 32-bit operations zero upper bits automatically.
+ * 64-bit operations need to be converted to 32.
+ */
+ aux->needs_zext = true;
+
+ /* Any arithmetic operations are allowed on arena pointers */
+ return 0;
+ }
+
if (dst_reg->type != SCALAR_VALUE)
ptr_reg = dst_reg;
- else
- /* Make sure ID is cleared otherwise dst_reg min/max could be
- * incorrectly propagated into other registers by find_equal_scalars()
- */
- dst_reg->id = 0;
+
if (BPF_SRC(insn->code) == BPF_X) {
src_reg = &regs[insn->src_reg];
if (src_reg->type != SCALAR_VALUE) {
@@ -13814,16 +14853,53 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
/* Got here implies adding two SCALAR_VALUEs */
if (WARN_ON_ONCE(ptr_reg)) {
- print_verifier_state(env, state, true);
+ print_verifier_state(env, vstate, vstate->curframe, true);
verbose(env, "verifier internal error: unexpected ptr_reg\n");
return -EINVAL;
}
if (WARN_ON(!src_reg)) {
- print_verifier_state(env, state, true);
+ print_verifier_state(env, vstate, vstate->curframe, true);
verbose(env, "verifier internal error: no src_reg\n");
return -EINVAL;
}
- return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
+ err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
+ if (err)
+ return err;
+ /*
+ * Compilers can generate the code
+ * r1 = r2
+ * r1 += 0x1
+ * if r2 < 1000 goto ...
+ * use r1 in memory access
+ * So for 64-bit alu remember constant delta between r2 and r1 and
+ * update r1 after 'if' condition.
+ */
+ if (env->bpf_capable &&
+ BPF_OP(insn->code) == BPF_ADD && !alu32 &&
+ dst_reg->id && is_reg_const(src_reg, false)) {
+ u64 val = reg_const_value(src_reg, false);
+
+ if ((dst_reg->id & BPF_ADD_CONST) ||
+ /* prevent overflow in sync_linked_regs() later */
+ val > (u32)S32_MAX) {
+ /*
+ * If the register already went through rX += val
+ * we cannot accumulate another val into rx->off.
+ */
+ dst_reg->off = 0;
+ dst_reg->id = 0;
+ } else {
+ dst_reg->id |= BPF_ADD_CONST;
+ dst_reg->off = val;
+ }
+ } else {
+ /*
+ * Make sure ID is cleared otherwise dst_reg min/max could be
+ * incorrectly propagated into other registers by sync_linked_regs()
+ */
+ dst_reg->id = 0;
+ }
+ return 0;
}
/* check validity of 32-bit and 64-bit arithmetic operations */
@@ -13870,19 +14946,24 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
} else if (opcode == BPF_MOV) {
if (BPF_SRC(insn->code) == BPF_X) {
- if (insn->imm != 0) {
- verbose(env, "BPF_MOV uses reserved fields\n");
- return -EINVAL;
- }
-
if (BPF_CLASS(insn->code) == BPF_ALU) {
- if (insn->off != 0 && insn->off != 8 && insn->off != 16) {
+ if ((insn->off != 0 && insn->off != 8 && insn->off != 16) ||
+ insn->imm) {
verbose(env, "BPF_MOV uses reserved fields\n");
return -EINVAL;
}
+ } else if (insn->off == BPF_ADDR_SPACE_CAST) {
+ if (insn->imm != 1 && insn->imm != 1u << 16) {
+ verbose(env, "addr_space_cast insn can only convert between address space 1 and 0\n");
+ return -EINVAL;
+ }
+ if (!env->prog->aux->arena) {
+ verbose(env, "addr_space_cast insn can only be used in a program that has an associated arena\n");
+ return -EINVAL;
+ }
} else {
- if (insn->off != 0 && insn->off != 8 && insn->off != 16 &&
- insn->off != 32) {
+ if ((insn->off != 0 && insn->off != 8 && insn->off != 16 &&
+ insn->off != 32) || insn->imm) {
verbose(env, "BPF_MOV uses reserved fields\n");
return -EINVAL;
}
@@ -13907,20 +14988,21 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (BPF_SRC(insn->code) == BPF_X) {
struct bpf_reg_state *src_reg = regs + insn->src_reg;
struct bpf_reg_state *dst_reg = regs + insn->dst_reg;
- bool need_id = src_reg->type == SCALAR_VALUE && !src_reg->id &&
- !tnum_is_const(src_reg->var_off);
if (BPF_CLASS(insn->code) == BPF_ALU64) {
- if (insn->off == 0) {
+ if (insn->imm) {
+ /* off == BPF_ADDR_SPACE_CAST */
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ if (insn->imm == 1) { /* cast from as(1) to as(0) */
+ dst_reg->type = PTR_TO_ARENA;
+ /* PTR_TO_ARENA is 32-bit */
+ dst_reg->subreg_def = env->insn_idx + 1;
+ }
+ } else if (insn->off == 0) {
/* case: R1 = R2
* copy register state to dest reg
*/
- if (need_id)
- /* Assign src and dst registers the same ID
- * that will be used by find_equal_scalars()
- * to propagate min/max range.
- */
- src_reg->id = ++env->id_gen;
+ assign_scalar_id_before_mov(env, src_reg);
copy_register_state(dst_reg, src_reg);
dst_reg->live |= REG_LIVE_WRITTEN;
dst_reg->subreg_def = DEF_NOT_SUBREG;
@@ -13935,8 +15017,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
bool no_sext;
no_sext = src_reg->umax_value < (1ULL << (insn->off - 1));
- if (no_sext && need_id)
- src_reg->id = ++env->id_gen;
+ if (no_sext)
+ assign_scalar_id_before_mov(env, src_reg);
copy_register_state(dst_reg, src_reg);
if (!no_sext)
dst_reg->id = 0;
@@ -13956,14 +15038,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EACCES;
} else if (src_reg->type == SCALAR_VALUE) {
if (insn->off == 0) {
- bool is_src_reg_u32 = src_reg->umax_value <= U32_MAX;
+ bool is_src_reg_u32 = get_reg_width(src_reg) <= 32;
- if (is_src_reg_u32 && need_id)
- src_reg->id = ++env->id_gen;
+ if (is_src_reg_u32)
+ assign_scalar_id_before_mov(env, src_reg);
copy_register_state(dst_reg, src_reg);
/* Make sure ID is cleared if src_reg is not in u32
* range otherwise dst_reg min/max could be incorrectly
- * propagated into src_reg by find_equal_scalars()
+ * propagated into src_reg by sync_linked_regs()
*/
if (!is_src_reg_u32)
dst_reg->id = 0;
@@ -13973,8 +15055,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
/* case: W1 = (s8, s16)W2 */
bool no_sext = src_reg->umax_value < (1ULL << (insn->off - 1));
- if (no_sext && need_id)
- src_reg->id = ++env->id_gen;
+ if (no_sext)
+ assign_scalar_id_before_mov(env, src_reg);
copy_register_state(dst_reg, src_reg);
if (!no_sext)
dst_reg->id = 0;
@@ -14412,7 +15494,19 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state
struct tnum t;
u64 val;
-again:
+ /* In case of GE/GT/SGE/JST, reuse LE/LT/SLE/SLT logic from below */
+ switch (opcode) {
+ case BPF_JGE:
+ case BPF_JGT:
+ case BPF_JSGE:
+ case BPF_JSGT:
+ opcode = flip_opcode(opcode);
+ swap(reg1, reg2);
+ break;
+ default:
+ break;
+ }
+
switch (opcode) {
case BPF_JEQ:
if (is_jmp32) {
@@ -14555,14 +15649,6 @@ again:
reg2->smin_value = max(reg1->smin_value + 1, reg2->smin_value);
}
break;
- case BPF_JGE:
- case BPF_JGT:
- case BPF_JSGE:
- case BPF_JSGT:
- /* just reuse LE/LT logic above */
- opcode = flip_opcode(opcode);
- swap(reg1, reg2);
- goto again;
default:
return;
}
@@ -14570,7 +15656,7 @@ again:
/* Adjusts the register min/max values in the case that the dst_reg and
* src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K
- * check, in which case we havea fake SCALAR_VALUE representing insn->imm).
+ * check, in which case we have a fake SCALAR_VALUE representing insn->imm).
* Technically we can do similar adjustments for pointers to the same object,
* but we don't support that right now.
*/
@@ -14670,7 +15756,7 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
* No one could have freed the reference state before
* doing the NULL check.
*/
- WARN_ON_ONCE(release_reference_state(state, id));
+ WARN_ON_ONCE(release_reference_nomark(vstate, id));
bpf_for_each_reg_in_vstate(vstate, state, reg, ({
mark_ptr_or_null_reg(state, reg, id, is_null);
@@ -14782,16 +15868,97 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
return true;
}
-static void find_equal_scalars(struct bpf_verifier_state *vstate,
- struct bpf_reg_state *known_reg)
+static void __collect_linked_regs(struct linked_regs *reg_set, struct bpf_reg_state *reg,
+ u32 id, u32 frameno, u32 spi_or_reg, bool is_reg)
{
- struct bpf_func_state *state;
+ struct linked_reg *e;
+
+ if (reg->type != SCALAR_VALUE || (reg->id & ~BPF_ADD_CONST) != id)
+ return;
+
+ e = linked_regs_push(reg_set);
+ if (e) {
+ e->frameno = frameno;
+ e->is_reg = is_reg;
+ e->regno = spi_or_reg;
+ } else {
+ reg->id = 0;
+ }
+}
+
+/* For all R being scalar registers or spilled scalar registers
+ * in verifier state, save R in linked_regs if R->id == id.
+ * If there are too many Rs sharing same id, reset id for leftover Rs.
+ */
+static void collect_linked_regs(struct bpf_verifier_state *vstate, u32 id,
+ struct linked_regs *linked_regs)
+{
+ struct bpf_func_state *func;
struct bpf_reg_state *reg;
+ int i, j;
+
+ id = id & ~BPF_ADD_CONST;
+ for (i = vstate->curframe; i >= 0; i--) {
+ func = vstate->frame[i];
+ for (j = 0; j < BPF_REG_FP; j++) {
+ reg = &func->regs[j];
+ __collect_linked_regs(linked_regs, reg, id, i, j, true);
+ }
+ for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
+ if (!is_spilled_reg(&func->stack[j]))
+ continue;
+ reg = &func->stack[j].spilled_ptr;
+ __collect_linked_regs(linked_regs, reg, id, i, j, false);
+ }
+ }
+}
+
+/* For all R in linked_regs, copy known_reg range into R
+ * if R->id == known_reg->id.
+ */
+static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_state *known_reg,
+ struct linked_regs *linked_regs)
+{
+ struct bpf_reg_state fake_reg;
+ struct bpf_reg_state *reg;
+ struct linked_reg *e;
+ int i;
+
+ for (i = 0; i < linked_regs->cnt; ++i) {
+ e = &linked_regs->entries[i];
+ reg = e->is_reg ? &vstate->frame[e->frameno]->regs[e->regno]
+ : &vstate->frame[e->frameno]->stack[e->spi].spilled_ptr;
+ if (reg->type != SCALAR_VALUE || reg == known_reg)
+ continue;
+ if ((reg->id & ~BPF_ADD_CONST) != (known_reg->id & ~BPF_ADD_CONST))
+ continue;
+ if ((!(reg->id & BPF_ADD_CONST) && !(known_reg->id & BPF_ADD_CONST)) ||
+ reg->off == known_reg->off) {
+ s32 saved_subreg_def = reg->subreg_def;
- bpf_for_each_reg_in_vstate(vstate, state, reg, ({
- if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
copy_register_state(reg, known_reg);
- }));
+ reg->subreg_def = saved_subreg_def;
+ } else {
+ s32 saved_subreg_def = reg->subreg_def;
+ s32 saved_off = reg->off;
+
+ fake_reg.type = SCALAR_VALUE;
+ __mark_reg_known(&fake_reg, (s32)reg->off - (s32)known_reg->off);
+
+ /* reg = known_reg; reg += delta */
+ copy_register_state(reg, known_reg);
+ /*
+ * Must preserve off, id and add_const flag,
+ * otherwise another sync_linked_regs() will be incorrect.
+ */
+ reg->off = saved_off;
+ reg->subreg_def = saved_subreg_def;
+
+ scalar32_min_max_add(reg, &fake_reg);
+ scalar_min_max_add(reg, &fake_reg);
+ reg->var_off = tnum_add(reg->var_off, fake_reg.var_off);
+ }
+ }
}
static int check_cond_jmp_op(struct bpf_verifier_env *env,
@@ -14802,18 +15969,42 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;
struct bpf_reg_state *eq_branch_regs;
- struct bpf_reg_state fake_reg = {};
+ struct linked_regs linked_regs = {};
u8 opcode = BPF_OP(insn->code);
bool is_jmp32;
int pred = -1;
int err;
/* Only conditional jumps are expected to reach here. */
- if (opcode == BPF_JA || opcode > BPF_JSLE) {
+ if (opcode == BPF_JA || opcode > BPF_JCOND) {
verbose(env, "invalid BPF_JMP/JMP32 opcode %x\n", opcode);
return -EINVAL;
}
+ if (opcode == BPF_JCOND) {
+ struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st;
+ int idx = *insn_idx;
+
+ if (insn->code != (BPF_JMP | BPF_JCOND) ||
+ insn->src_reg != BPF_MAY_GOTO ||
+ insn->dst_reg || insn->imm) {
+ verbose(env, "invalid may_goto imm %d\n", insn->imm);
+ return -EINVAL;
+ }
+ prev_st = find_prev_entry(env, cur_st->parent, idx);
+
+ /* branch out 'fallthrough' insn as a new state to explore */
+ queued_st = push_stack(env, idx + 1, idx, false);
+ if (!queued_st)
+ return -ENOMEM;
+
+ queued_st->may_goto_depth++;
+ if (prev_st)
+ widen_imprecise_scalars(env, prev_st, queued_st);
+ *insn_idx += insn->off;
+ return 0;
+ }
+
/* check src2 operand */
err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
@@ -14843,7 +16034,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
return -EINVAL;
}
- src_reg = &fake_reg;
+ src_reg = &env->fake_reg[0];
+ memset(src_reg, 0, sizeof(*src_reg));
src_reg->type = SCALAR_VALUE;
__mark_reg_known(src_reg, insn->imm);
}
@@ -14873,7 +16065,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
*insn_idx))
return -EFAULT;
if (env->log.level & BPF_LOG_LEVEL)
- print_insn_state(env, this_branch->frame[this_branch->curframe]);
+ print_insn_state(env, this_branch, this_branch->curframe);
*insn_idx += insn->off;
return 0;
} else if (pred == 0) {
@@ -14887,10 +16079,25 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
*insn_idx))
return -EFAULT;
if (env->log.level & BPF_LOG_LEVEL)
- print_insn_state(env, this_branch->frame[this_branch->curframe]);
+ print_insn_state(env, this_branch, this_branch->curframe);
return 0;
}
+ /* Push scalar registers sharing same ID to jump history,
+ * do this before creating 'other_branch', so that both
+ * 'this_branch' and 'other_branch' share this history
+ * if parent state is created.
+ */
+ if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id)
+ collect_linked_regs(this_branch, src_reg->id, &linked_regs);
+ if (dst_reg->type == SCALAR_VALUE && dst_reg->id)
+ collect_linked_regs(this_branch, dst_reg->id, &linked_regs);
+ if (linked_regs.cnt > 1) {
+ err = push_insn_history(env, this_branch, 0, linked_regs_pack(&linked_regs));
+ if (err)
+ return err;
+ }
+
other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx,
false);
if (!other_branch)
@@ -14903,10 +16110,16 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
&other_branch_regs[insn->src_reg],
dst_reg, src_reg, opcode, is_jmp32);
} else /* BPF_SRC(insn->code) == BPF_K */ {
+ /* reg_set_min_max() can mangle the fake_reg. Make a copy
+ * so that these are two different memory locations. The
+ * src_reg is not used beyond here in context of K.
+ */
+ memcpy(&env->fake_reg[1], &env->fake_reg[0],
+ sizeof(env->fake_reg[0]));
err = reg_set_min_max(env,
&other_branch_regs[insn->dst_reg],
- src_reg /* fake one */,
- dst_reg, src_reg /* same fake one */,
+ &env->fake_reg[0],
+ dst_reg, &env->fake_reg[1],
opcode, is_jmp32);
}
if (err)
@@ -14915,13 +16128,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
if (BPF_SRC(insn->code) == BPF_X &&
src_reg->type == SCALAR_VALUE && src_reg->id &&
!WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) {
- find_equal_scalars(this_branch, src_reg);
- find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]);
+ sync_linked_regs(this_branch, src_reg, &linked_regs);
+ sync_linked_regs(other_branch, &other_branch_regs[insn->src_reg], &linked_regs);
}
if (dst_reg->type == SCALAR_VALUE && dst_reg->id &&
!WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) {
- find_equal_scalars(this_branch, dst_reg);
- find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]);
+ sync_linked_regs(this_branch, dst_reg, &linked_regs);
+ sync_linked_regs(other_branch, &other_branch_regs[insn->dst_reg], &linked_regs);
}
/* if one pointer register is compared to another pointer
@@ -14983,7 +16196,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return -EACCES;
}
if (env->log.level & BPF_LOG_LEVEL)
- print_insn_state(env, this_branch->frame[this_branch->curframe]);
+ print_insn_state(env, this_branch, this_branch->curframe);
return 0;
}
@@ -15065,6 +16278,10 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (insn->src_reg == BPF_PSEUDO_MAP_VALUE ||
insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) {
+ if (map->map_type == BPF_MAP_TYPE_ARENA) {
+ __mark_reg_unknown(env, dst_reg);
+ return 0;
+ }
dst_reg->type = PTR_TO_MAP_VALUE;
dst_reg->off = aux->map_off;
WARN_ON_ONCE(map->max_entries != 1);
@@ -15140,21 +16357,9 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
* gen_ld_abs() may terminate the program at runtime, leading to
* reference leak.
*/
- err = check_reference_leak(env, false);
- if (err) {
- verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n");
+ err = check_resource_leak(env, false, true, "BPF_LD_[ABS|IND]");
+ if (err)
return err;
- }
-
- if (env->cur_state->active_lock.ptr) {
- verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n");
- return -EINVAL;
- }
-
- if (env->cur_state->active_rcu_lock) {
- verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_rcu_read_lock-ed region\n");
- return -EINVAL;
- }
if (regs[ctx_reg].type != PTR_TO_CTX) {
verbose(env,
@@ -15200,6 +16405,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
int err;
struct bpf_func_state *frame = env->cur_state->frame[0];
const bool is_subprog = frame->subprogno;
+ bool return_32bit = false;
/* LSM and struct_ops func-ptr's return type could be "void" */
if (!is_subprog || frame->in_exception_callback_fn) {
@@ -15299,18 +16505,30 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
return -ENOTSUPP;
}
break;
+ case BPF_PROG_TYPE_KPROBE:
+ switch (env->prog->expected_attach_type) {
+ case BPF_TRACE_KPROBE_SESSION:
+ case BPF_TRACE_UPROBE_SESSION:
+ range = retval_range(0, 1);
+ break;
+ default:
+ return 0;
+ }
+ break;
case BPF_PROG_TYPE_SK_LOOKUP:
range = retval_range(SK_DROP, SK_PASS);
break;
case BPF_PROG_TYPE_LSM:
if (env->prog->expected_attach_type != BPF_LSM_CGROUP) {
- /* Regular BPF_PROG_TYPE_LSM programs can return
- * any value.
- */
- return 0;
- }
- if (!env->prog->aux->attach_func_proto->type) {
+ /* no range found, any return value is allowed */
+ if (!get_func_retval_range(env->prog, &range))
+ return 0;
+ /* no restricted range, any return value is allowed */
+ if (range.minval == S32_MIN && range.maxval == S32_MAX)
+ return 0;
+ return_32bit = true;
+ } else if (!env->prog->aux->attach_func_proto->type) {
/* Make sure programs that attach to void
* hooks don't try to modify return value.
*/
@@ -15340,7 +16558,7 @@ enforce_retval:
if (err)
return err;
- if (!retval_range_within(range, reg)) {
+ if (!retval_range_within(range, reg, return_32bit)) {
verbose_invalid_scalar(env, reg, range, exit_ctx, reg_name);
if (!is_subprog &&
prog->expected_attach_type == BPF_LSM_CGROUP &&
@@ -15356,6 +16574,29 @@ enforce_retval:
return 0;
}
+static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off)
+{
+ struct bpf_subprog_info *subprog;
+
+ subprog = find_containing_subprog(env, off);
+ subprog->changes_pkt_data = true;
+}
+
+/* 't' is an index of a call-site.
+ * 'w' is a callee entry point.
+ * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED.
+ * Rely on DFS traversal order and absence of recursive calls to guarantee that
+ * callee's change_pkt_data marks would be correct at that moment.
+ */
+static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w)
+{
+ struct bpf_subprog_info *caller, *callee;
+
+ caller = find_containing_subprog(env, t);
+ callee = find_containing_subprog(env, w);
+ caller->changes_pkt_data |= callee->changes_pkt_data;
+}
+
/* non-recursive DFS pseudo code
* 1 procedure DFS-iterative(G,v):
* 2 label v as discovered
@@ -15489,6 +16730,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
bool visit_callee)
{
int ret, insn_sz;
+ int w;
insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1;
ret = push_insn(t, t + insn_sz, FALLTHROUGH, env);
@@ -15500,12 +16742,279 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
mark_jmp_point(env, t + insn_sz);
if (visit_callee) {
+ w = t + insns[t].imm + 1;
mark_prune_point(env, t);
- ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env);
+ merge_callee_effects(env, t, w);
+ ret = push_insn(t, w, BRANCH, env);
}
return ret;
}
+/* Bitmask with 1s for all caller saved registers */
+#define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1)
+
+/* Return a bitmask specifying which caller saved registers are
+ * clobbered by a call to a helper *as if* this helper follows
+ * bpf_fastcall contract:
+ * - includes R0 if function is non-void;
+ * - includes R1-R5 if corresponding parameter has is described
+ * in the function prototype.
+ */
+static u32 helper_fastcall_clobber_mask(const struct bpf_func_proto *fn)
+{
+ u32 mask;
+ int i;
+
+ mask = 0;
+ if (fn->ret_type != RET_VOID)
+ mask |= BIT(BPF_REG_0);
+ for (i = 0; i < ARRAY_SIZE(fn->arg_type); ++i)
+ if (fn->arg_type[i] != ARG_DONTCARE)
+ mask |= BIT(BPF_REG_1 + i);
+ return mask;
+}
+
+/* True if do_misc_fixups() replaces calls to helper number 'imm',
+ * replacement patch is presumed to follow bpf_fastcall contract
+ * (see mark_fastcall_pattern_for_call() below).
+ */
+static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm)
+{
+ switch (imm) {
+#ifdef CONFIG_X86_64
+ case BPF_FUNC_get_smp_processor_id:
+ return env->prog->jit_requested && bpf_jit_supports_percpu_insn();
+#endif
+ default:
+ return false;
+ }
+}
+
+/* Same as helper_fastcall_clobber_mask() but for kfuncs, see comment above */
+static u32 kfunc_fastcall_clobber_mask(struct bpf_kfunc_call_arg_meta *meta)
+{
+ u32 vlen, i, mask;
+
+ vlen = btf_type_vlen(meta->func_proto);
+ mask = 0;
+ if (!btf_type_is_void(btf_type_by_id(meta->btf, meta->func_proto->type)))
+ mask |= BIT(BPF_REG_0);
+ for (i = 0; i < vlen; ++i)
+ mask |= BIT(BPF_REG_1 + i);
+ return mask;
+}
+
+/* Same as verifier_inlines_helper_call() but for kfuncs, see comment above */
+static bool is_fastcall_kfunc_call(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_FASTCALL;
+}
+
+/* LLVM define a bpf_fastcall function attribute.
+ * This attribute means that function scratches only some of
+ * the caller saved registers defined by ABI.
+ * For BPF the set of such registers could be defined as follows:
+ * - R0 is scratched only if function is non-void;
+ * - R1-R5 are scratched only if corresponding parameter type is defined
+ * in the function prototype.
+ *
+ * The contract between kernel and clang allows to simultaneously use
+ * such functions and maintain backwards compatibility with old
+ * kernels that don't understand bpf_fastcall calls:
+ *
+ * - for bpf_fastcall calls clang allocates registers as-if relevant r0-r5
+ * registers are not scratched by the call;
+ *
+ * - as a post-processing step, clang visits each bpf_fastcall call and adds
+ * spill/fill for every live r0-r5;
+ *
+ * - stack offsets used for the spill/fill are allocated as lowest
+ * stack offsets in whole function and are not used for any other
+ * purposes;
+ *
+ * - when kernel loads a program, it looks for such patterns
+ * (bpf_fastcall function surrounded by spills/fills) and checks if
+ * spill/fill stack offsets are used exclusively in fastcall patterns;
+ *
+ * - if so, and if verifier or current JIT inlines the call to the
+ * bpf_fastcall function (e.g. a helper call), kernel removes unnecessary
+ * spill/fill pairs;
+ *
+ * - when old kernel loads a program, presence of spill/fill pairs
+ * keeps BPF program valid, albeit slightly less efficient.
+ *
+ * For example:
+ *
+ * r1 = 1;
+ * r2 = 2;
+ * *(u64 *)(r10 - 8) = r1; r1 = 1;
+ * *(u64 *)(r10 - 16) = r2; r2 = 2;
+ * call %[to_be_inlined] --> call %[to_be_inlined]
+ * r2 = *(u64 *)(r10 - 16); r0 = r1;
+ * r1 = *(u64 *)(r10 - 8); r0 += r2;
+ * r0 = r1; exit;
+ * r0 += r2;
+ * exit;
+ *
+ * The purpose of mark_fastcall_pattern_for_call is to:
+ * - look for such patterns;
+ * - mark spill and fill instructions in env->insn_aux_data[*].fastcall_pattern;
+ * - mark set env->insn_aux_data[*].fastcall_spills_num for call instruction;
+ * - update env->subprog_info[*]->fastcall_stack_off to find an offset
+ * at which bpf_fastcall spill/fill stack slots start;
+ * - update env->subprog_info[*]->keep_fastcall_stack.
+ *
+ * The .fastcall_pattern and .fastcall_stack_off are used by
+ * check_fastcall_stack_contract() to check if every stack access to
+ * fastcall spill/fill stack slot originates from spill/fill
+ * instructions, members of fastcall patterns.
+ *
+ * If such condition holds true for a subprogram, fastcall patterns could
+ * be rewritten by remove_fastcall_spills_fills().
+ * Otherwise bpf_fastcall patterns are not changed in the subprogram
+ * (code, presumably, generated by an older clang version).
+ *
+ * For example, it is *not* safe to remove spill/fill below:
+ *
+ * r1 = 1;
+ * *(u64 *)(r10 - 8) = r1; r1 = 1;
+ * call %[to_be_inlined] --> call %[to_be_inlined]
+ * r1 = *(u64 *)(r10 - 8); r0 = *(u64 *)(r10 - 8); <---- wrong !!!
+ * r0 = *(u64 *)(r10 - 8); r0 += r1;
+ * r0 += r1; exit;
+ * exit;
+ */
+static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env,
+ struct bpf_subprog_info *subprog,
+ int insn_idx, s16 lowest_off)
+{
+ struct bpf_insn *insns = env->prog->insnsi, *stx, *ldx;
+ struct bpf_insn *call = &env->prog->insnsi[insn_idx];
+ const struct bpf_func_proto *fn;
+ u32 clobbered_regs_mask = ALL_CALLER_SAVED_REGS;
+ u32 expected_regs_mask;
+ bool can_be_inlined = false;
+ s16 off;
+ int i;
+
+ if (bpf_helper_call(call)) {
+ if (get_helper_proto(env, call->imm, &fn) < 0)
+ /* error would be reported later */
+ return;
+ clobbered_regs_mask = helper_fastcall_clobber_mask(fn);
+ can_be_inlined = fn->allow_fastcall &&
+ (verifier_inlines_helper_call(env, call->imm) ||
+ bpf_jit_inlines_helper_call(call->imm));
+ }
+
+ if (bpf_pseudo_kfunc_call(call)) {
+ struct bpf_kfunc_call_arg_meta meta;
+ int err;
+
+ err = fetch_kfunc_meta(env, call, &meta, NULL);
+ if (err < 0)
+ /* error would be reported later */
+ return;
+
+ clobbered_regs_mask = kfunc_fastcall_clobber_mask(&meta);
+ can_be_inlined = is_fastcall_kfunc_call(&meta);
+ }
+
+ if (clobbered_regs_mask == ALL_CALLER_SAVED_REGS)
+ return;
+
+ /* e.g. if helper call clobbers r{0,1}, expect r{2,3,4,5} in the pattern */
+ expected_regs_mask = ~clobbered_regs_mask & ALL_CALLER_SAVED_REGS;
+
+ /* match pairs of form:
+ *
+ * *(u64 *)(r10 - Y) = rX (where Y % 8 == 0)
+ * ...
+ * call %[to_be_inlined]
+ * ...
+ * rX = *(u64 *)(r10 - Y)
+ */
+ for (i = 1, off = lowest_off; i <= ARRAY_SIZE(caller_saved); ++i, off += BPF_REG_SIZE) {
+ if (insn_idx - i < 0 || insn_idx + i >= env->prog->len)
+ break;
+ stx = &insns[insn_idx - i];
+ ldx = &insns[insn_idx + i];
+ /* must be a stack spill/fill pair */
+ if (stx->code != (BPF_STX | BPF_MEM | BPF_DW) ||
+ ldx->code != (BPF_LDX | BPF_MEM | BPF_DW) ||
+ stx->dst_reg != BPF_REG_10 ||
+ ldx->src_reg != BPF_REG_10)
+ break;
+ /* must be a spill/fill for the same reg */
+ if (stx->src_reg != ldx->dst_reg)
+ break;
+ /* must be one of the previously unseen registers */
+ if ((BIT(stx->src_reg) & expected_regs_mask) == 0)
+ break;
+ /* must be a spill/fill for the same expected offset,
+ * no need to check offset alignment, BPF_DW stack access
+ * is always 8-byte aligned.
+ */
+ if (stx->off != off || ldx->off != off)
+ break;
+ expected_regs_mask &= ~BIT(stx->src_reg);
+ env->insn_aux_data[insn_idx - i].fastcall_pattern = 1;
+ env->insn_aux_data[insn_idx + i].fastcall_pattern = 1;
+ }
+ if (i == 1)
+ return;
+
+ /* Conditionally set 'fastcall_spills_num' to allow forward
+ * compatibility when more helper functions are marked as
+ * bpf_fastcall at compile time than current kernel supports, e.g:
+ *
+ * 1: *(u64 *)(r10 - 8) = r1
+ * 2: call A ;; assume A is bpf_fastcall for current kernel
+ * 3: r1 = *(u64 *)(r10 - 8)
+ * 4: *(u64 *)(r10 - 8) = r1
+ * 5: call B ;; assume B is not bpf_fastcall for current kernel
+ * 6: r1 = *(u64 *)(r10 - 8)
+ *
+ * There is no need to block bpf_fastcall rewrite for such program.
+ * Set 'fastcall_pattern' for both calls to keep check_fastcall_stack_contract() happy,
+ * don't set 'fastcall_spills_num' for call B so that remove_fastcall_spills_fills()
+ * does not remove spill/fill pair {4,6}.
+ */
+ if (can_be_inlined)
+ env->insn_aux_data[insn_idx].fastcall_spills_num = i - 1;
+ else
+ subprog->keep_fastcall_stack = 1;
+ subprog->fastcall_stack_off = min(subprog->fastcall_stack_off, off);
+}
+
+static int mark_fastcall_patterns(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *subprog = env->subprog_info;
+ struct bpf_insn *insn;
+ s16 lowest_off;
+ int s, i;
+
+ for (s = 0; s < env->subprog_cnt; ++s, ++subprog) {
+ /* find lowest stack spill offset used in this subprog */
+ lowest_off = 0;
+ for (i = subprog->start; i < (subprog + 1)->start; ++i) {
+ insn = env->prog->insnsi + i;
+ if (insn->code != (BPF_STX | BPF_MEM | BPF_DW) ||
+ insn->dst_reg != BPF_REG_10)
+ continue;
+ lowest_off = min(lowest_off, insn->off);
+ }
+ /* use this offset to find fastcall patterns */
+ for (i = subprog->start; i < (subprog + 1)->start; ++i) {
+ insn = env->prog->insnsi + i;
+ if (insn->code != (BPF_JMP | BPF_CALL))
+ continue;
+ mark_fastcall_pattern_for_call(env, subprog, i, lowest_off);
+ }
+ }
+ return 0;
+}
+
/* Visits the instruction at index t and returns one of the following:
* < 0 - an error occurred
* DONE_EXPLORING - the instruction was fully explored
@@ -15531,7 +17040,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
return DONE_EXPLORING;
case BPF_CALL:
- if (insn->src_reg == 0 && insn->imm == BPF_FUNC_timer_set_callback)
+ if (is_async_callback_calling_insn(insn))
/* Mark this call insn as a prune point to trigger
* is_state_visited() check before call itself is
* processed by __check_func_call(). Otherwise new
@@ -15553,6 +17062,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
mark_prune_point(env, t);
mark_jmp_point(env, t);
}
+ if (bpf_helper_call(insn) && bpf_helper_changes_pkt_data(insn->imm))
+ mark_subprog_changes_pkt_data(env, t);
if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
struct bpf_kfunc_call_arg_meta meta;
@@ -15597,6 +17108,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
default:
/* conditional jump with two edges */
mark_prune_point(env, t);
+ if (is_may_goto_insn(insn))
+ mark_force_checkpoint(env, t);
ret = push_insn(t, t + 1, FALLTHROUGH, env);
if (ret)
@@ -15685,6 +17198,7 @@ walk_cfg:
}
}
ret = 0; /* cfg looks good */
+ env->prog->aux->changes_pkt_data = env->subprog_info[0].changes_pkt_data;
err_free:
kvfree(insn_state);
@@ -16160,8 +17674,8 @@ static int check_btf_info(struct bpf_verifier_env *env,
}
/* check %cur's range satisfies %old's */
-static bool range_within(struct bpf_reg_state *old,
- struct bpf_reg_state *cur)
+static bool range_within(const struct bpf_reg_state *old,
+ const struct bpf_reg_state *cur)
{
return old->umin_value <= cur->umin_value &&
old->umax_value >= cur->umax_value &&
@@ -16325,21 +17839,28 @@ static bool regs_exact(const struct bpf_reg_state *rold,
check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
}
+enum exact_level {
+ NOT_EXACT,
+ EXACT,
+ RANGE_WITHIN
+};
+
/* Returns true if (rold safe implies rcur safe) */
static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
- struct bpf_reg_state *rcur, struct bpf_idmap *idmap, bool exact)
+ struct bpf_reg_state *rcur, struct bpf_idmap *idmap,
+ enum exact_level exact)
{
- if (exact)
+ if (exact == EXACT)
return regs_exact(rold, rcur, idmap);
- if (!(rold->live & REG_LIVE_READ))
+ if (!(rold->live & REG_LIVE_READ) && exact == NOT_EXACT)
/* explored state didn't use this */
return true;
- if (rold->type == NOT_INIT)
- /* explored state can't have used this */
- return true;
- if (rcur->type == NOT_INIT)
- return false;
+ if (rold->type == NOT_INIT) {
+ if (exact == NOT_EXACT || rcur->type == NOT_INIT)
+ /* explored state can't have used this */
+ return true;
+ }
/* Enforce that register types have to match exactly, including their
* modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general
@@ -16374,8 +17895,12 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
check_scalar_ids(rold->id, rcur->id, idmap);
}
- if (!rold->precise)
+ if (!rold->precise && exact == NOT_EXACT)
return true;
+ if ((rold->id & BPF_ADD_CONST) != (rcur->id & BPF_ADD_CONST))
+ return false;
+ if ((rold->id & BPF_ADD_CONST) && (rold->off != rcur->off))
+ return false;
/* Why check_ids() for scalar registers?
*
* Consider the following BPF code:
@@ -16388,7 +17913,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
*
* First verification path is [1-6]:
* - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7;
- * - at (5) r6 would be marked <= X, find_equal_scalars() would also mark
+ * - at (5) r6 would be marked <= X, sync_linked_regs() would also mark
* r7 <= X, because r6 and r7 share same id.
* Next verification path is [1-4, 6].
*
@@ -16442,13 +17967,53 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
* the same stack frame, since fp-8 in foo != fp-8 in bar
*/
return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno;
+ case PTR_TO_ARENA:
+ return true;
default:
return regs_exact(rold, rcur, idmap);
}
}
+static struct bpf_reg_state unbound_reg;
+
+static __init int unbound_reg_init(void)
+{
+ __mark_reg_unknown_imprecise(&unbound_reg);
+ unbound_reg.live |= REG_LIVE_READ;
+ return 0;
+}
+late_initcall(unbound_reg_init);
+
+static bool is_stack_all_misc(struct bpf_verifier_env *env,
+ struct bpf_stack_state *stack)
+{
+ u32 i;
+
+ for (i = 0; i < ARRAY_SIZE(stack->slot_type); ++i) {
+ if ((stack->slot_type[i] == STACK_MISC) ||
+ (stack->slot_type[i] == STACK_INVALID && env->allow_uninit_stack))
+ continue;
+ return false;
+ }
+
+ return true;
+}
+
+static struct bpf_reg_state *scalar_reg_for_stack(struct bpf_verifier_env *env,
+ struct bpf_stack_state *stack)
+{
+ if (is_spilled_scalar_reg64(stack))
+ return &stack->spilled_ptr;
+
+ if (is_stack_all_misc(env, stack))
+ return &unbound_reg;
+
+ return NULL;
+}
+
static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
- struct bpf_func_state *cur, struct bpf_idmap *idmap, bool exact)
+ struct bpf_func_state *cur, struct bpf_idmap *idmap,
+ enum exact_level exact)
{
int i, spi;
@@ -16461,12 +18026,14 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
spi = i / BPF_REG_SIZE;
- if (exact &&
- old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
- cur->stack[spi].slot_type[i % BPF_REG_SIZE])
+ if (exact != NOT_EXACT &&
+ (i >= cur->allocated_stack ||
+ old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
+ cur->stack[spi].slot_type[i % BPF_REG_SIZE]))
return false;
- if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ) && !exact) {
+ if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)
+ && exact == NOT_EXACT) {
i += BPF_REG_SIZE - 1;
/* explored state didn't use this */
continue;
@@ -16485,6 +18052,20 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
if (i >= cur->allocated_stack)
return false;
+ /* 64-bit scalar spill vs all slots MISC and vice versa.
+ * Load from all slots MISC produces unbound scalar.
+ * Construct a fake register for such stack and call
+ * regsafe() to ensure scalar ids are compared.
+ */
+ old_reg = scalar_reg_for_stack(env, &old->stack[spi]);
+ cur_reg = scalar_reg_for_stack(env, &cur->stack[spi]);
+ if (old_reg && cur_reg) {
+ if (!regsafe(env, old_reg, cur_reg, idmap, exact))
+ return false;
+ i += BPF_REG_SIZE - 1;
+ continue;
+ }
+
/* if old state was safe with misc data in the stack
* it will be safe with zero-initialized stack.
* The opposite is not true
@@ -16543,6 +18124,12 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
return false;
break;
+ case STACK_IRQ_FLAG:
+ old_reg = &old->stack[spi].spilled_ptr;
+ cur_reg = &cur->stack[spi].spilled_ptr;
+ if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+ return false;
+ break;
case STACK_MISC:
case STACK_ZERO:
case STACK_INVALID:
@@ -16555,7 +18142,7 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
return true;
}
-static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
+static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *cur,
struct bpf_idmap *idmap)
{
int i;
@@ -16563,9 +18150,34 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
if (old->acquired_refs != cur->acquired_refs)
return false;
+ if (old->active_locks != cur->active_locks)
+ return false;
+
+ if (old->active_preempt_locks != cur->active_preempt_locks)
+ return false;
+
+ if (old->active_rcu_lock != cur->active_rcu_lock)
+ return false;
+
+ if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap))
+ return false;
+
for (i = 0; i < old->acquired_refs; i++) {
- if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap))
+ if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) ||
+ old->refs[i].type != cur->refs[i].type)
+ return false;
+ switch (old->refs[i].type) {
+ case REF_TYPE_PTR:
+ case REF_TYPE_IRQ:
+ break;
+ case REF_TYPE_LOCK:
+ if (old->refs[i].ptr != cur->refs[i].ptr)
+ return false;
+ break;
+ default:
+ WARN_ONCE(1, "Unhandled enum type for reference state: %d\n", old->refs[i].type);
return false;
+ }
}
return true;
@@ -16598,7 +18210,7 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
* the current state will reach 'bpf_exit' instruction safely
*/
static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old,
- struct bpf_func_state *cur, bool exact)
+ struct bpf_func_state *cur, enum exact_level exact)
{
int i;
@@ -16613,9 +18225,6 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
if (!stacksafe(env, old, cur, &env->idmap_scratch, exact))
return false;
- if (!refsafe(old, cur, &env->idmap_scratch))
- return false;
-
return true;
}
@@ -16628,7 +18237,7 @@ static void reset_idmap_scratch(struct bpf_verifier_env *env)
static bool states_equal(struct bpf_verifier_env *env,
struct bpf_verifier_state *old,
struct bpf_verifier_state *cur,
- bool exact)
+ enum exact_level exact)
{
int i;
@@ -16643,20 +18252,10 @@ static bool states_equal(struct bpf_verifier_env *env,
if (old->speculative && !cur->speculative)
return false;
- if (old->active_lock.ptr != cur->active_lock.ptr)
- return false;
-
- /* Old and cur active_lock's have to be either both present
- * or both absent.
- */
- if (!!old->active_lock.id != !!cur->active_lock.id)
+ if (old->in_sleepable != cur->in_sleepable)
return false;
- if (old->active_lock.id &&
- !check_ids(old->active_lock.id, cur->active_lock.id, &env->idmap_scratch))
- return false;
-
- if (old->active_rcu_lock != cur->active_rcu_lock)
+ if (!refsafe(old, cur, &env->idmap_scratch))
return false;
/* for states to be equal callsites have to be the same
@@ -16921,9 +18520,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
struct bpf_verifier_state_list *sl, **pprev;
struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry;
int i, j, n, err, states_cnt = 0;
- bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
- bool add_new_state = force_new_state;
- bool force_exact;
+ bool force_new_state, add_new_state, force_exact;
+
+ force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) ||
+ /* Avoid accumulating infinitely long jmp history */
+ cur->insn_hist_end - cur->insn_hist_start > 40;
/* bpf progs typically have pruning point every 4 instructions
* http://vger.kernel.org/bpfconf2019.html#session-1
@@ -16933,6 +18534,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* In tests that amounts to up to 50% reduction into total verifier
* memory consumption and 20% verifier time speedup.
*/
+ add_new_state = force_new_state;
if (env->jmps_processed - env->prev_jmps_processed >= 2 &&
env->insn_processed - env->prev_insn_processed >= 8)
add_new_state = true;
@@ -17002,7 +18604,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* => unsafe memory access at 11 would not be caught.
*/
if (is_iter_next_insn(env, insn_idx)) {
- if (states_equal(env, &sl->state, cur, true)) {
+ if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
struct bpf_func_state *cur_frame;
struct bpf_reg_state *iter_state, *iter_reg;
int spi;
@@ -17025,22 +18627,30 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
}
goto skip_inf_loop_check;
}
+ if (is_may_goto_insn_at(env, insn_idx)) {
+ if (sl->state.may_goto_depth != cur->may_goto_depth &&
+ states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
+ update_loop_entry(cur, &sl->state);
+ goto hit;
+ }
+ }
if (calls_callback(env, insn_idx)) {
- if (states_equal(env, &sl->state, cur, true))
+ if (states_equal(env, &sl->state, cur, RANGE_WITHIN))
goto hit;
goto skip_inf_loop_check;
}
/* attempt to detect infinite loop to avoid unnecessary doomed work */
if (states_maybe_looping(&sl->state, cur) &&
- states_equal(env, &sl->state, cur, false) &&
+ states_equal(env, &sl->state, cur, EXACT) &&
!iter_active_depths_differ(&sl->state, cur) &&
+ sl->state.may_goto_depth == cur->may_goto_depth &&
sl->state.callback_unroll_depth == cur->callback_unroll_depth) {
verbose_linfo(env, insn_idx, "; ");
verbose(env, "infinite loop detected at insn %d\n", insn_idx);
verbose(env, "cur state:");
- print_verifier_state(env, cur->frame[cur->curframe], true);
+ print_verifier_state(env, cur, cur->curframe, true);
verbose(env, "old state:");
- print_verifier_state(env, sl->state.frame[cur->curframe], true);
+ print_verifier_state(env, &sl->state, cur->curframe, true);
return -EINVAL;
}
/* if the verifier is processing a loop, avoid adding new state
@@ -17089,7 +18699,7 @@ skip_inf_loop_check:
*/
loop_entry = get_loop_entry(&sl->state);
force_exact = loop_entry && loop_entry->branches > 0;
- if (states_equal(env, &sl->state, cur, force_exact)) {
+ if (states_equal(env, &sl->state, cur, force_exact ? RANGE_WITHIN : NOT_EXACT)) {
if (force_exact)
update_loop_entry(cur, loop_entry);
hit:
@@ -17107,12 +18717,12 @@ hit:
err = propagate_liveness(env, &sl->state, cur);
/* if previous state reached the exit with precision and
- * current state is equivalent to it (except precsion marks)
+ * current state is equivalent to it (except precision marks)
* the precision needs to be propagated back in
* the current state.
*/
if (is_jmp_point(env, env->insn_idx))
- err = err ? : push_jmp_history(env, cur, 0);
+ err = err ? : push_insn_history(env, cur, 0, 0);
err = err ? : propagate_precision(env, &sl->state);
if (err)
return err;
@@ -17211,8 +18821,8 @@ next:
cur->parent = new;
cur->first_insn_idx = insn_idx;
+ cur->insn_hist_start = cur->insn_hist_end;
cur->dfs_depth = new->dfs_depth + 1;
- clear_jmp_history(cur);
new_sl->next = *explored_state(env, insn_idx);
*explored_state(env, insn_idx) = new_sl;
/* connect new state to parentage chain. Current frame needs all
@@ -17259,6 +18869,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
case PTR_TO_TCP_SOCK:
case PTR_TO_XDP_SOCK:
case PTR_TO_BTF_ID:
+ case PTR_TO_ARENA:
return false;
default:
return true;
@@ -17284,7 +18895,7 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)
}
static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type,
- bool allow_trust_missmatch)
+ bool allow_trust_mismatch)
{
enum bpf_reg_type *prev_type = &env->insn_aux_data[env->insn_idx].ptr_type;
@@ -17302,7 +18913,7 @@ static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type typ
* src_reg == stack|map in some other branch.
* Reject it.
*/
- if (allow_trust_missmatch &&
+ if (allow_trust_mismatch &&
base_type(type) == PTR_TO_BTF_ID &&
base_type(*prev_type) == PTR_TO_BTF_ID) {
/*
@@ -17379,7 +18990,7 @@ static int do_check(struct bpf_verifier_env *env)
}
if (is_jmp_point(env, env->insn_idx)) {
- err = push_jmp_history(env, state, 0);
+ err = push_insn_history(env, state, 0, 0);
if (err)
return err;
}
@@ -17395,7 +19006,7 @@ static int do_check(struct bpf_verifier_env *env)
env->prev_insn_idx, env->insn_idx,
env->cur_state->speculative ?
" (speculative execution)" : "");
- print_verifier_state(env, state->frame[state->curframe], true);
+ print_verifier_state(env, state, state->curframe, true);
do_print_state = false;
}
@@ -17407,7 +19018,7 @@ static int do_check(struct bpf_verifier_env *env)
};
if (verifier_state_scratched(env))
- print_insn_state(env, state->frame[state->curframe]);
+ print_insn_state(env, state, state->curframe);
verbose_linfo(env, env->insn_idx, "; ");
env->prev_log_pos = env->log.end_pos;
@@ -17539,11 +19150,10 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
- if (env->cur_state->active_lock.ptr) {
+ if (env->cur_state->active_locks) {
if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) ||
- (insn->src_reg == BPF_PSEUDO_CALL) ||
(insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
- (insn->off != 0 || !is_bpf_graph_api_kfunc(insn->imm)))) {
+ (insn->off != 0 || !kfunc_spin_allowed(insn->imm)))) {
verbose(env, "function calls are not allowed while holding a lock\n");
return -EINVAL;
}
@@ -17589,25 +19199,14 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
process_bpf_exit_full:
- if (env->cur_state->active_lock.ptr &&
- !in_rbtree_lock_required_cb(env)) {
- verbose(env, "bpf_spin_unlock is missing\n");
- return -EINVAL;
- }
-
- if (env->cur_state->active_rcu_lock &&
- !in_rbtree_lock_required_cb(env)) {
- verbose(env, "bpf_rcu_read_unlock is missing\n");
- return -EINVAL;
- }
-
/* We must do check_reference_leak here before
* prepare_func_exit to handle the case when
* state->curframe > 0, it may be a callback
* function, for which reference_state must
* match caller reference state when it exits.
*/
- err = check_reference_leak(env, exception_exit);
+ err = check_resource_leak(env, exception_exit, !env->cur_state->curframe,
+ "BPF_EXIT instruction in main prog");
if (err)
return err;
@@ -17714,50 +19313,68 @@ static int find_btf_percpu_datasec(struct btf *btf)
return -ENOENT;
}
+/*
+ * Add btf to the used_btfs array and return the index. (If the btf was
+ * already added, then just return the index.) Upon successful insertion
+ * increase btf refcnt, and, if present, also refcount the corresponding
+ * kernel module.
+ */
+static int __add_used_btf(struct bpf_verifier_env *env, struct btf *btf)
+{
+ struct btf_mod_pair *btf_mod;
+ int i;
+
+ /* check whether we recorded this BTF (and maybe module) already */
+ for (i = 0; i < env->used_btf_cnt; i++)
+ if (env->used_btfs[i].btf == btf)
+ return i;
+
+ if (env->used_btf_cnt >= MAX_USED_BTFS)
+ return -E2BIG;
+
+ btf_get(btf);
+
+ btf_mod = &env->used_btfs[env->used_btf_cnt];
+ btf_mod->btf = btf;
+ btf_mod->module = NULL;
+
+ /* if we reference variables from kernel module, bump its refcount */
+ if (btf_is_module(btf)) {
+ btf_mod->module = btf_try_get_module(btf);
+ if (!btf_mod->module) {
+ btf_put(btf);
+ return -ENXIO;
+ }
+ }
+
+ return env->used_btf_cnt++;
+}
+
/* replace pseudo btf_id with kernel symbol address */
-static int check_pseudo_btf_id(struct bpf_verifier_env *env,
- struct bpf_insn *insn,
- struct bpf_insn_aux_data *aux)
+static int __check_pseudo_btf_id(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct bpf_insn_aux_data *aux,
+ struct btf *btf)
{
const struct btf_var_secinfo *vsi;
const struct btf_type *datasec;
- struct btf_mod_pair *btf_mod;
const struct btf_type *t;
const char *sym_name;
bool percpu = false;
u32 type, id = insn->imm;
- struct btf *btf;
s32 datasec_id;
u64 addr;
- int i, btf_fd, err;
-
- btf_fd = insn[1].imm;
- if (btf_fd) {
- btf = btf_get_by_fd(btf_fd);
- if (IS_ERR(btf)) {
- verbose(env, "invalid module BTF object FD specified.\n");
- return -EINVAL;
- }
- } else {
- if (!btf_vmlinux) {
- verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n");
- return -EINVAL;
- }
- btf = btf_vmlinux;
- btf_get(btf);
- }
+ int i;
t = btf_type_by_id(btf, id);
if (!t) {
verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id);
- err = -ENOENT;
- goto err_put;
+ return -ENOENT;
}
if (!btf_type_is_var(t) && !btf_type_is_func(t)) {
verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR or KIND_FUNC\n", id);
- err = -EINVAL;
- goto err_put;
+ return -EINVAL;
}
sym_name = btf_name_by_offset(btf, t->name_off);
@@ -17765,8 +19382,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
if (!addr) {
verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n",
sym_name);
- err = -ENOENT;
- goto err_put;
+ return -ENOENT;
}
insn[0].imm = (u32)addr;
insn[1].imm = addr >> 32;
@@ -17774,7 +19390,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
if (btf_type_is_func(t)) {
aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
aux->btf_var.mem_size = 0;
- goto check_btf;
+ return 0;
}
datasec_id = find_btf_percpu_datasec(btf);
@@ -17805,8 +19421,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
tname = btf_name_by_offset(btf, t->name_off);
verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n",
tname, PTR_ERR(ret));
- err = -EINVAL;
- goto err_put;
+ return -EINVAL;
}
aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
aux->btf_var.mem_size = tsize;
@@ -17815,39 +19430,43 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
aux->btf_var.btf = btf;
aux->btf_var.btf_id = type;
}
-check_btf:
- /* check whether we recorded this BTF (and maybe module) already */
- for (i = 0; i < env->used_btf_cnt; i++) {
- if (env->used_btfs[i].btf == btf) {
- btf_put(btf);
- return 0;
- }
- }
- if (env->used_btf_cnt >= MAX_USED_BTFS) {
- err = -E2BIG;
- goto err_put;
- }
+ return 0;
+}
- btf_mod = &env->used_btfs[env->used_btf_cnt];
- btf_mod->btf = btf;
- btf_mod->module = NULL;
+static int check_pseudo_btf_id(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct bpf_insn_aux_data *aux)
+{
+ struct btf *btf;
+ int btf_fd;
+ int err;
- /* if we reference variables from kernel module, bump its refcount */
- if (btf_is_module(btf)) {
- btf_mod->module = btf_try_get_module(btf);
- if (!btf_mod->module) {
- err = -ENXIO;
- goto err_put;
+ btf_fd = insn[1].imm;
+ if (btf_fd) {
+ CLASS(fd, f)(btf_fd);
+
+ btf = __btf_get_by_fd(f);
+ if (IS_ERR(btf)) {
+ verbose(env, "invalid module BTF object FD specified.\n");
+ return -EINVAL;
}
+ } else {
+ if (!btf_vmlinux) {
+ verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n");
+ return -EINVAL;
+ }
+ btf = btf_vmlinux;
}
- env->used_btf_cnt++;
+ err = __check_pseudo_btf_id(env, insn, aux, btf);
+ if (err)
+ return err;
+ err = __add_used_btf(env, btf);
+ if (err < 0)
+ return err;
return 0;
-err_put:
- btf_put(btf);
- return err;
}
static bool is_tracing_prog_type(enum bpf_prog_type type)
@@ -17864,6 +19483,12 @@ static bool is_tracing_prog_type(enum bpf_prog_type type)
}
}
+static bool bpf_map_is_cgroup_storage(struct bpf_map *map)
+{
+ return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
+}
+
static int check_map_prog_compatibility(struct bpf_verifier_env *env,
struct bpf_map *map,
struct bpf_prog *prog)
@@ -17898,6 +19523,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
}
}
+ if (btf_record_has_field(map->record, BPF_WORKQUEUE)) {
+ if (is_tracing_prog_type(prog_type)) {
+ verbose(env, "tracing progs cannot use bpf_wq yet\n");
+ return -EINVAL;
+ }
+ }
+
if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) &&
!bpf_offload_prog_map_match(prog, map)) {
verbose(env, "offload device mismatch between prog and map\n");
@@ -17909,7 +19541,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
return -EINVAL;
}
- if (prog->aux->sleepable)
+ if (prog->sleepable)
switch (map->map_type) {
case BPF_MAP_TYPE_HASH:
case BPF_MAP_TYPE_LRU_HASH:
@@ -17925,6 +19557,9 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_SK_STORAGE:
case BPF_MAP_TYPE_TASK_STORAGE:
case BPF_MAP_TYPE_CGRP_STORAGE:
+ case BPF_MAP_TYPE_QUEUE:
+ case BPF_MAP_TYPE_STACK:
+ case BPF_MAP_TYPE_ARENA:
break;
default:
verbose(env,
@@ -17932,13 +19567,89 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
return -EINVAL;
}
+ if (bpf_map_is_cgroup_storage(map) &&
+ bpf_cgroup_storage_assign(env->prog->aux, map)) {
+ verbose(env, "only one cgroup storage of each type is allowed\n");
+ return -EBUSY;
+ }
+
+ if (map->map_type == BPF_MAP_TYPE_ARENA) {
+ if (env->prog->aux->arena) {
+ verbose(env, "Only one arena per program\n");
+ return -EBUSY;
+ }
+ if (!env->allow_ptr_leaks || !env->bpf_capable) {
+ verbose(env, "CAP_BPF and CAP_PERFMON are required to use arena\n");
+ return -EPERM;
+ }
+ if (!env->prog->jit_requested) {
+ verbose(env, "JIT is required to use arena\n");
+ return -EOPNOTSUPP;
+ }
+ if (!bpf_jit_supports_arena()) {
+ verbose(env, "JIT doesn't support arena\n");
+ return -EOPNOTSUPP;
+ }
+ env->prog->aux->arena = (void *)map;
+ if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) {
+ verbose(env, "arena's user address must be set via map_extra or mmap()\n");
+ return -EINVAL;
+ }
+ }
+
return 0;
}
-static bool bpf_map_is_cgroup_storage(struct bpf_map *map)
+static int __add_used_map(struct bpf_verifier_env *env, struct bpf_map *map)
{
- return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
- map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
+ int i, err;
+
+ /* check whether we recorded this map already */
+ for (i = 0; i < env->used_map_cnt; i++)
+ if (env->used_maps[i] == map)
+ return i;
+
+ if (env->used_map_cnt >= MAX_USED_MAPS) {
+ verbose(env, "The total number of maps per program has reached the limit of %u\n",
+ MAX_USED_MAPS);
+ return -E2BIG;
+ }
+
+ err = check_map_prog_compatibility(env, map, env->prog);
+ if (err)
+ return err;
+
+ if (env->prog->sleepable)
+ atomic64_inc(&map->sleepable_refcnt);
+
+ /* hold the map. If the program is rejected by verifier,
+ * the map will be released by release_maps() or it
+ * will be used by the valid program until it's unloaded
+ * and all maps are released in bpf_free_used_maps()
+ */
+ bpf_map_inc(map);
+
+ env->used_maps[env->used_map_cnt++] = map;
+
+ return env->used_map_cnt - 1;
+}
+
+/* Add map behind fd to used maps list, if it's not already there, and return
+ * its index.
+ * Returns <0 on error, or >= 0 index, on success.
+ */
+static int add_used_map(struct bpf_verifier_env *env, int fd)
+{
+ struct bpf_map *map;
+ CLASS(fd, f)(fd);
+
+ map = __bpf_map_get(f);
+ if (IS_ERR(map)) {
+ verbose(env, "fd %d is not pointing to valid bpf_map\n", fd);
+ return PTR_ERR(map);
+ }
+
+ return __add_used_map(env, map);
}
/* find and rewrite pseudo imm in ld_imm64 instructions:
@@ -17952,7 +19663,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
{
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
- int i, j, err;
+ int i, err;
err = bpf_prog_calc_tag(env->prog);
if (err)
@@ -17969,7 +19680,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
struct bpf_insn_aux_data *aux;
struct bpf_map *map;
- struct fd f;
+ int map_idx;
u64 addr;
u32 fd;
@@ -18032,21 +19743,14 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
break;
}
- f = fdget(fd);
- map = __bpf_map_get(f);
- if (IS_ERR(map)) {
- verbose(env, "fd %d is not pointing to valid bpf_map\n",
- insn[0].imm);
- return PTR_ERR(map);
- }
-
- err = check_map_prog_compatibility(env, map, env->prog);
- if (err) {
- fdput(f);
- return err;
- }
+ map_idx = add_used_map(env, fd);
+ if (map_idx < 0)
+ return map_idx;
+ map = env->used_maps[map_idx];
aux = &env->insn_aux_data[i];
+ aux->map_index = map_idx;
+
if (insn[0].src_reg == BPF_PSEUDO_MAP_FD ||
insn[0].src_reg == BPF_PSEUDO_MAP_IDX) {
addr = (unsigned long)map;
@@ -18055,13 +19759,11 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
if (off >= BPF_MAX_VAR_OFF) {
verbose(env, "direct value offset of %u is not allowed\n", off);
- fdput(f);
return -EINVAL;
}
if (!map->ops->map_direct_value_addr) {
verbose(env, "no direct value access support for this map type\n");
- fdput(f);
return -EINVAL;
}
@@ -18069,7 +19771,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
if (err) {
verbose(env, "invalid access to map value pointer, value_size=%u off=%u\n",
map->value_size, off);
- fdput(f);
return err;
}
@@ -18080,40 +19781,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
insn[0].imm = (u32)addr;
insn[1].imm = addr >> 32;
- /* check whether we recorded this map already */
- for (j = 0; j < env->used_map_cnt; j++) {
- if (env->used_maps[j] == map) {
- aux->map_index = j;
- fdput(f);
- goto next_insn;
- }
- }
-
- if (env->used_map_cnt >= MAX_USED_MAPS) {
- fdput(f);
- return -E2BIG;
- }
-
- if (env->prog->aux->sleepable)
- atomic64_inc(&map->sleepable_refcnt);
- /* hold the map. If the program is rejected by verifier,
- * the map will be released by release_maps() or it
- * will be used by the valid program until it's unloaded
- * and all maps are released in bpf_free_used_maps()
- */
- bpf_map_inc(map);
-
- aux->map_index = env->used_map_cnt;
- env->used_maps[env->used_map_cnt++] = map;
-
- if (bpf_map_is_cgroup_storage(map) &&
- bpf_cgroup_storage_assign(env->prog->aux, map)) {
- verbose(env, "only one cgroup storage of each type is allowed\n");
- fdput(f);
- return -EBUSY;
- }
-
- fdput(f);
next_insn:
insn++;
i++;
@@ -18144,8 +19811,7 @@ static void release_maps(struct bpf_verifier_env *env)
/* drop refcnt of maps used by the rejected program */
static void release_btfs(struct bpf_verifier_env *env)
{
- __bpf_free_used_btfs(env->prog->aux, env->used_btfs,
- env->used_btf_cnt);
+ __bpf_free_used_btfs(env->used_btfs, env->used_btf_cnt);
}
/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
@@ -18256,6 +19922,44 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
return new_prog;
}
+/*
+ * For all jmp insns in a given 'prog' that point to 'tgt_idx' insn adjust the
+ * jump offset by 'delta'.
+ */
+static int adjust_jmp_off(struct bpf_prog *prog, u32 tgt_idx, u32 delta)
+{
+ struct bpf_insn *insn = prog->insnsi;
+ u32 insn_cnt = prog->len, i;
+ s32 imm;
+ s16 off;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ u8 code = insn->code;
+
+ if (tgt_idx <= i && i < tgt_idx + delta)
+ continue;
+
+ if ((BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) ||
+ BPF_OP(code) == BPF_CALL || BPF_OP(code) == BPF_EXIT)
+ continue;
+
+ if (insn->code == (BPF_JMP32 | BPF_JA)) {
+ if (i + 1 + insn->imm != tgt_idx)
+ continue;
+ if (check_add_overflow(insn->imm, delta, &imm))
+ return -ERANGE;
+ insn->imm = imm;
+ } else {
+ if (i + 1 + insn->off != tgt_idx)
+ continue;
+ if (check_add_overflow(insn->off, delta, &off))
+ return -ERANGE;
+ insn->off = off;
+ }
+ }
+ return 0;
+}
+
static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
u32 off, u32 cnt)
{
@@ -18494,22 +20198,29 @@ static int opt_remove_dead_code(struct bpf_verifier_env *env)
return 0;
}
+static const struct bpf_insn NOP = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
+static const struct bpf_insn MAY_GOTO_0 = BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, 0, 0);
+
static int opt_remove_nops(struct bpf_verifier_env *env)
{
- const struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
+ bool is_may_goto_0, is_ja;
int i, err;
for (i = 0; i < insn_cnt; i++) {
- if (memcmp(&insn[i], &ja, sizeof(ja)))
+ is_may_goto_0 = !memcmp(&insn[i], &MAY_GOTO_0, sizeof(MAY_GOTO_0));
+ is_ja = !memcmp(&insn[i], &NOP, sizeof(NOP));
+
+ if (!is_may_goto_0 && !is_ja)
continue;
err = verifier_remove_insns(env, i, 1);
if (err)
return err;
insn_cnt--;
- i--;
+ /* Go back one insn to catch may_goto +1; may_goto +0 sequence */
+ i -= (is_may_goto_0 && i > 0) ? 2 : 1;
}
return 0;
@@ -18621,14 +20332,39 @@ apply_patch_buffer:
*/
static int convert_ctx_accesses(struct bpf_verifier_env *env)
{
+ struct bpf_subprog_info *subprogs = env->subprog_info;
const struct bpf_verifier_ops *ops = env->ops;
- int i, cnt, size, ctx_field_size, delta = 0;
+ int i, cnt, size, ctx_field_size, delta = 0, epilogue_cnt = 0;
const int insn_cnt = env->prog->len;
- struct bpf_insn insn_buf[16], *insn;
+ struct bpf_insn *epilogue_buf = env->epilogue_buf;
+ struct bpf_insn *insn_buf = env->insn_buf;
+ struct bpf_insn *insn;
u32 target_size, size_default, off;
struct bpf_prog *new_prog;
enum bpf_access_type type;
bool is_narrower_load;
+ int epilogue_idx = 0;
+
+ if (ops->gen_epilogue) {
+ epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog,
+ -(subprogs[0].stack_depth + 8));
+ if (epilogue_cnt >= INSN_BUF_SIZE) {
+ verbose(env, "bpf verifier is misconfigured\n");
+ return -EINVAL;
+ } else if (epilogue_cnt) {
+ /* Save the ARG_PTR_TO_CTX for the epilogue to use */
+ cnt = 0;
+ subprogs[0].stack_depth += 8;
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1,
+ -subprogs[0].stack_depth);
+ insn_buf[cnt++] = env->prog->insnsi[0];
+ new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+ env->prog = new_prog;
+ delta += cnt - 1;
+ }
+ }
if (ops->gen_prologue || env->seen_direct_write) {
if (!ops->gen_prologue) {
@@ -18637,7 +20373,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
}
cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
env->prog);
- if (cnt >= ARRAY_SIZE(insn_buf)) {
+ if (cnt >= INSN_BUF_SIZE) {
verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
} else if (cnt) {
@@ -18650,6 +20386,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
}
}
+ if (delta)
+ WARN_ON(adjust_jmp_off(env->prog, 0, delta));
+
if (bpf_prog_is_offloaded(env->prog->aux))
return 0;
@@ -18676,6 +20415,31 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
insn->code == (BPF_ST | BPF_MEM | BPF_W) ||
insn->code == (BPF_ST | BPF_MEM | BPF_DW)) {
type = BPF_WRITE;
+ } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) ||
+ insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) &&
+ env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) {
+ insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code);
+ env->prog->aux->num_exentries++;
+ continue;
+ } else if (insn->code == (BPF_JMP | BPF_EXIT) &&
+ epilogue_cnt &&
+ i + delta < subprogs[1].start) {
+ /* Generate epilogue for the main prog */
+ if (epilogue_idx) {
+ /* jump back to the earlier generated epilogue */
+ insn_buf[0] = BPF_JMP32_A(epilogue_idx - i - delta - 1);
+ cnt = 1;
+ } else {
+ memcpy(insn_buf, epilogue_buf,
+ epilogue_cnt * sizeof(*epilogue_buf));
+ cnt = epilogue_cnt;
+ /* epilogue_idx cannot be 0. It must have at
+ * least one ctx ptr saving insn before the
+ * epilogue.
+ */
+ epilogue_idx = i + delta;
+ }
+ goto patch_insn_buf;
} else {
continue;
}
@@ -18733,6 +20497,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
env->prog->aux->num_exentries++;
}
continue;
+ case PTR_TO_ARENA:
+ if (BPF_MODE(insn->code) == BPF_MEMSX) {
+ verbose(env, "sign extending loads from arena are not supported yet\n");
+ return -EOPNOTSUPP;
+ }
+ insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code);
+ env->prog->aux->num_exentries++;
+ continue;
default:
continue;
}
@@ -18770,7 +20542,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
target_size = 0;
cnt = convert_ctx_access(type, insn, insn_buf, env->prog,
&target_size);
- if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
+ if (cnt == 0 || cnt >= INSN_BUF_SIZE ||
(ctx_field_size && !target_size)) {
verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
@@ -18779,7 +20551,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
if (is_narrower_load && size < target_size) {
u8 shift = bpf_ctx_narrow_access_offset(
off, size, size_default) * 8;
- if (shift && cnt + 1 >= ARRAY_SIZE(insn_buf)) {
+ if (shift && cnt + 1 >= INSN_BUF_SIZE) {
verbose(env, "bpf verifier narrow ctx load misconfigured\n");
return -EINVAL;
}
@@ -18804,6 +20576,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
insn->dst_reg, insn->dst_reg,
size * 8, 0);
+patch_insn_buf:
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
@@ -18854,12 +20627,19 @@ static int jit_subprogs(struct bpf_verifier_env *env)
env->insn_aux_data[i].call_imm = insn->imm;
/* point imm to __bpf_call_base+1 from JITs point of view */
insn->imm = 1;
- if (bpf_pseudo_func(insn))
+ if (bpf_pseudo_func(insn)) {
+#if defined(MODULES_VADDR)
+ u64 addr = MODULES_VADDR;
+#else
+ u64 addr = VMALLOC_START;
+#endif
/* jit (e.g. x86_64) may emit fewer instructions
* if it learns a u32 imm is the same as a u64 imm.
- * Force a non zero here.
+ * Set close enough to possible prog address.
*/
- insn[1].imm = 1;
+ insn[0].imm = (u32)addr;
+ insn[1].imm = addr >> 32;
+ }
}
err = bpf_prog_alloc_jited_linfo(prog);
@@ -18891,6 +20671,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
if (bpf_prog_calc_tag(func[i]))
goto out_free;
func[i]->is_func = 1;
+ func[i]->sleepable = prog->sleepable;
func[i]->aux->func_idx = i;
/* Below members will be freed only at prog->aux */
func[i]->aux->btf = prog->aux->btf;
@@ -18910,6 +20691,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->name[0] = 'F';
func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
+ if (env->subprog_info[i].priv_stack_mode == PRIV_STACK_ADAPTIVE)
+ func[i]->aux->jits_use_priv_stack = true;
+
func[i]->jit_requested = 1;
func[i]->blinding_requested = prog->blinding_requested;
func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
@@ -18918,17 +20702,27 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->nr_linfo = prog->aux->nr_linfo;
func[i]->aux->jited_linfo = prog->aux->jited_linfo;
func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx;
+ func[i]->aux->arena = prog->aux->arena;
num_exentries = 0;
insn = func[i]->insnsi;
for (j = 0; j < func[i]->len; j++, insn++) {
if (BPF_CLASS(insn->code) == BPF_LDX &&
(BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEM32 ||
BPF_MODE(insn->code) == BPF_PROBE_MEMSX))
num_exentries++;
+ if ((BPF_CLASS(insn->code) == BPF_STX ||
+ BPF_CLASS(insn->code) == BPF_ST) &&
+ BPF_MODE(insn->code) == BPF_PROBE_MEM32)
+ num_exentries++;
+ if (BPF_CLASS(insn->code) == BPF_STX &&
+ BPF_MODE(insn->code) == BPF_PROBE_ATOMIC)
+ num_exentries++;
}
func[i]->aux->num_exentries = num_exentries;
func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb;
+ func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data;
if (!i)
func[i]->aux->exception_boundary = env->seen_exception;
func[i] = bpf_int_jit_compile(func[i]);
@@ -18989,10 +20783,14 @@ static int jit_subprogs(struct bpf_verifier_env *env)
* bpf_prog_load will add the kallsyms for the main program.
*/
for (i = 1; i < env->subprog_cnt; i++) {
- bpf_prog_lock_ro(func[i]);
- bpf_prog_kallsyms_add(func[i]);
+ err = bpf_prog_lock_ro(func[i]);
+ if (err)
+ goto out_free;
}
+ for (i = 1; i < env->subprog_cnt; i++)
+ bpf_prog_kallsyms_add(func[i]);
+
/* Last step: make now unused interpreter insns from main
* prog consistent for later dump requests, so they can
* later look the same as if they were interpreted only.
@@ -19252,6 +21050,13 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
*cnt = 1;
+ } else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) {
+ struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(BPF_REG_4, (long)env->prog->aux) };
+
+ insn_buf[0] = ld_addrs[0];
+ insn_buf[1] = ld_addrs[1];
+ insn_buf[2] = *insn;
+ *cnt = 3;
}
return 0;
}
@@ -19296,10 +21101,13 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
const int insn_cnt = prog->len;
const struct bpf_map_ops *ops;
struct bpf_insn_aux_data *aux;
- struct bpf_insn insn_buf[16];
+ struct bpf_insn *insn_buf = env->insn_buf;
struct bpf_prog *new_prog;
struct bpf_map *map_ptr;
- int i, ret, cnt, delta = 0;
+ int i, ret, cnt, delta = 0, cur_subprog = 0;
+ struct bpf_subprog_info *subprogs = env->subprog_info;
+ u16 stack_depth = subprogs[cur_subprog].stack_depth;
+ u16 stack_depth_extra = 0;
if (env->seen_exception && !env->exception_callback_subprog) {
struct bpf_insn patch[] = {
@@ -19319,14 +21127,63 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
mark_subprog_exc_cb(env, env->exception_callback_subprog);
}
- for (i = 0; i < insn_cnt; i++, insn++) {
- /* Make divide-by-zero exceptions impossible. */
+ for (i = 0; i < insn_cnt;) {
+ if (insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->imm) {
+ if ((insn->off == BPF_ADDR_SPACE_CAST && insn->imm == 1) ||
+ (((struct bpf_map *)env->prog->aux->arena)->map_flags & BPF_F_NO_USER_CONV)) {
+ /* convert to 32-bit mov that clears upper 32-bit */
+ insn->code = BPF_ALU | BPF_MOV | BPF_X;
+ /* clear off and imm, so it's a normal 'wX = wY' from JIT pov */
+ insn->off = 0;
+ insn->imm = 0;
+ } /* cast from as(0) to as(1) should be handled by JIT */
+ goto next_insn;
+ }
+
+ if (env->insn_aux_data[i + delta].needs_zext)
+ /* Convert BPF_CLASS(insn->code) == BPF_ALU64 to 32-bit ALU */
+ insn->code = BPF_ALU | BPF_OP(insn->code) | BPF_SRC(insn->code);
+
+ /* Make sdiv/smod divide-by-minus-one exceptions impossible. */
+ if ((insn->code == (BPF_ALU64 | BPF_MOD | BPF_K) ||
+ insn->code == (BPF_ALU64 | BPF_DIV | BPF_K) ||
+ insn->code == (BPF_ALU | BPF_MOD | BPF_K) ||
+ insn->code == (BPF_ALU | BPF_DIV | BPF_K)) &&
+ insn->off == 1 && insn->imm == -1) {
+ bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+ bool isdiv = BPF_OP(insn->code) == BPF_DIV;
+ struct bpf_insn *patchlet;
+ struct bpf_insn chk_and_sdiv[] = {
+ BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_NEG | BPF_K, insn->dst_reg,
+ 0, 0, 0),
+ };
+ struct bpf_insn chk_and_smod[] = {
+ BPF_MOV32_IMM(insn->dst_reg, 0),
+ };
+
+ patchlet = isdiv ? chk_and_sdiv : chk_and_smod;
+ cnt = isdiv ? ARRAY_SIZE(chk_and_sdiv) : ARRAY_SIZE(chk_and_smod);
+
+ new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Make divide-by-zero and divide-by-minus-one exceptions impossible. */
if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
bool isdiv = BPF_OP(insn->code) == BPF_DIV;
+ bool is_sdiv = isdiv && insn->off == 1;
+ bool is_smod = !isdiv && insn->off == 1;
struct bpf_insn *patchlet;
struct bpf_insn chk_and_div[] = {
/* [R,W]x div 0 -> 0 */
@@ -19346,10 +21203,62 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
BPF_JMP_IMM(BPF_JA, 0, 0, 1),
BPF_MOV32_REG(insn->dst_reg, insn->dst_reg),
};
+ struct bpf_insn chk_and_sdiv[] = {
+ /* [R,W]x sdiv 0 -> 0
+ * LLONG_MIN sdiv -1 -> LLONG_MIN
+ * INT_MIN sdiv -1 -> INT_MIN
+ */
+ BPF_MOV64_REG(BPF_REG_AX, insn->src_reg),
+ BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_ADD | BPF_K, BPF_REG_AX,
+ 0, 0, 1),
+ BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JGT | BPF_K, BPF_REG_AX,
+ 0, 4, 1),
+ BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JEQ | BPF_K, BPF_REG_AX,
+ 0, 1, 0),
+ BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_MOV | BPF_K, insn->dst_reg,
+ 0, 0, 0),
+ /* BPF_NEG(LLONG_MIN) == -LLONG_MIN == LLONG_MIN */
+ BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_NEG | BPF_K, insn->dst_reg,
+ 0, 0, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ *insn,
+ };
+ struct bpf_insn chk_and_smod[] = {
+ /* [R,W]x mod 0 -> [R,W]x */
+ /* [R,W]x mod -1 -> 0 */
+ BPF_MOV64_REG(BPF_REG_AX, insn->src_reg),
+ BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_ADD | BPF_K, BPF_REG_AX,
+ 0, 0, 1),
+ BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JGT | BPF_K, BPF_REG_AX,
+ 0, 3, 1),
+ BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JEQ | BPF_K, BPF_REG_AX,
+ 0, 3 + (is64 ? 0 : 1), 1),
+ BPF_MOV32_IMM(insn->dst_reg, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ *insn,
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_MOV32_REG(insn->dst_reg, insn->dst_reg),
+ };
- patchlet = isdiv ? chk_and_div : chk_and_mod;
- cnt = isdiv ? ARRAY_SIZE(chk_and_div) :
- ARRAY_SIZE(chk_and_mod) - (is64 ? 2 : 0);
+ if (is_sdiv) {
+ patchlet = chk_and_sdiv;
+ cnt = ARRAY_SIZE(chk_and_sdiv);
+ } else if (is_smod) {
+ patchlet = chk_and_smod;
+ cnt = ARRAY_SIZE(chk_and_smod) - (is64 ? 2 : 0);
+ } else {
+ patchlet = isdiv ? chk_and_div : chk_and_mod;
+ cnt = isdiv ? ARRAY_SIZE(chk_and_div) :
+ ARRAY_SIZE(chk_and_mod) - (is64 ? 2 : 0);
+ }
new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt);
if (!new_prog)
@@ -19358,7 +21267,37 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
+ }
+
+ /* Make it impossible to de-reference a userspace address */
+ if (BPF_CLASS(insn->code) == BPF_LDX &&
+ (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) {
+ struct bpf_insn *patch = &insn_buf[0];
+ u64 uaddress_limit = bpf_arch_uaddress_limit();
+
+ if (!uaddress_limit)
+ goto next_insn;
+
+ *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
+ if (insn->off)
+ *patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off);
+ *patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32);
+ *patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2);
+ *patch++ = *insn;
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = BPF_MOV64_IMM(insn->dst_reg, 0);
+
+ cnt = patch - insn_buf;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
}
/* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */
@@ -19366,7 +21305,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
(BPF_MODE(insn->code) == BPF_ABS ||
BPF_MODE(insn->code) == BPF_IND)) {
cnt = env->ops->gen_ld_abs(insn, insn_buf);
- if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+ if (cnt == 0 || cnt >= INSN_BUF_SIZE) {
verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
}
@@ -19378,7 +21317,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
/* Rewrite pointer arithmetic to mitigate speculation attacks. */
@@ -19393,7 +21332,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
aux = &env->insn_aux_data[i + delta];
if (!aux->alu_state ||
aux->alu_state == BPF_ALU_NON_POINTER)
- continue;
+ goto next_insn;
isneg = aux->alu_state & BPF_ALU_NEG_VALUE;
issrc = (aux->alu_state & BPF_ALU_SANITIZE) ==
@@ -19431,19 +21370,42 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
+ }
+
+ if (is_may_goto_insn(insn)) {
+ int stack_off = -stack_depth - 8;
+
+ stack_depth_extra = 8;
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off);
+ if (insn->off >= 0)
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2);
+ else
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
+ insn_buf[3] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off);
+ cnt = 4;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
}
if (insn->code != (BPF_JMP | BPF_CALL))
- continue;
+ goto next_insn;
if (insn->src_reg == BPF_PSEUDO_CALL)
- continue;
+ goto next_insn;
if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt);
if (ret)
return ret;
if (cnt == 0)
- continue;
+ goto next_insn;
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
@@ -19452,9 +21414,13 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
+ /* Skip inlining the helper call if the JIT does it. */
+ if (bpf_jit_inlines_helper_call(insn->imm))
+ goto next_insn;
+
if (insn->imm == BPF_FUNC_get_route_realm)
prog->dst_needed = 1;
if (insn->imm == BPF_FUNC_get_prandom_u32)
@@ -19488,7 +21454,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
!bpf_map_ptr_unpriv(aux)) {
struct bpf_jit_poke_descriptor desc = {
.reason = BPF_POKE_REASON_TAIL_CALL,
- .tail_call.map = BPF_MAP_PTR(aux->map_ptr_state),
+ .tail_call.map = aux->map_ptr_state.map_ptr,
.tail_call.key = bpf_map_key_immediate(aux),
.insn_idx = i + delta,
};
@@ -19500,11 +21466,11 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
}
insn->imm = ret + 1;
- continue;
+ goto next_insn;
}
if (!bpf_map_ptr_unpriv(aux))
- continue;
+ goto next_insn;
/* instead of changing every JIT dealing with tail_call
* emit two extra insns:
@@ -19517,7 +21483,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
return -EINVAL;
}
- map_ptr = BPF_MAP_PTR(aux->map_ptr_state);
+ map_ptr = aux->map_ptr_state.map_ptr;
insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
map_ptr->max_entries, 2);
insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
@@ -19533,7 +21499,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
if (insn->imm == BPF_FUNC_timer_set_callback) {
@@ -19570,7 +21536,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
}
if (is_storage_get_function(insn->imm)) {
- if (!env->prog->aux->sleepable ||
+ if (!in_sleepable(env) ||
env->insn_aux_data[i + delta].storage_get_func_atomic)
insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
else
@@ -19625,14 +21591,14 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
if (bpf_map_ptr_poisoned(aux))
goto patch_call_imm;
- map_ptr = BPF_MAP_PTR(aux->map_ptr_state);
+ map_ptr = aux->map_ptr_state.map_ptr;
ops = map_ptr->ops;
if (insn->imm == BPF_FUNC_map_lookup_elem &&
ops->map_gen_lookup) {
cnt = ops->map_gen_lookup(map_ptr, insn_buf);
if (cnt == -EOPNOTSUPP)
goto patch_map_ops_generic;
- if (cnt <= 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+ if (cnt <= 0 || cnt >= INSN_BUF_SIZE) {
verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
}
@@ -19645,7 +21611,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,
@@ -19676,31 +21642,31 @@ patch_map_ops_generic:
switch (insn->imm) {
case BPF_FUNC_map_lookup_elem:
insn->imm = BPF_CALL_IMM(ops->map_lookup_elem);
- continue;
+ goto next_insn;
case BPF_FUNC_map_update_elem:
insn->imm = BPF_CALL_IMM(ops->map_update_elem);
- continue;
+ goto next_insn;
case BPF_FUNC_map_delete_elem:
insn->imm = BPF_CALL_IMM(ops->map_delete_elem);
- continue;
+ goto next_insn;
case BPF_FUNC_map_push_elem:
insn->imm = BPF_CALL_IMM(ops->map_push_elem);
- continue;
+ goto next_insn;
case BPF_FUNC_map_pop_elem:
insn->imm = BPF_CALL_IMM(ops->map_pop_elem);
- continue;
+ goto next_insn;
case BPF_FUNC_map_peek_elem:
insn->imm = BPF_CALL_IMM(ops->map_peek_elem);
- continue;
+ goto next_insn;
case BPF_FUNC_redirect_map:
insn->imm = BPF_CALL_IMM(ops->map_redirect);
- continue;
+ goto next_insn;
case BPF_FUNC_for_each_map_elem:
insn->imm = BPF_CALL_IMM(ops->map_for_each_callback);
- continue;
+ goto next_insn;
case BPF_FUNC_map_lookup_percpu_elem:
insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem);
- continue;
+ goto next_insn;
}
goto patch_call_imm;
@@ -19728,9 +21694,37 @@ patch_map_ops_generic:
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
+ /* Implement bpf_get_smp_processor_id() inline. */
+ if (insn->imm == BPF_FUNC_get_smp_processor_id &&
+ verifier_inlines_helper_call(env, insn->imm)) {
+ /* BPF_FUNC_get_smp_processor_id inlining is an
+ * optimization, so if pcpu_hot.cpu_number is ever
+ * changed in some incompatible and hard to support
+ * way, it's fine to back out this inlining logic
+ */
+#ifdef CONFIG_SMP
+ insn_buf[0] = BPF_MOV32_IMM(BPF_REG_0, (u32)(unsigned long)&pcpu_hot.cpu_number);
+ insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
+ insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0);
+ cnt = 3;
+#else
+ insn_buf[0] = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0);
+ cnt = 1;
+#endif
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+#endif
/* Implement bpf_get_func_arg inline. */
if (prog_type == BPF_PROG_TYPE_TRACING &&
insn->imm == BPF_FUNC_get_func_arg) {
@@ -19753,7 +21747,7 @@ patch_map_ops_generic:
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
/* Implement bpf_get_func_ret inline. */
@@ -19781,7 +21775,7 @@ patch_map_ops_generic:
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
/* Implement get_func_arg_cnt inline. */
@@ -19796,7 +21790,7 @@ patch_map_ops_generic:
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
/* Implement bpf_get_func_ip inline. */
@@ -19811,9 +21805,82 @@ patch_map_ops_generic:
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
+ /* Implement bpf_get_branch_snapshot inline. */
+ if (IS_ENABLED(CONFIG_PERF_EVENTS) &&
+ prog->jit_requested && BITS_PER_LONG == 64 &&
+ insn->imm == BPF_FUNC_get_branch_snapshot) {
+ /* We are dealing with the following func protos:
+ * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags);
+ * int perf_snapshot_branch_stack(struct perf_branch_entry *entries, u32 cnt);
+ */
+ const u32 br_entry_size = sizeof(struct perf_branch_entry);
+
+ /* struct perf_branch_entry is part of UAPI and is
+ * used as an array element, so extremely unlikely to
+ * ever grow or shrink
+ */
+ BUILD_BUG_ON(br_entry_size != 24);
+
+ /* if (unlikely(flags)) return -EINVAL */
+ insn_buf[0] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7);
+
+ /* Transform size (bytes) into number of entries (cnt = size / 24).
+ * But to avoid expensive division instruction, we implement
+ * divide-by-3 through multiplication, followed by further
+ * division by 8 through 3-bit right shift.
+ * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr.,
+ * p. 227, chapter "Unsigned Division by 3" for details and proofs.
+ *
+ * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab.
+ */
+ insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, 0xaaaaaaab);
+ insn_buf[2] = BPF_ALU64_REG(BPF_MUL, BPF_REG_2, BPF_REG_0);
+ insn_buf[3] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36);
+
+ /* call perf_snapshot_branch_stack implementation */
+ insn_buf[4] = BPF_EMIT_CALL(static_call_query(perf_snapshot_branch_stack));
+ /* if (entry_cnt == 0) return -ENOENT */
+ insn_buf[5] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4);
+ /* return entry_cnt * sizeof(struct perf_branch_entry) */
+ insn_buf[6] = BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, br_entry_size);
+ insn_buf[7] = BPF_JMP_A(3);
+ /* return -EINVAL; */
+ insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
+ insn_buf[9] = BPF_JMP_A(1);
+ /* return -ENOENT; */
+ insn_buf[10] = BPF_MOV64_IMM(BPF_REG_0, -ENOENT);
+ cnt = 11;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Implement bpf_kptr_xchg inline */
+ if (prog->jit_requested && BITS_PER_LONG == 64 &&
+ insn->imm == BPF_FUNC_kptr_xchg &&
+ bpf_jit_supports_ptr_xchg()) {
+ insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_2);
+ insn_buf[1] = BPF_ATOMIC_OP(BPF_DW, BPF_XCHG, BPF_REG_1, BPF_REG_0, 0);
+ cnt = 2;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
patch_call_imm:
fn = env->ops->get_func_proto(insn->imm, env->prog);
/* all functions that have prototype and verifier allowed
@@ -19826,6 +21893,47 @@ patch_call_imm:
return -EFAULT;
}
insn->imm = fn->func - __bpf_call_base;
+next_insn:
+ if (subprogs[cur_subprog + 1].start == i + delta + 1) {
+ subprogs[cur_subprog].stack_depth += stack_depth_extra;
+ subprogs[cur_subprog].stack_extra = stack_depth_extra;
+ cur_subprog++;
+ stack_depth = subprogs[cur_subprog].stack_depth;
+ stack_depth_extra = 0;
+ }
+ i++;
+ insn++;
+ }
+
+ env->prog->aux->stack_depth = subprogs[0].stack_depth;
+ for (i = 0; i < env->subprog_cnt; i++) {
+ int subprog_start = subprogs[i].start;
+ int stack_slots = subprogs[i].stack_extra / 8;
+
+ if (!stack_slots)
+ continue;
+ if (stack_slots > 1) {
+ verbose(env, "verifier bug: stack_slots supports may_goto only\n");
+ return -EFAULT;
+ }
+
+ /* Add ST insn to subprog prologue to init extra stack */
+ insn_buf[0] = BPF_ST_MEM(BPF_DW, BPF_REG_FP,
+ -subprogs[i].stack_depth, BPF_MAX_LOOPS);
+ /* Copy first actual insn to preserve it */
+ insn_buf[1] = env->prog->insnsi[subprog_start];
+
+ new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, 2);
+ if (!new_prog)
+ return -ENOMEM;
+ env->prog = prog = new_prog;
+ /*
+ * If may_goto is a first insn of a prog there could be a jmp
+ * insn that points to it, hence adjust all such jmps to point
+ * to insn after BPF_ST that inits may_goto count.
+ * Adjustment will succeed because bpf_patch_insn_data() didn't fail.
+ */
+ WARN_ON(adjust_jmp_off(env->prog, subprog_start, 1));
}
/* Since poke tab is now finalized, publish aux to tracker. */
@@ -19854,7 +21962,7 @@ static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
int position,
s32 stack_base,
u32 callback_subprogno,
- u32 *cnt)
+ u32 *total_cnt)
{
s32 r6_offset = stack_base + 0 * BPF_REG_SIZE;
s32 r7_offset = stack_base + 1 * BPF_REG_SIZE;
@@ -19863,55 +21971,56 @@ static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
int reg_loop_cnt = BPF_REG_7;
int reg_loop_ctx = BPF_REG_8;
+ struct bpf_insn *insn_buf = env->insn_buf;
struct bpf_prog *new_prog;
u32 callback_start;
u32 call_insn_offset;
s32 callback_offset;
+ u32 cnt = 0;
/* This represents an inlined version of bpf_iter.c:bpf_loop,
* be careful to modify this code in sync.
*/
- struct bpf_insn insn_buf[] = {
- /* Return error and jump to the end of the patch if
- * expected number of iterations is too big.
- */
- BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2),
- BPF_MOV32_IMM(BPF_REG_0, -E2BIG),
- BPF_JMP_IMM(BPF_JA, 0, 0, 16),
- /* spill R6, R7, R8 to use these as loop vars */
- BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset),
- BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset),
- BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset),
- /* initialize loop vars */
- BPF_MOV64_REG(reg_loop_max, BPF_REG_1),
- BPF_MOV32_IMM(reg_loop_cnt, 0),
- BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3),
- /* loop header,
- * if reg_loop_cnt >= reg_loop_max skip the loop body
- */
- BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5),
- /* callback call,
- * correct callback offset would be set after patching
- */
- BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt),
- BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx),
- BPF_CALL_REL(0),
- /* increment loop counter */
- BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1),
- /* jump to loop header if callback returned 0 */
- BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6),
- /* return value of bpf_loop,
- * set R0 to the number of iterations
- */
- BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt),
- /* restore original values of R6, R7, R8 */
- BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset),
- BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset),
- BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset),
- };
- *cnt = ARRAY_SIZE(insn_buf);
- new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt);
+ /* Return error and jump to the end of the patch if
+ * expected number of iterations is too big.
+ */
+ insn_buf[cnt++] = BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2);
+ insn_buf[cnt++] = BPF_MOV32_IMM(BPF_REG_0, -E2BIG);
+ insn_buf[cnt++] = BPF_JMP_IMM(BPF_JA, 0, 0, 16);
+ /* spill R6, R7, R8 to use these as loop vars */
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset);
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset);
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset);
+ /* initialize loop vars */
+ insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_max, BPF_REG_1);
+ insn_buf[cnt++] = BPF_MOV32_IMM(reg_loop_cnt, 0);
+ insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3);
+ /* loop header,
+ * if reg_loop_cnt >= reg_loop_max skip the loop body
+ */
+ insn_buf[cnt++] = BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5);
+ /* callback call,
+ * correct callback offset would be set after patching
+ */
+ insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt);
+ insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx);
+ insn_buf[cnt++] = BPF_CALL_REL(0);
+ /* increment loop counter */
+ insn_buf[cnt++] = BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1);
+ /* jump to loop header if callback returned 0 */
+ insn_buf[cnt++] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6);
+ /* return value of bpf_loop,
+ * set R0 to the number of iterations
+ */
+ insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt);
+ /* restore original values of R6, R7, R8 */
+ insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset);
+ insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset);
+ insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset);
+
+ *total_cnt = cnt;
+ new_prog = bpf_patch_insn_data(env, position, insn_buf, cnt);
if (!new_prog)
return new_prog;
@@ -19986,6 +22095,40 @@ static int optimize_bpf_loop(struct bpf_verifier_env *env)
return 0;
}
+/* Remove unnecessary spill/fill pairs, members of fastcall pattern,
+ * adjust subprograms stack depth when possible.
+ */
+static int remove_fastcall_spills_fills(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *subprog = env->subprog_info;
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ u32 spills_num;
+ bool modified = false;
+ int i, j;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (aux[i].fastcall_spills_num > 0) {
+ spills_num = aux[i].fastcall_spills_num;
+ /* NOPs would be removed by opt_remove_nops() */
+ for (j = 1; j <= spills_num; ++j) {
+ *(insn - j) = NOP;
+ *(insn + j) = NOP;
+ }
+ modified = true;
+ }
+ if ((subprog + 1)->start == i + 1) {
+ if (modified && !subprog->keep_fastcall_stack)
+ subprog->stack_depth = -subprog->fastcall_stack_off;
+ subprog++;
+ modified = false;
+ }
+ }
+
+ return 0;
+}
+
static void free_states(struct bpf_verifier_env *env)
{
struct bpf_verifier_state_list *sl, *sln;
@@ -20046,7 +22189,6 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
state->first_insn_idx = env->subprog_info[subprog].start;
state->last_insn_idx = -1;
-
regs = state->frame[state->curframe]->regs;
if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
const char *sub_name = subprog_name(env, subprog);
@@ -20090,6 +22232,21 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
mark_reg_known_zero(env, regs, i);
reg->mem_size = arg->mem_size;
reg->id = ++env->id_gen;
+ } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) {
+ reg->type = PTR_TO_BTF_ID;
+ if (arg->arg_type & PTR_MAYBE_NULL)
+ reg->type |= PTR_MAYBE_NULL;
+ if (arg->arg_type & PTR_UNTRUSTED)
+ reg->type |= PTR_UNTRUSTED;
+ if (arg->arg_type & PTR_TRUSTED)
+ reg->type |= PTR_TRUSTED;
+ mark_reg_known_zero(env, regs, i);
+ reg->btf = bpf_get_btf_vmlinux(); /* can't fail at this point */
+ reg->btf_id = arg->btf_id;
+ reg->id = ++env->id_gen;
+ } else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) {
+ /* caller can pass either PTR_TO_ARENA or SCALAR */
+ mark_reg_unknown(env, regs, i);
} else {
WARN_ONCE(1, "BUG: unhandled arg#%d type %d\n",
i - BPF_REG_1, arg->arg_type);
@@ -20238,26 +22395,44 @@ static void print_verification_stats(struct bpf_verifier_env *env)
static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
{
const struct btf_type *t, *func_proto;
+ const struct bpf_struct_ops_desc *st_ops_desc;
const struct bpf_struct_ops *st_ops;
const struct btf_member *member;
struct bpf_prog *prog = env->prog;
u32 btf_id, member_idx;
+ struct btf *btf;
const char *mname;
+ int err;
if (!prog->gpl_compatible) {
verbose(env, "struct ops programs must have a GPL compatible license\n");
return -EINVAL;
}
+ if (!prog->aux->attach_btf_id)
+ return -ENOTSUPP;
+
+ btf = prog->aux->attach_btf;
+ if (btf_is_module(btf)) {
+ /* Make sure st_ops is valid through the lifetime of env */
+ env->attach_btf_mod = btf_try_get_module(btf);
+ if (!env->attach_btf_mod) {
+ verbose(env, "struct_ops module %s is not found\n",
+ btf_get_name(btf));
+ return -ENOTSUPP;
+ }
+ }
+
btf_id = prog->aux->attach_btf_id;
- st_ops = bpf_struct_ops_find(btf_id);
- if (!st_ops) {
+ st_ops_desc = bpf_struct_ops_find(btf, btf_id);
+ if (!st_ops_desc) {
verbose(env, "attach_btf_id %u is not a supported struct\n",
btf_id);
return -ENOTSUPP;
}
+ st_ops = st_ops_desc->st_ops;
- t = st_ops->type;
+ t = st_ops_desc->type;
member_idx = prog->expected_attach_type;
if (member_idx >= btf_type_vlen(t)) {
verbose(env, "attach to invalid member idx %u of struct %s\n",
@@ -20266,8 +22441,8 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
}
member = &btf_type_member(t)[member_idx];
- mname = btf_name_by_offset(btf_vmlinux, member->name_off);
- func_proto = btf_type_resolve_func_ptr(btf_vmlinux, member->type,
+ mname = btf_name_by_offset(btf, member->name_off);
+ func_proto = btf_type_resolve_func_ptr(btf, member->type,
NULL);
if (!func_proto) {
verbose(env, "attach to invalid member %s(@idx %u) of struct %s\n",
@@ -20275,8 +22450,15 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
return -EINVAL;
}
+ err = bpf_struct_ops_supported(st_ops, __btf_member_bit_offset(t, member) / 8);
+ if (err) {
+ verbose(env, "attach to unsupported member %s of struct %s\n",
+ mname, st_ops->name);
+ return err;
+ }
+
if (st_ops->check_member) {
- int err = st_ops->check_member(t, member, prog);
+ err = st_ops->check_member(t, member, prog);
if (err) {
verbose(env, "attach to unsupported member %s of struct %s\n",
@@ -20285,6 +22467,17 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
}
}
+ if (prog->aux->priv_stack_requested && !bpf_jit_supports_private_stack()) {
+ verbose(env, "Private stack not supported by jit\n");
+ return -EACCES;
+ }
+
+ /* btf_ctx_access() used this to provide argument type info */
+ prog->aux->ctx_arg_info =
+ st_ops_desc->arg_info[member_idx].info;
+ prog->aux->ctx_arg_info_size =
+ st_ops_desc->arg_info[member_idx].cnt;
+
prog->aux->attach_func_proto = func_proto;
prog->aux->attach_func_name = mname;
env->ops = st_ops->verifier_ops;
@@ -20310,8 +22503,12 @@ BTF_SET_START(btf_non_sleepable_error_inject)
* Assume non-sleepable from bpf safety point of view.
*/
BTF_ID(func, __filemap_add_folio)
+#ifdef CONFIG_FAIL_PAGE_ALLOC
BTF_ID(func, should_fail_alloc_page)
+#endif
+#ifdef CONFIG_FAILSLAB
BTF_ID(func, should_failslab)
+#endif
BTF_SET_END(btf_non_sleepable_error_inject)
static int check_non_sleepable_error_inject(u32 btf_id)
@@ -20327,11 +22524,13 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
{
bool prog_extension = prog->type == BPF_PROG_TYPE_EXT;
bool prog_tracing = prog->type == BPF_PROG_TYPE_TRACING;
+ char trace_symbol[KSYM_SYMBOL_LEN];
const char prefix[] = "btf_trace_";
+ struct bpf_raw_event_map *btp;
int ret = 0, subprog = -1, i;
const struct btf_type *t;
bool conservative = true;
- const char *tname;
+ const char *tname, *fname;
struct btf *btf;
long addr = 0;
struct module *mod = NULL;
@@ -20358,6 +22557,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
}
if (tgt_prog) {
struct bpf_prog_aux *aux = tgt_prog->aux;
+ bool tgt_changes_pkt_data;
if (bpf_prog_is_dev_bound(prog->aux) &&
!bpf_prog_dev_bound_match(prog, tgt_prog)) {
@@ -20392,6 +22592,14 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
"Extension programs should be JITed\n");
return -EINVAL;
}
+ tgt_changes_pkt_data = aux->func
+ ? aux->func[subprog]->aux->changes_pkt_data
+ : aux->changes_pkt_data;
+ if (prog->aux->changes_pkt_data && !tgt_changes_pkt_data) {
+ bpf_log(log,
+ "Extension program changes packet data, while original does not\n");
+ return -EINVAL;
+ }
}
if (!tgt_prog->jited) {
bpf_log(log, "Can attach to only JITed progs\n");
@@ -20462,10 +22670,34 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
return -EINVAL;
}
tname += sizeof(prefix) - 1;
- t = btf_type_by_id(btf, t->type);
- if (!btf_type_is_ptr(t))
- /* should never happen in valid vmlinux build */
+
+ /* The func_proto of "btf_trace_##tname" is generated from typedef without argument
+ * names. Thus using bpf_raw_event_map to get argument names.
+ */
+ btp = bpf_get_raw_tracepoint(tname);
+ if (!btp)
return -EINVAL;
+ fname = kallsyms_lookup((unsigned long)btp->bpf_func, NULL, NULL, NULL,
+ trace_symbol);
+ bpf_put_raw_tracepoint(btp);
+
+ if (fname)
+ ret = btf_find_by_name_kind(btf, fname, BTF_KIND_FUNC);
+
+ if (!fname || ret < 0) {
+ bpf_log(log, "Cannot find btf of tracepoint template, fall back to %s%s.\n",
+ prefix, tname);
+ t = btf_type_by_id(btf, t->type);
+ if (!btf_type_is_ptr(t))
+ /* should never happen in valid vmlinux build */
+ return -EINVAL;
+ } else {
+ t = btf_type_by_id(btf, ret);
+ if (!btf_type_is_func(t))
+ /* should never happen in valid vmlinux build */
+ return -EINVAL;
+ }
+
t = btf_type_by_id(btf, t->type);
if (!btf_type_is_func_proto(t))
/* should never happen in valid vmlinux build */
@@ -20542,7 +22774,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
}
}
- if (prog->aux->sleepable) {
+ if (prog->sleepable) {
ret = -EINVAL;
switch (prog->type) {
case BPF_PROG_TYPE_TRACING:
@@ -20653,14 +22885,14 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
u64 key;
if (prog->type == BPF_PROG_TYPE_SYSCALL) {
- if (prog->aux->sleepable)
+ if (prog->sleepable)
/* attach_btf_id checked to be zero already */
return 0;
verbose(env, "Syscall programs can only be sleepable\n");
return -EINVAL;
}
- if (prog->aux->sleepable && !can_be_sleepable(prog)) {
+ if (prog->sleepable && !can_be_sleepable(prog)) {
verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
return -EINVAL;
}
@@ -20737,6 +22969,73 @@ struct btf *bpf_get_btf_vmlinux(void)
return btf_vmlinux;
}
+/*
+ * The add_fd_from_fd_array() is executed only if fd_array_cnt is non-zero. In
+ * this case expect that every file descriptor in the array is either a map or
+ * a BTF. Everything else is considered to be trash.
+ */
+static int add_fd_from_fd_array(struct bpf_verifier_env *env, int fd)
+{
+ struct bpf_map *map;
+ struct btf *btf;
+ CLASS(fd, f)(fd);
+ int err;
+
+ map = __bpf_map_get(f);
+ if (!IS_ERR(map)) {
+ err = __add_used_map(env, map);
+ if (err < 0)
+ return err;
+ return 0;
+ }
+
+ btf = __btf_get_by_fd(f);
+ if (!IS_ERR(btf)) {
+ err = __add_used_btf(env, btf);
+ if (err < 0)
+ return err;
+ return 0;
+ }
+
+ verbose(env, "fd %d is not pointing to valid bpf_map or btf\n", fd);
+ return PTR_ERR(map);
+}
+
+static int process_fd_array(struct bpf_verifier_env *env, union bpf_attr *attr, bpfptr_t uattr)
+{
+ size_t size = sizeof(int);
+ int ret;
+ int fd;
+ u32 i;
+
+ env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel);
+
+ /*
+ * The only difference between old (no fd_array_cnt is given) and new
+ * APIs is that in the latter case the fd_array is expected to be
+ * continuous and is scanned for map fds right away
+ */
+ if (!attr->fd_array_cnt)
+ return 0;
+
+ /* Check for integer overflow */
+ if (attr->fd_array_cnt >= (U32_MAX / size)) {
+ verbose(env, "fd_array_cnt is too big (%u)\n", attr->fd_array_cnt);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < attr->fd_array_cnt; i++) {
+ if (copy_from_bpfptr_offset(&fd, env->fd_array, i * size, size))
+ return -EFAULT;
+
+ ret = add_fd_from_fd_array(env, fd);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
{
u64 start_time = ktime_get_ns();
@@ -20752,7 +23051,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
/* 'struct bpf_verifier_env' can be global, but since it's not small,
* allocate/free it every time bpf_check() is called
*/
- env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
+ env = kvzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
if (!env)
return -ENOMEM;
@@ -20768,8 +23067,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
env->insn_aux_data[i].orig_idx = i;
env->prog = *prog;
env->ops = bpf_verifier_ops[env->prog->type];
- env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel);
- is_priv = bpf_capable();
+
+ env->allow_ptr_leaks = bpf_allow_ptr_leaks(env->prog->aux->token);
+ env->allow_uninit_stack = bpf_allow_uninit_stack(env->prog->aux->token);
+ env->bypass_spec_v1 = bpf_bypass_spec_v1(env->prog->aux->token);
+ env->bypass_spec_v4 = bpf_bypass_spec_v4(env->prog->aux->token);
+ env->bpf_capable = is_priv = bpf_token_capable(env->prog->aux->token, CAP_BPF);
bpf_get_btf_vmlinux();
@@ -20786,6 +23089,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (ret)
goto err_unlock;
+ ret = process_fd_array(env, attr, uattr);
+ if (ret)
+ goto skip_full_check;
+
mark_verifier_state_clean(env);
if (IS_ERR(btf_vmlinux)) {
@@ -20801,12 +23108,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
env->strict_alignment = false;
- env->allow_ptr_leaks = bpf_allow_ptr_leaks();
- env->allow_uninit_stack = bpf_allow_uninit_stack();
- env->bypass_spec_v1 = bpf_bypass_spec_v1();
- env->bypass_spec_v4 = bpf_bypass_spec_v4();
- env->bpf_capable = bpf_capable();
-
if (is_priv)
env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS;
@@ -20834,10 +23135,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (ret < 0)
goto skip_full_check;
- ret = check_attach_btf_id(env);
- if (ret)
- goto skip_full_check;
-
ret = resolve_pseudo_ldimm64(env);
if (ret < 0)
goto skip_full_check;
@@ -20852,6 +23149,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (ret < 0)
goto skip_full_check;
+ ret = check_attach_btf_id(env);
+ if (ret)
+ goto skip_full_check;
+
+ ret = mark_fastcall_patterns(env);
+ if (ret < 0)
+ goto skip_full_check;
+
ret = do_check_main(env);
ret = ret ?: do_check_subprogs(env);
@@ -20861,6 +23166,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
skip_full_check:
kvfree(env->explored_states);
+ /* might decrease stack depth, keep it before passes that
+ * allocate additional slots.
+ */
+ if (ret == 0)
+ ret = remove_fastcall_spills_fills(env);
+
if (ret == 0)
ret = check_max_stack_depth(env);
@@ -20972,11 +23283,14 @@ err_release_maps:
env->prog->expected_attach_type = 0;
*prog = env->prog;
+
+ module_put(env->attach_btf_mod);
err_unlock:
if (!is_priv)
mutex_unlock(&bpf_verifier_lock);
vfree(env->insn_aux_data);
+ kvfree(env->insn_hist);
err_free_env:
- kfree(env);
+ kvfree(env);
return ret;
}