summaryrefslogtreecommitdiff
path: root/kernel/bpf
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/bpf')
-rw-r--r--kernel/bpf/Kconfig4
-rw-r--r--kernel/bpf/Makefile21
-rw-r--r--kernel/bpf/arena.c77
-rw-r--r--kernel/bpf/arraymap.c103
-rw-r--r--kernel/bpf/bpf_cgrp_storage.c32
-rw-r--r--kernel/bpf/bpf_inode_storage.c42
-rw-r--r--kernel/bpf/bpf_iter.c14
-rw-r--r--kernel/bpf/bpf_local_storage.c123
-rw-r--r--kernel/bpf/bpf_lsm.c74
-rw-r--r--kernel/bpf/bpf_struct_ops.c355
-rw-r--r--kernel/bpf/bpf_task_storage.c23
-rw-r--r--kernel/bpf/btf.c1129
-rw-r--r--kernel/bpf/btf_iter.c2
-rw-r--r--kernel/bpf/btf_relocate.c2
-rw-r--r--kernel/bpf/cgroup.c124
-rw-r--r--kernel/bpf/core.c286
-rw-r--r--kernel/bpf/cpumap.c185
-rw-r--r--kernel/bpf/cpumask.c56
-rw-r--r--kernel/bpf/crypto.c393
-rw-r--r--kernel/bpf/devmap.c85
-rw-r--r--kernel/bpf/disasm.c30
-rw-r--r--kernel/bpf/dispatcher.c3
-rw-r--r--kernel/bpf/dmabuf_iter.c150
-rw-r--r--kernel/bpf/hashtab.c416
-rw-r--r--kernel/bpf/helpers.c1066
-rw-r--r--kernel/bpf/inode.c19
-rw-r--r--kernel/bpf/kmem_cache_iter.c238
-rw-r--r--kernel/bpf/local_storage.c4
-rw-r--r--kernel/bpf/log.c34
-rw-r--r--kernel/bpf/lpm_trie.c153
-rw-r--r--kernel/bpf/map_in_map.c42
-rw-r--r--kernel/bpf/memalloc.c46
-rw-r--r--kernel/bpf/offload.c11
-rw-r--r--kernel/bpf/percpu_freelist.c113
-rw-r--r--kernel/bpf/percpu_freelist.h4
-rw-r--r--kernel/bpf/preload/bpf_preload_kern.c2
-rw-r--r--kernel/bpf/queue_stack_maps.c35
-rw-r--r--kernel/bpf/range_tree.c270
-rw-r--r--kernel/bpf/range_tree.h21
-rw-r--r--kernel/bpf/relo_core.c2
-rw-r--r--kernel/bpf/reuseport_array.c2
-rw-r--r--kernel/bpf/ringbuf.c54
-rw-r--r--kernel/bpf/rqspinlock.c737
-rw-r--r--kernel/bpf/rqspinlock.h48
-rw-r--r--kernel/bpf/stackmap.c131
-rw-r--r--kernel/bpf/syscall.c709
-rw-r--r--kernel/bpf/sysfs_btf.c44
-rw-r--r--kernel/bpf/task_iter.c15
-rw-r--r--kernel/bpf/token.c79
-rw-r--r--kernel/bpf/trampoline.c80
-rw-r--r--kernel/bpf/verifier.c6160
51 files changed, 10392 insertions, 3456 deletions
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index bc25f5098a25..17067dcb4386 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -28,7 +28,7 @@ config BPF_SYSCALL
bool "Enable bpf() system call"
select BPF
select IRQ_WORK
- select TASKS_RCU if PREEMPTION
+ select NEED_TASKS_RCU
select TASKS_TRACE_RCU
select BINARY_PRINTF
select NET_SOCK_MSG if NET
@@ -43,7 +43,7 @@ config BPF_JIT
bool "Enable BPF Just In Time compiler"
depends on BPF
depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT
- depends on MODULES
+ select EXECMEM
help
BPF programs are normally handled by a BPF interpreter. This option
allows the kernel to generate native code when a program is loaded
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e497011261b8..3a335c50e6e3 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -14,9 +14,9 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
-obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o
+obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o
ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy)
-obj-$(CONFIG_BPF_SYSCALL) += arena.o
+obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o
endif
obj-$(CONFIG_BPF_JIT) += dispatcher.o
ifeq ($(CONFIG_NET),y)
@@ -44,8 +44,21 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
obj-$(CONFIG_BPF_SYSCALL) += cpumask.o
obj-${CONFIG_BPF_LSM} += bpf_lsm.o
endif
+ifneq ($(CONFIG_CRYPTO),)
+obj-$(CONFIG_BPF_SYSCALL) += crypto.o
+endif
obj-$(CONFIG_BPF_PRELOAD) += preload/
obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
-$(obj)/relo_core.o: $(srctree)/tools/lib/bpf/relo_core.c FORCE
- $(call if_changed_rule,cc_o_c)
+obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o
+obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o
+ifeq ($(CONFIG_DMA_SHARED_BUFFER),y)
+obj-$(CONFIG_BPF_SYSCALL) += dmabuf_iter.o
+endif
+
+CFLAGS_REMOVE_percpu_freelist.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_queue_stack_maps.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_lpm_trie.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_ringbuf.o = $(CC_FLAGS_FTRACE)
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 343c3456c8dd..0d56cea71602 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -3,9 +3,11 @@
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
+#include "linux/filter.h"
#include <linux/btf_ids.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
+#include "range_tree.h"
/*
* bpf_arena is a sparsely populated shared memory region between bpf program and
@@ -37,7 +39,7 @@
*/
/* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */
-#define GUARD_SZ (1ull << sizeof(((struct bpf_insn *)0)->off) * 8)
+#define GUARD_SZ round_up(1ull << sizeof_field(struct bpf_insn, off) * 8, PAGE_SIZE << 1)
#define KERN_VM_SZ (SZ_4G + GUARD_SZ)
struct bpf_arena {
@@ -45,7 +47,7 @@ struct bpf_arena {
u64 user_vm_start;
u64 user_vm_end;
struct vm_struct *kern_vm;
- struct maple_tree mt;
+ struct range_tree rt;
struct list_head vma_list;
struct mutex lock;
};
@@ -98,6 +100,9 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
u64 vm_range;
int err = -ENOMEM;
+ if (!bpf_jit_supports_arena())
+ return ERR_PTR(-EOPNOTSUPP);
+
if (attr->key_size || attr->value_size || attr->max_entries == 0 ||
/* BPF_F_MMAPABLE must be set */
!(attr->map_flags & BPF_F_MMAPABLE) ||
@@ -132,7 +137,12 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
INIT_LIST_HEAD(&arena->vma_list);
bpf_map_init_from_attr(&arena->map, attr);
- mt_init_flags(&arena->mt, MT_FLAGS_ALLOC_RANGE);
+ range_tree_init(&arena->rt);
+ err = range_tree_set(&arena->rt, 0, attr->max_entries);
+ if (err) {
+ bpf_map_area_free(arena);
+ goto err;
+ }
mutex_init(&arena->lock);
return &arena->map;
@@ -183,7 +193,7 @@ static void arena_map_free(struct bpf_map *map)
apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
free_vm_area(arena->kern_vm);
- mtree_destroy(&arena->mt);
+ range_tree_destroy(&arena->rt);
bpf_map_area_free(arena);
}
@@ -212,6 +222,7 @@ static u64 arena_map_mem_usage(const struct bpf_map *map)
struct vma_list {
struct vm_area_struct *vma;
struct list_head head;
+ refcount_t mmap_count;
};
static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma)
@@ -221,27 +232,35 @@ static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma)
vml = kmalloc(sizeof(*vml), GFP_KERNEL);
if (!vml)
return -ENOMEM;
+ refcount_set(&vml->mmap_count, 1);
vma->vm_private_data = vml;
vml->vma = vma;
list_add(&vml->head, &arena->vma_list);
return 0;
}
+static void arena_vm_open(struct vm_area_struct *vma)
+{
+ struct vma_list *vml = vma->vm_private_data;
+
+ refcount_inc(&vml->mmap_count);
+}
+
static void arena_vm_close(struct vm_area_struct *vma)
{
struct bpf_map *map = vma->vm_file->private_data;
struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
- struct vma_list *vml;
+ struct vma_list *vml = vma->vm_private_data;
+ if (!refcount_dec_and_test(&vml->mmap_count))
+ return;
guard(mutex)(&arena->lock);
- vml = vma->vm_private_data;
+ /* update link list under lock */
list_del(&vml->head);
vma->vm_private_data = NULL;
kfree(vml);
}
-#define MT_ENTRY ((void *)&arena_map_ops) /* unused. has to be valid pointer */
-
static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
{
struct bpf_map *map = vmf->vma->vm_file->private_data;
@@ -251,7 +270,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
int ret;
kbase = bpf_arena_get_kern_vm_start(arena);
- kaddr = kbase + (u32)(vmf->address & PAGE_MASK);
+ kaddr = kbase + (u32)(vmf->address);
guard(mutex)(&arena->lock);
page = vmalloc_to_page((void *)kaddr);
@@ -263,20 +282,20 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
/* User space requested to segfault when page is not allocated by bpf prog */
return VM_FAULT_SIGSEGV;
- ret = mtree_insert(&arena->mt, vmf->pgoff, MT_ENTRY, GFP_KERNEL);
+ ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
if (ret)
return VM_FAULT_SIGSEGV;
/* Account into memcg of the process that created bpf_arena */
- ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page);
+ ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
if (ret) {
- mtree_erase(&arena->mt, vmf->pgoff);
+ range_tree_set(&arena->rt, vmf->pgoff, 1);
return VM_FAULT_SIGSEGV;
}
ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page);
if (ret) {
- mtree_erase(&arena->mt, vmf->pgoff);
+ range_tree_set(&arena->rt, vmf->pgoff, 1);
__free_page(page);
return VM_FAULT_SIGSEGV;
}
@@ -287,6 +306,7 @@ out:
}
static const struct vm_operations_struct arena_vm_ops = {
+ .open = arena_vm_open,
.close = arena_vm_close,
.fault = arena_vm_fault,
};
@@ -314,7 +334,7 @@ static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long ad
return -EINVAL;
}
- ret = current->mm->get_unmapped_area(filp, addr, len * 2, 0, flags);
+ ret = mm_get_unmapped_area(current->mm, filp, addr, len * 2, 0, flags);
if (IS_ERR_VALUE(ret))
return ret;
if ((ret >> 32) == ((ret + len - 1) >> 32))
@@ -425,24 +445,27 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
return 0;
}
- /* zeroing is needed, since alloc_pages_bulk_array() only fills in non-zero entries */
+ /* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */
pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL);
if (!pages)
return 0;
guard(mutex)(&arena->lock);
- if (uaddr)
- ret = mtree_insert_range(&arena->mt, pgoff, pgoff + page_cnt - 1,
- MT_ENTRY, GFP_KERNEL);
- else
- ret = mtree_alloc_range(&arena->mt, &pgoff, MT_ENTRY,
- page_cnt, 0, page_cnt_max - 1, GFP_KERNEL);
+ if (uaddr) {
+ ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
+ if (ret)
+ goto out_free_pages;
+ ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
+ } else {
+ ret = pgoff = range_tree_find(&arena->rt, page_cnt);
+ if (pgoff >= 0)
+ ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
+ }
if (ret)
goto out_free_pages;
- ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO,
- node_id, page_cnt, pages);
+ ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
if (ret)
goto out;
@@ -464,7 +487,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
kvfree(pages);
return clear_lo32(arena->user_vm_start) + uaddr32;
out:
- mtree_erase(&arena->mt, pgoff);
+ range_tree_set(&arena->rt, pgoff, page_cnt);
out_free_pages:
kvfree(pages);
return 0;
@@ -504,7 +527,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
pgoff = compute_pgoff(arena, uaddr);
/* clear range */
- mtree_store_range(&arena->mt, pgoff, pgoff + page_cnt - 1, NULL, GFP_KERNEL);
+ range_tree_set(&arena->rt, pgoff, page_cnt);
if (page_cnt > 1)
/* bulk zap if multiple pages being freed */
@@ -553,8 +576,8 @@ __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(arena_kfuncs)
-BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE)
-BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_RET | KF_ARENA_ARG2)
+BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2)
BTF_KFUNCS_END(arena_kfuncs)
static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 13358675ff2e..eb28c0f219ee 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -73,6 +73,9 @@ int array_map_alloc_check(union bpf_attr *attr)
/* avoid overflow on round_up(map->value_size) */
if (attr->value_size > INT_MAX)
return -E2BIG;
+ /* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */
+ if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE)
+ return -E2BIG;
return 0;
}
@@ -246,6 +249,38 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
return this_cpu_ptr(array->pptrs[index & array->index_mask]);
}
+/* emit BPF instructions equivalent to C code of percpu_array_map_lookup_elem() */
+static int percpu_array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_insn *insn = insn_buf;
+
+ if (!bpf_jit_supports_percpu_insn())
+ return -EOPNOTSUPP;
+
+ if (map->map_flags & BPF_F_INNER_MAP)
+ return -EOPNOTSUPP;
+
+ BUILD_BUG_ON(offsetof(struct bpf_array, map) != 0);
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct bpf_array, pptrs));
+
+ *insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 0);
+ if (!map->bypass_spec_v1) {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 6);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_0, array->index_mask);
+ } else {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 5);
+ }
+
+ *insn++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
+ *insn++ = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1);
+ *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
+ *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
+ *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *insn++ = BPF_MOV64_IMM(BPF_REG_0, 0);
+ return insn - insn_buf;
+}
+
static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
@@ -396,17 +431,22 @@ static void *array_map_vmalloc_addr(struct bpf_array *array)
return (void *)round_down((unsigned long)array, PAGE_SIZE);
}
-static void array_map_free_timers(struct bpf_map *map)
+static void array_map_free_timers_wq(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
- /* We don't reset or free fields other than timer on uref dropping to zero. */
- if (!btf_record_has_field(map->record, BPF_TIMER))
- return;
-
- for (i = 0; i < array->map.max_entries; i++)
- bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i));
+ /* We don't reset or free fields other than timer and workqueue
+ * on uref dropping to zero.
+ */
+ if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE)) {
+ for (i = 0; i < array->map.max_entries; i++) {
+ if (btf_record_has_field(map->record, BPF_TIMER))
+ bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i));
+ if (btf_record_has_field(map->record, BPF_WORKQUEUE))
+ bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i));
+ }
+ }
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
@@ -457,7 +497,7 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key,
if (map->btf_key_type_id)
seq_printf(m, "%u: ", *(u32 *)key);
btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
rcu_read_unlock();
}
@@ -478,7 +518,7 @@ static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key,
seq_printf(m, "\tcpu%d: ", cpu);
btf_type_seq_show(map->btf, map->btf_value_type_id,
per_cpu_ptr(pptr, cpu), m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
}
seq_puts(m, "}\n");
@@ -563,7 +603,7 @@ static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos)
array = container_of(map, struct bpf_array, map);
index = info->index & array->index_mask;
if (info->percpu_value_buf)
- return array->pptrs[index];
+ return (void *)(uintptr_t)array->pptrs[index];
return array_map_elem_ptr(array, index);
}
@@ -582,7 +622,7 @@ static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
array = container_of(map, struct bpf_array, map);
index = info->index & array->index_mask;
if (info->percpu_value_buf)
- return array->pptrs[index];
+ return (void *)(uintptr_t)array->pptrs[index];
return array_map_elem_ptr(array, index);
}
@@ -595,7 +635,7 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
struct bpf_iter_meta meta;
struct bpf_prog *prog;
int off = 0, cpu = 0;
- void __percpu **pptr;
+ void __percpu *pptr;
u32 size;
meta.seq = seq;
@@ -611,7 +651,7 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
if (!info->percpu_value_buf) {
ctx.value = v;
} else {
- pptr = v;
+ pptr = (void __percpu *)(uintptr_t)v;
size = array->elem_size;
for_each_possible_cpu(cpu) {
copy_map_value_long(map, info->percpu_value_buf + off,
@@ -695,13 +735,13 @@ static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback
u64 ret = 0;
void *val;
+ cant_migrate();
+
if (flags != 0)
return -EINVAL;
is_percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
array = container_of(map, struct bpf_array, map);
- if (is_percpu)
- migrate_disable();
for (i = 0; i < map->max_entries; i++) {
if (is_percpu)
val = this_cpu_ptr(array->pptrs[i]);
@@ -716,8 +756,6 @@ static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback
break;
}
- if (is_percpu)
- migrate_enable();
return num_elems;
}
@@ -750,7 +788,7 @@ const struct bpf_map_ops array_map_ops = {
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
- .map_release_uref = array_map_free_timers,
+ .map_release_uref = array_map_free_timers_wq,
.map_lookup_elem = array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
@@ -776,6 +814,7 @@ const struct bpf_map_ops percpu_array_map_ops = {
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = percpu_array_map_lookup_elem,
+ .map_gen_lookup = percpu_array_map_gen_lookup,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
.map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem,
@@ -906,22 +945,44 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map,
struct file *map_file, int fd)
{
struct bpf_prog *prog = bpf_prog_get(fd);
+ bool is_extended;
if (IS_ERR(prog))
return prog;
- if (!bpf_prog_map_compatible(map, prog)) {
+ if (prog->type == BPF_PROG_TYPE_EXT ||
+ !bpf_prog_map_compatible(map, prog)) {
bpf_prog_put(prog);
return ERR_PTR(-EINVAL);
}
+ mutex_lock(&prog->aux->ext_mutex);
+ is_extended = prog->aux->is_extended;
+ if (!is_extended)
+ prog->aux->prog_array_member_cnt++;
+ mutex_unlock(&prog->aux->ext_mutex);
+ if (is_extended) {
+ /* Extended prog can not be tail callee. It's to prevent a
+ * potential infinite loop like:
+ * tail callee prog entry -> tail callee prog subprog ->
+ * freplace prog entry --tailcall-> tail callee prog entry.
+ */
+ bpf_prog_put(prog);
+ return ERR_PTR(-EBUSY);
+ }
+
return prog;
}
static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
+ struct bpf_prog *prog = ptr;
+
+ mutex_lock(&prog->aux->ext_mutex);
+ prog->aux->prog_array_member_cnt--;
+ mutex_unlock(&prog->aux->ext_mutex);
/* bpf_prog is freed after one RCU or tasks trace grace period */
- bpf_prog_put(ptr);
+ bpf_prog_put(prog);
}
static u32 prog_fd_array_sys_lookup_elem(void *ptr)
@@ -955,7 +1016,7 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
prog_id = prog_fd_array_sys_lookup_elem(ptr);
btf_type_seq_show(map->btf, map->btf_value_type_id,
&prog_id, m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
}
}
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index 28efd0a3f220..148da8f7ff36 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -15,22 +15,20 @@ static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy);
static void bpf_cgrp_storage_lock(void)
{
- migrate_disable();
+ cant_migrate();
this_cpu_inc(bpf_cgrp_storage_busy);
}
static void bpf_cgrp_storage_unlock(void)
{
this_cpu_dec(bpf_cgrp_storage_busy);
- migrate_enable();
}
static bool bpf_cgrp_storage_trylock(void)
{
- migrate_disable();
+ cant_migrate();
if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) {
this_cpu_dec(bpf_cgrp_storage_busy);
- migrate_enable();
return false;
}
return true;
@@ -47,17 +45,18 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup)
{
struct bpf_local_storage *local_storage;
+ migrate_disable();
rcu_read_lock();
local_storage = rcu_dereference(cgroup->bpf_cgrp_storage);
- if (!local_storage) {
- rcu_read_unlock();
- return;
- }
+ if (!local_storage)
+ goto out;
bpf_cgrp_storage_lock();
bpf_local_storage_destroy(local_storage);
bpf_cgrp_storage_unlock();
+out:
rcu_read_unlock();
+ migrate_enable();
}
static struct bpf_local_storage_data *
@@ -107,7 +106,7 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
bpf_cgrp_storage_lock();
sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
- value, map_flags, GFP_ATOMIC);
+ value, map_flags, false, GFP_ATOMIC);
bpf_cgrp_storage_unlock();
cgroup_put(cgroup);
return PTR_ERR_OR_ZERO(sdata);
@@ -154,7 +153,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
static void cgroup_storage_map_free(struct bpf_map *map)
{
- bpf_local_storage_map_free(map, &cgroup_cache, NULL);
+ bpf_local_storage_map_free(map, &cgroup_cache, &bpf_cgrp_storage_busy);
}
/* *gfp_flags* is a hidden argument provided by the verifier */
@@ -162,6 +161,7 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
void *, value, u64, flags, gfp_t, gfp_flags)
{
struct bpf_local_storage_data *sdata;
+ bool nobusy;
WARN_ON_ONCE(!bpf_rcu_lock_held());
if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
@@ -170,21 +170,21 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
if (!cgroup)
return (unsigned long)NULL;
- if (!bpf_cgrp_storage_trylock())
- return (unsigned long)NULL;
+ nobusy = bpf_cgrp_storage_trylock();
- sdata = cgroup_storage_lookup(cgroup, map, true);
+ sdata = cgroup_storage_lookup(cgroup, map, nobusy);
if (sdata)
goto unlock;
/* only allocate new storage, when the cgroup is refcounted */
if (!percpu_ref_is_dying(&cgroup->self.refcnt) &&
- (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
+ (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy)
sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
- value, BPF_NOEXIST, gfp_flags);
+ value, BPF_NOEXIST, false, gfp_flags);
unlock:
- bpf_cgrp_storage_unlock();
+ if (nobusy)
+ bpf_cgrp_storage_unlock();
return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data;
}
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index b0ef45db207c..15a3eb9b02d9 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -16,7 +16,6 @@
#include <uapi/linux/btf.h>
#include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
-#include <linux/fdtable.h>
#include <linux/rcupdate_trace.h>
DEFINE_BPF_STORAGE_CACHE(inode_cache);
@@ -63,28 +62,28 @@ void bpf_inode_storage_free(struct inode *inode)
if (!bsb)
return;
+ migrate_disable();
rcu_read_lock();
local_storage = rcu_dereference(bsb->storage);
- if (!local_storage) {
- rcu_read_unlock();
- return;
- }
+ if (!local_storage)
+ goto out;
bpf_local_storage_destroy(local_storage);
+out:
rcu_read_unlock();
+ migrate_enable();
}
static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_local_storage_data *sdata;
- struct fd f = fdget_raw(*(int *)key);
+ CLASS(fd_raw, f)(*(int *)key);
- if (!f.file)
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
- sdata = inode_storage_lookup(file_inode(f.file), map, true);
- fdput(f);
+ sdata = inode_storage_lookup(file_inode(fd_file(f)), map, true);
return sdata ? sdata->data : NULL;
}
@@ -92,19 +91,16 @@ static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags)
{
struct bpf_local_storage_data *sdata;
- struct fd f = fdget_raw(*(int *)key);
+ CLASS(fd_raw, f)(*(int *)key);
- if (!f.file)
+ if (fd_empty(f))
return -EBADF;
- if (!inode_storage_ptr(file_inode(f.file))) {
- fdput(f);
+ if (!inode_storage_ptr(file_inode(fd_file(f))))
return -EBADF;
- }
- sdata = bpf_local_storage_update(file_inode(f.file),
+ sdata = bpf_local_storage_update(file_inode(fd_file(f)),
(struct bpf_local_storage_map *)map,
- value, map_flags, GFP_ATOMIC);
- fdput(f);
+ value, map_flags, false, GFP_ATOMIC);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -123,15 +119,11 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
{
- struct fd f = fdget_raw(*(int *)key);
- int err;
+ CLASS(fd_raw, f)(*(int *)key);
- if (!f.file)
+ if (fd_empty(f))
return -EBADF;
-
- err = inode_storage_delete(file_inode(f.file), map);
- fdput(f);
- return err;
+ return inode_storage_delete(file_inode(fd_file(f)), map);
}
/* *gfp_flags* is a hidden argument provided by the verifier */
@@ -162,7 +154,7 @@ BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
sdata = bpf_local_storage_update(
inode, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST, gfp_flags);
+ BPF_NOEXIST, false, gfp_flags);
return IS_ERR(sdata) ? (unsigned long)NULL :
(unsigned long)sdata->data;
}
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 112581cf97e7..380e9a7cac75 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -283,7 +283,6 @@ static int iter_release(struct inode *inode, struct file *file)
const struct file_operations bpf_iter_fops = {
.open = iter_open,
- .llseek = no_llseek,
.read = bpf_seq_read,
.release = iter_release,
};
@@ -336,7 +335,7 @@ static void cache_btf_id(struct bpf_iter_target_info *tinfo,
tinfo->btf_id = prog->aux->attach_btf_id;
}
-bool bpf_iter_prog_supported(struct bpf_prog *prog)
+int bpf_iter_prog_supported(struct bpf_prog *prog)
{
const char *attach_fname = prog->aux->attach_func_name;
struct bpf_iter_target_info *tinfo = NULL, *iter;
@@ -345,7 +344,7 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog)
int prefix_len = strlen(prefix);
if (strncmp(attach_fname, prefix, prefix_len))
- return false;
+ return -EINVAL;
mutex_lock(&targets_mutex);
list_for_each_entry(iter, &targets, list) {
@@ -361,12 +360,11 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog)
}
mutex_unlock(&targets_mutex);
- if (tinfo) {
- prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size;
- prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info;
- }
+ if (!tinfo)
+ return -EINVAL;
- return tinfo != NULL;
+ return bpf_prog_ctx_arg_info_init(prog, tinfo->reg_info->ctx_arg_info,
+ tinfo->reg_info->ctx_arg_info_size);
}
const struct bpf_func_proto *
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index bdea1a459153..fa56c30833ff 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -73,7 +73,7 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
- void *value, bool charge_mem, gfp_t gfp_flags)
+ void *value, bool charge_mem, bool swap_uptrs, gfp_t gfp_flags)
{
struct bpf_local_storage_elem *selem;
@@ -81,9 +81,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
return NULL;
if (smap->bpf_ma) {
- migrate_disable();
selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags);
- migrate_enable();
if (selem)
/* Keep the original bpf_map_kzalloc behavior
* before started using the bpf_mem_cache_alloc.
@@ -99,9 +97,12 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
}
if (selem) {
- if (value)
+ if (value) {
+ /* No need to call check_and_init_map_value as memory is zero init */
copy_map_value(&smap->map, SDATA(selem)->data, value);
- /* No need to call check_and_init_map_value as memory is zero init */
+ if (swap_uptrs)
+ bpf_obj_swap_uptrs(smap->map.record, SDATA(selem)->data, value);
+ }
return selem;
}
@@ -171,17 +172,14 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
return;
}
- if (smap) {
- migrate_disable();
+ if (smap)
bpf_mem_cache_free(&smap->storage_ma, local_storage);
- migrate_enable();
- } else {
+ else
/* smap could be NULL if the selem that triggered
* this 'local_storage' creation had been long gone.
* In this case, directly do call_rcu().
*/
call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
- }
}
/* rcu tasks trace callback for bpf_ma == false */
@@ -209,8 +207,15 @@ static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
static void bpf_selem_free_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_map *smap;
selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+ /* The bpf_local_storage_map_free will wait for rcu_barrier */
+ smap = rcu_dereference_check(SDATA(selem)->smap, 1);
+
+ migrate_disable();
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+ migrate_enable();
bpf_mem_cache_raw_free(selem);
}
@@ -226,23 +231,50 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
struct bpf_local_storage_map *smap,
bool reuse_now)
{
- bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
-
if (!smap->bpf_ma) {
+ /* Only task storage has uptrs and task storage
+ * has moved to bpf_mem_alloc. Meaning smap->bpf_ma == true
+ * for task storage, so this bpf_obj_free_fields() won't unpin
+ * any uptr.
+ */
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
__bpf_selem_free(selem, reuse_now);
return;
}
- if (!reuse_now) {
- call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
- } else {
+ if (reuse_now) {
+ /* reuse_now == true only happens when the storage owner
+ * (e.g. task_struct) is being destructed or the map itself
+ * is being destructed (ie map_free). In both cases,
+ * no bpf prog can have a hold on the selem. It is
+ * safe to unpin the uptrs and free the selem now.
+ */
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
/* Instead of using the vanilla call_rcu(),
* bpf_mem_cache_free will be able to reuse selem
* immediately.
*/
- migrate_disable();
bpf_mem_cache_free(&smap->selem_ma, selem);
- migrate_enable();
+ return;
+ }
+
+ call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
+}
+
+static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
+{
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_map *smap;
+ struct hlist_node *n;
+
+ /* The "_safe" iteration is needed.
+ * The loop is not removing the selem from the list
+ * but bpf_selem_free will use the selem->rcu_head
+ * which is union-ized with the selem->free_node.
+ */
+ hlist_for_each_entry_safe(selem, n, list, free_node) {
+ smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+ bpf_selem_free(selem, smap, reuse_now);
}
}
@@ -252,7 +284,7 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
*/
static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
struct bpf_local_storage_elem *selem,
- bool uncharge_mem, bool reuse_now)
+ bool uncharge_mem, struct hlist_head *free_selem_list)
{
struct bpf_local_storage_map *smap;
bool free_local_storage;
@@ -296,7 +328,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
SDATA(selem))
RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
- bpf_selem_free(selem, smap, reuse_now);
+ hlist_add_head(&selem->free_node, free_selem_list);
if (rcu_access_pointer(local_storage->smap) == smap)
RCU_INIT_POINTER(local_storage->smap, NULL);
@@ -318,7 +350,7 @@ static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage,
*
* If the local_storage->list is already empty, the caller will not
* care about the bpf_ma value also because the caller is not
- * responsibile to free the local_storage.
+ * responsible to free the local_storage.
*/
if (storage_smap)
@@ -345,6 +377,7 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
struct bpf_local_storage_map *storage_smap;
struct bpf_local_storage *local_storage;
bool bpf_ma, free_local_storage = false;
+ HLIST_HEAD(selem_free_list);
unsigned long flags;
if (unlikely(!selem_linked_to_storage_lockless(selem)))
@@ -360,9 +393,11 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
raw_spin_lock_irqsave(&local_storage->lock, flags);
if (likely(selem_linked_to_storage(selem)))
free_local_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, true, reuse_now);
+ local_storage, selem, true, &selem_free_list);
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ bpf_selem_free_list(&selem_free_list, reuse_now);
+
if (free_local_storage)
bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now);
}
@@ -458,15 +493,11 @@ int bpf_local_storage_alloc(void *owner,
if (err)
return err;
- if (smap->bpf_ma) {
- migrate_disable();
+ if (smap->bpf_ma)
storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags);
- migrate_enable();
- } else {
+ else
storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
gfp_flags | __GFP_NOWARN);
- }
-
if (!storage) {
err = -ENOMEM;
goto uncharge;
@@ -524,11 +555,12 @@ uncharge:
*/
struct bpf_local_storage_data *
bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
- void *value, u64 map_flags, gfp_t gfp_flags)
+ void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags)
{
struct bpf_local_storage_data *old_sdata = NULL;
struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
struct bpf_local_storage *local_storage;
+ HLIST_HEAD(old_selem_free_list);
unsigned long flags;
int err;
@@ -550,7 +582,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (err)
return ERR_PTR(err);
- selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
+ selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags);
if (!selem)
return ERR_PTR(-ENOMEM);
@@ -584,7 +616,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
/* A lookup has just been done before and concluded a new selem is
* needed. The chance of an unnecessary alloc is unlikely.
*/
- alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
+ alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags);
if (!alloc_selem)
return ERR_PTR(-ENOMEM);
@@ -624,11 +656,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (old_sdata) {
bpf_selem_unlink_map(SELEM(old_sdata));
bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
- true, false);
+ true, &old_selem_free_list);
}
unlock:
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ bpf_selem_free_list(&old_selem_free_list, false);
if (alloc_selem) {
mem_uncharge(smap, owner, smap->elem_size);
bpf_selem_free(alloc_selem, smap, true);
@@ -706,6 +739,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
struct bpf_local_storage_map *storage_smap;
struct bpf_local_storage_elem *selem;
bool bpf_ma, free_storage = false;
+ HLIST_HEAD(free_selem_list);
struct hlist_node *n;
unsigned long flags;
@@ -734,10 +768,12 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
* of the loop will set the free_cgroup_storage to true.
*/
free_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, true, true);
+ local_storage, selem, true, &free_selem_list);
}
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ bpf_selem_free_list(&free_selem_list, true);
+
if (free_storage)
bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true);
}
@@ -782,8 +818,8 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
nbuckets = max_t(u32, 2, nbuckets);
smap->bucket_log = ilog2(nbuckets);
- smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
- nbuckets, GFP_USER | __GFP_NOWARN);
+ smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets,
+ sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN);
if (!smap->buckets) {
err = -ENOMEM;
goto free_smap;
@@ -797,8 +833,12 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
smap->elem_size = offsetof(struct bpf_local_storage_elem,
sdata.data[attr->value_size]);
- smap->bpf_ma = bpf_ma;
- if (bpf_ma) {
+ /* In PREEMPT_RT, kmalloc(GFP_ATOMIC) is still not safe in non
+ * preemptible context. Thus, enforce all storages to use
+ * bpf_mem_alloc when CONFIG_PREEMPT_RT is enabled.
+ */
+ smap->bpf_ma = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : bpf_ma;
+ if (smap->bpf_ma) {
err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false);
if (err)
goto free_smap;
@@ -854,15 +894,11 @@ void bpf_local_storage_map_free(struct bpf_map *map,
while ((selem = hlist_entry_safe(
rcu_dereference_raw(hlist_first_rcu(&b->list)),
struct bpf_local_storage_elem, map_node))) {
- if (busy_counter) {
- migrate_disable();
+ if (busy_counter)
this_cpu_inc(*busy_counter);
- }
bpf_selem_unlink(selem, true);
- if (busy_counter) {
+ if (busy_counter)
this_cpu_dec(*busy_counter);
- migrate_enable();
- }
cond_resched_rcu();
}
rcu_read_unlock();
@@ -883,6 +919,9 @@ void bpf_local_storage_map_free(struct bpf_map *map,
synchronize_rcu();
if (smap->bpf_ma) {
+ rcu_barrier_tasks_trace();
+ if (!rcu_trace_implies_rcu_gp())
+ rcu_barrier();
bpf_mem_alloc_destroy(&smap->selem_ma);
bpf_mem_alloc_destroy(&smap->storage_ma);
}
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 68240c3c6e7d..0a59df1c550a 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -11,7 +11,6 @@
#include <linux/lsm_hooks.h>
#include <linux/bpf_lsm.h>
#include <linux/kallsyms.h>
-#include <linux/bpf_verifier.h>
#include <net/bpf_sk_storage.h>
#include <linux/bpf_local_storage.h>
#include <linux/btf_ids.h>
@@ -36,6 +35,24 @@ BTF_SET_START(bpf_lsm_hooks)
#undef LSM_HOOK
BTF_SET_END(bpf_lsm_hooks)
+BTF_SET_START(bpf_lsm_disabled_hooks)
+BTF_ID(func, bpf_lsm_vm_enough_memory)
+BTF_ID(func, bpf_lsm_inode_need_killpriv)
+BTF_ID(func, bpf_lsm_inode_getsecurity)
+BTF_ID(func, bpf_lsm_inode_listsecurity)
+BTF_ID(func, bpf_lsm_inode_copy_up_xattr)
+BTF_ID(func, bpf_lsm_getselfattr)
+BTF_ID(func, bpf_lsm_getprocattr)
+BTF_ID(func, bpf_lsm_setprocattr)
+#ifdef CONFIG_KEYS
+BTF_ID(func, bpf_lsm_key_getsecurity)
+#endif
+#ifdef CONFIG_AUDIT
+BTF_ID(func, bpf_lsm_audit_rule_match)
+#endif
+BTF_ID(func, bpf_lsm_ismaclabel)
+BTF_SET_END(bpf_lsm_disabled_hooks)
+
/* List of LSM hooks that should operate on 'current' cgroup regardless
* of function signature.
*/
@@ -97,15 +114,24 @@ void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
const struct bpf_prog *prog)
{
+ u32 btf_id = prog->aux->attach_btf_id;
+ const char *func_name = prog->aux->attach_func_name;
+
if (!prog->gpl_compatible) {
bpf_log(vlog,
"LSM programs must have a GPL compatible license\n");
return -EINVAL;
}
- if (!btf_id_set_contains(&bpf_lsm_hooks, prog->aux->attach_btf_id)) {
+ if (btf_id_set_contains(&bpf_lsm_disabled_hooks, btf_id)) {
+ bpf_log(vlog, "attach_btf_id %u points to disabled hook %s\n",
+ btf_id, func_name);
+ return -EINVAL;
+ }
+
+ if (!btf_id_set_contains(&bpf_lsm_hooks, btf_id)) {
bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n",
- prog->aux->attach_btf_id, prog->aux->attach_func_name);
+ btf_id, func_name);
return -EINVAL;
}
@@ -280,6 +306,7 @@ BTF_ID(func, bpf_lsm_cred_prepare)
BTF_ID(func, bpf_lsm_file_ioctl)
BTF_ID(func, bpf_lsm_file_lock)
BTF_ID(func, bpf_lsm_file_open)
+BTF_ID(func, bpf_lsm_file_post_open)
BTF_ID(func, bpf_lsm_file_receive)
BTF_ID(func, bpf_lsm_inode_create)
@@ -289,7 +316,9 @@ BTF_ID(func, bpf_lsm_inode_getxattr)
BTF_ID(func, bpf_lsm_inode_mknod)
BTF_ID(func, bpf_lsm_inode_need_killpriv)
BTF_ID(func, bpf_lsm_inode_post_setxattr)
+BTF_ID(func, bpf_lsm_inode_post_removexattr)
BTF_ID(func, bpf_lsm_inode_readlink)
+BTF_ID(func, bpf_lsm_inode_removexattr)
BTF_ID(func, bpf_lsm_inode_rename)
BTF_ID(func, bpf_lsm_inode_rmdir)
BTF_ID(func, bpf_lsm_inode_setattr)
@@ -312,10 +341,6 @@ BTF_ID(func, bpf_lsm_path_chmod)
BTF_ID(func, bpf_lsm_path_chown)
#endif /* CONFIG_SECURITY_PATH */
-#ifdef CONFIG_KEYS
-BTF_ID(func, bpf_lsm_key_free)
-#endif /* CONFIG_KEYS */
-
BTF_ID(func, bpf_lsm_mmap_file)
BTF_ID(func, bpf_lsm_netlink_send)
BTF_ID(func, bpf_lsm_path_notify)
@@ -352,8 +377,6 @@ BTF_ID(func, bpf_lsm_socket_socketpair)
BTF_ID(func, bpf_lsm_syslog)
BTF_ID(func, bpf_lsm_task_alloc)
-BTF_ID(func, bpf_lsm_current_getsecid_subj)
-BTF_ID(func, bpf_lsm_task_getsecid_obj)
BTF_ID(func, bpf_lsm_task_prctl)
BTF_ID(func, bpf_lsm_task_setscheduler)
BTF_ID(func, bpf_lsm_task_to_inode)
@@ -389,3 +412,36 @@ const struct bpf_verifier_ops lsm_verifier_ops = {
.get_func_proto = bpf_lsm_func_proto,
.is_valid_access = btf_ctx_access,
};
+
+/* hooks return 0 or 1 */
+BTF_SET_START(bool_lsm_hooks)
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+BTF_ID(func, bpf_lsm_xfrm_state_pol_flow_match)
+#endif
+#ifdef CONFIG_AUDIT
+BTF_ID(func, bpf_lsm_audit_rule_known)
+#endif
+BTF_ID(func, bpf_lsm_inode_xattr_skipcap)
+BTF_SET_END(bool_lsm_hooks)
+
+int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
+ struct bpf_retval_range *retval_range)
+{
+ /* no return value range for void hooks */
+ if (!prog->aux->attach_func_proto->type)
+ return -EINVAL;
+
+ if (btf_id_set_contains(&bool_lsm_hooks, prog->aux->attach_btf_id)) {
+ retval_range->minval = 0;
+ retval_range->maxval = 1;
+ } else {
+ /* All other available LSM hooks, except task_prctl, return 0
+ * on success and negative error code on failure.
+ * To keep things simple, we only allow bpf progs to return 0
+ * or negative errno for task_prctl too.
+ */
+ retval_range->minval = -MAX_ERRNO;
+ retval_range->maxval = 0;
+ }
+ return 0;
+}
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 43356faaa057..96113633e391 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -12,6 +12,7 @@
#include <linux/mutex.h>
#include <linux/btf_ids.h>
#include <linux/rcupdate_wait.h>
+#include <linux/poll.h>
struct bpf_struct_ops_value {
struct bpf_struct_ops_common_value common;
@@ -22,7 +23,6 @@ struct bpf_struct_ops_value {
struct bpf_struct_ops_map {
struct bpf_map map;
- struct rcu_head rcu;
const struct bpf_struct_ops_desc *st_ops_desc;
/* protect map_update */
struct mutex lock;
@@ -31,7 +31,9 @@ struct bpf_struct_ops_map {
* (in kvalue.data).
*/
struct bpf_link **links;
- u32 links_cnt;
+ /* ksyms for bpf trampolines */
+ struct bpf_ksym **ksyms;
+ u32 funcs_cnt;
u32 image_pages_cnt;
/* image_pages is an array of pages that has all the trampolines
* that stores the func args before calling the bpf_prog.
@@ -56,6 +58,7 @@ struct bpf_struct_ops_map {
struct bpf_struct_ops_link {
struct bpf_link link;
struct bpf_map __rcu *map;
+ wait_queue_head_t wait_hup;
};
static DEFINE_MUTEX(update_mutex);
@@ -143,39 +146,7 @@ void bpf_struct_ops_image_free(void *image)
}
#define MAYBE_NULL_SUFFIX "__nullable"
-#define MAX_STUB_NAME 128
-
-/* Return the type info of a stub function, if it exists.
- *
- * The name of a stub function is made up of the name of the struct_ops and
- * the name of the function pointer member, separated by "__". For example,
- * if the struct_ops type is named "foo_ops" and the function pointer
- * member is named "bar", the stub function name would be "foo_ops__bar".
- */
-static const struct btf_type *
-find_stub_func_proto(const struct btf *btf, const char *st_op_name,
- const char *member_name)
-{
- char stub_func_name[MAX_STUB_NAME];
- const struct btf_type *func_type;
- s32 btf_id;
- int cp;
-
- cp = snprintf(stub_func_name, MAX_STUB_NAME, "%s__%s",
- st_op_name, member_name);
- if (cp >= MAX_STUB_NAME) {
- pr_warn("Stub function name too long\n");
- return NULL;
- }
- btf_id = btf_find_by_name_kind(btf, stub_func_name, BTF_KIND_FUNC);
- if (btf_id < 0)
- return NULL;
- func_type = btf_type_by_id(btf, btf_id);
- if (!func_type)
- return NULL;
-
- return btf_type_by_id(btf, func_type->type); /* FUNC_PROTO */
-}
+#define REFCOUNTED_SUFFIX "__ref"
/* Prepare argument info for every nullable argument of a member of a
* struct_ops type.
@@ -200,27 +171,44 @@ find_stub_func_proto(const struct btf *btf, const char *st_op_name,
static int prepare_arg_info(struct btf *btf,
const char *st_ops_name,
const char *member_name,
- const struct btf_type *func_proto,
+ const struct btf_type *func_proto, void *stub_func_addr,
struct bpf_struct_ops_arg_info *arg_info)
{
const struct btf_type *stub_func_proto, *pointed_type;
+ bool is_nullable = false, is_refcounted = false;
const struct btf_param *stub_args, *args;
struct bpf_ctx_arg_aux *info, *info_buf;
u32 nargs, arg_no, info_cnt = 0;
+ char ksym[KSYM_SYMBOL_LEN];
+ const char *stub_fname;
+ const char *suffix;
+ s32 stub_func_id;
u32 arg_btf_id;
int offset;
- stub_func_proto = find_stub_func_proto(btf, st_ops_name, member_name);
- if (!stub_func_proto)
- return 0;
+ stub_fname = kallsyms_lookup((unsigned long)stub_func_addr, NULL, NULL, NULL, ksym);
+ if (!stub_fname) {
+ pr_warn("Cannot find the stub function name for the %s in struct %s\n",
+ member_name, st_ops_name);
+ return -ENOENT;
+ }
+
+ stub_func_id = btf_find_by_name_kind(btf, stub_fname, BTF_KIND_FUNC);
+ if (stub_func_id < 0) {
+ pr_warn("Cannot find the stub function %s in btf\n", stub_fname);
+ return -ENOENT;
+ }
+
+ stub_func_proto = btf_type_by_id(btf, stub_func_id);
+ stub_func_proto = btf_type_by_id(btf, stub_func_proto->type);
/* Check if the number of arguments of the stub function is the same
* as the number of arguments of the function pointer.
*/
nargs = btf_type_vlen(func_proto);
if (nargs != btf_type_vlen(stub_func_proto)) {
- pr_warn("the number of arguments of the stub function %s__%s does not match the number of arguments of the member %s of struct %s\n",
- st_ops_name, member_name, member_name, st_ops_name);
+ pr_warn("the number of arguments of the stub function %s does not match the number of arguments of the member %s of struct %s\n",
+ stub_fname, member_name, st_ops_name);
return -EINVAL;
}
@@ -238,10 +226,18 @@ static int prepare_arg_info(struct btf *btf,
info = info_buf;
for (arg_no = 0; arg_no < nargs; arg_no++) {
/* Skip arguments that is not suffixed with
- * "__nullable".
+ * "__nullable or __ref".
*/
- if (!btf_param_match_suffix(btf, &stub_args[arg_no],
- MAYBE_NULL_SUFFIX))
+ is_nullable = btf_param_match_suffix(btf, &stub_args[arg_no],
+ MAYBE_NULL_SUFFIX);
+ is_refcounted = btf_param_match_suffix(btf, &stub_args[arg_no],
+ REFCOUNTED_SUFFIX);
+
+ if (is_nullable)
+ suffix = MAYBE_NULL_SUFFIX;
+ else if (is_refcounted)
+ suffix = REFCOUNTED_SUFFIX;
+ else
continue;
/* Should be a pointer to struct */
@@ -250,30 +246,34 @@ static int prepare_arg_info(struct btf *btf,
&arg_btf_id);
if (!pointed_type ||
!btf_type_is_struct(pointed_type)) {
- pr_warn("stub function %s__%s has %s tagging to an unsupported type\n",
- st_ops_name, member_name, MAYBE_NULL_SUFFIX);
+ pr_warn("stub function %s has %s tagging to an unsupported type\n",
+ stub_fname, suffix);
goto err_out;
}
offset = btf_ctx_arg_offset(btf, func_proto, arg_no);
if (offset < 0) {
- pr_warn("stub function %s__%s has an invalid trampoline ctx offset for arg#%u\n",
- st_ops_name, member_name, arg_no);
+ pr_warn("stub function %s has an invalid trampoline ctx offset for arg#%u\n",
+ stub_fname, arg_no);
goto err_out;
}
if (args[arg_no].type != stub_args[arg_no].type) {
- pr_warn("arg#%u type in stub function %s__%s does not match with its original func_proto\n",
- arg_no, st_ops_name, member_name);
+ pr_warn("arg#%u type in stub function %s does not match with its original func_proto\n",
+ arg_no, stub_fname);
goto err_out;
}
/* Fill the information of the new argument */
- info->reg_type =
- PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL;
info->btf_id = arg_btf_id;
info->btf = btf;
info->offset = offset;
+ if (is_nullable) {
+ info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL;
+ } else if (is_refcounted) {
+ info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID;
+ info->refcounted = true;
+ }
info++;
info_cnt++;
@@ -307,6 +307,27 @@ void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc)
kfree(arg_info);
}
+static bool is_module_member(const struct btf *btf, u32 id)
+{
+ const struct btf_type *t;
+
+ t = btf_type_resolve_ptr(btf, id, NULL);
+ if (!t)
+ return false;
+
+ if (!__btf_type_is_struct(t) && !btf_type_is_fwd(t))
+ return false;
+
+ return !strcmp(btf_name_by_offset(btf, t->name_off), "module");
+}
+
+int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff)
+{
+ void *func_ptr = *(void **)(st_ops->cfi_stubs + moff);
+
+ return func_ptr ? 0 : -ENOTSUPP;
+}
+
int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
struct btf *btf,
struct bpf_verifier_log *log)
@@ -369,8 +390,11 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
st_ops_desc->value_type = btf_type_by_id(btf, value_id);
for_each_member(i, t, member) {
- const struct btf_type *func_proto;
+ const struct btf_type *func_proto, *ret_type;
+ void **stub_func_addr;
+ u32 moff;
+ moff = __btf_member_bit_offset(t, member) / 8;
mname = btf_name_by_offset(btf, member->name_off);
if (!*mname) {
pr_warn("anon member in struct %s is not supported\n",
@@ -386,12 +410,33 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
goto errout;
}
+ if (!st_ops_ids[IDX_MODULE_ID] && is_module_member(btf, member->type)) {
+ pr_warn("'struct module' btf id not found. Is CONFIG_MODULES enabled? bpf_struct_ops '%s' needs module support.\n",
+ st_ops->name);
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+
func_proto = btf_type_resolve_func_ptr(btf,
member->type,
NULL);
- if (!func_proto)
+
+ /* The member is not a function pointer or
+ * the function pointer is not supported.
+ */
+ if (!func_proto || bpf_struct_ops_supported(st_ops, moff))
continue;
+ if (func_proto->type) {
+ ret_type = btf_type_resolve_ptr(btf, func_proto->type, NULL);
+ if (ret_type && !__btf_type_is_struct(ret_type)) {
+ pr_warn("func ptr %s in struct %s returns non-struct pointer, which is not supported\n",
+ mname, st_ops->name);
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+ }
+
if (btf_distill_func_proto(log, btf,
func_proto, mname,
&st_ops->func_models[i])) {
@@ -401,8 +446,9 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
goto errout;
}
+ stub_func_addr = *(void **)(st_ops->cfi_stubs + moff);
err = prepare_arg_info(btf, st_ops->name, mname,
- func_proto,
+ func_proto, stub_func_addr,
arg_info + i);
if (err)
goto errout;
@@ -479,11 +525,11 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
{
u32 i;
- for (i = 0; i < st_map->links_cnt; i++) {
- if (st_map->links[i]) {
- bpf_link_put(st_map->links[i]);
- st_map->links[i] = NULL;
- }
+ for (i = 0; i < st_map->funcs_cnt; i++) {
+ if (!st_map->links[i])
+ break;
+ bpf_link_put(st_map->links[i]);
+ st_map->links[i] = NULL;
}
}
@@ -555,7 +601,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
if (model->ret_size > 0)
flags |= BPF_TRAMP_F_RET_FENTRY_RET;
- size = arch_bpf_trampoline_size(model, flags, tlinks, NULL);
+ size = arch_bpf_trampoline_size(model, flags, tlinks, stub_func);
if (size <= 0)
return size ? : -EFAULT;
@@ -571,7 +617,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
}
size = arch_prepare_bpf_trampoline(NULL, image + image_off,
- image + PAGE_SIZE,
+ image + image_off + size,
model, flags, tlinks, stub_func);
if (size <= 0) {
if (image != *_image)
@@ -584,6 +630,49 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
return 0;
}
+static void bpf_struct_ops_ksym_init(const char *tname, const char *mname,
+ void *image, unsigned int size,
+ struct bpf_ksym *ksym)
+{
+ snprintf(ksym->name, KSYM_NAME_LEN, "bpf__%s_%s", tname, mname);
+ INIT_LIST_HEAD_RCU(&ksym->lnode);
+ bpf_image_ksym_init(image, size, ksym);
+}
+
+static void bpf_struct_ops_map_add_ksyms(struct bpf_struct_ops_map *st_map)
+{
+ u32 i;
+
+ for (i = 0; i < st_map->funcs_cnt; i++) {
+ if (!st_map->ksyms[i])
+ break;
+ bpf_image_ksym_add(st_map->ksyms[i]);
+ }
+}
+
+static void bpf_struct_ops_map_del_ksyms(struct bpf_struct_ops_map *st_map)
+{
+ u32 i;
+
+ for (i = 0; i < st_map->funcs_cnt; i++) {
+ if (!st_map->ksyms[i])
+ break;
+ bpf_image_ksym_del(st_map->ksyms[i]);
+ }
+}
+
+static void bpf_struct_ops_map_free_ksyms(struct bpf_struct_ops_map *st_map)
+{
+ u32 i;
+
+ for (i = 0; i < st_map->funcs_cnt; i++) {
+ if (!st_map->ksyms[i])
+ break;
+ kfree(st_map->ksyms[i]);
+ st_map->ksyms[i] = NULL;
+ }
+}
+
static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 flags)
{
@@ -599,6 +688,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
int prog_fd, err;
u32 i, trampoline_start, image_off = 0;
void *cur_image = NULL, *image = NULL;
+ struct bpf_link **plink;
+ struct bpf_ksym **pksym;
+ const char *tname, *mname;
if (flags)
return -EINVAL;
@@ -637,14 +729,19 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
udata = &uvalue->data;
kdata = &kvalue->data;
+ plink = st_map->links;
+ pksym = st_map->ksyms;
+ tname = btf_name_by_offset(st_map->btf, t->name_off);
module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]);
for_each_member(i, t, member) {
const struct btf_type *mtype, *ptype;
struct bpf_prog *prog;
struct bpf_tramp_link *link;
+ struct bpf_ksym *ksym;
u32 moff;
moff = __btf_member_bit_offset(t, member) / 8;
+ mname = btf_name_by_offset(st_map->btf, member->name_off);
ptype = btf_type_resolve_ptr(st_map->btf, member->type, NULL);
if (ptype == module_type) {
if (*(void **)(udata + moff))
@@ -712,7 +809,14 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
}
bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS,
&bpf_struct_ops_link_lops, prog);
- st_map->links[i] = &link->link;
+ *plink++ = &link->link;
+
+ ksym = kzalloc(sizeof(*ksym), GFP_USER);
+ if (!ksym) {
+ err = -ENOMEM;
+ goto reset_unlock;
+ }
+ *pksym++ = ksym;
trampoline_start = image_off;
err = bpf_struct_ops_prepare_trampoline(tlinks, link,
@@ -728,13 +832,17 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
cur_image = image;
trampoline_start = 0;
}
- if (err < 0)
- goto reset_unlock;
*(void **)(kdata + moff) = image + trampoline_start + cfi_get_offset();
/* put prog_id to udata */
*(unsigned long *)(udata + moff) = prog->aux->id;
+
+ /* init ksym for this trampoline */
+ bpf_struct_ops_ksym_init(tname, mname,
+ image + trampoline_start,
+ image_off - trampoline_start,
+ ksym);
}
if (st_ops->validate) {
@@ -742,8 +850,12 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
if (err)
goto reset_unlock;
}
- for (i = 0; i < st_map->image_pages_cnt; i++)
- arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE);
+ for (i = 0; i < st_map->image_pages_cnt; i++) {
+ err = arch_protect_bpf_trampoline(st_map->image_pages[i],
+ PAGE_SIZE);
+ if (err)
+ goto reset_unlock;
+ }
if (st_map->map.map_flags & BPF_F_LINK) {
err = 0;
@@ -755,7 +867,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
goto unlock;
}
- err = st_ops->reg(kdata);
+ err = st_ops->reg(kdata, NULL);
if (likely(!err)) {
/* This refcnt increment on the map here after
* 'st_ops->reg()' is secure since the state of the
@@ -779,6 +891,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
*/
reset_unlock:
+ bpf_struct_ops_map_free_ksyms(st_map);
bpf_struct_ops_map_free_image(st_map);
bpf_struct_ops_map_put_progs(st_map);
memset(uvalue, 0, map->value_size);
@@ -786,6 +899,8 @@ reset_unlock:
unlock:
kfree(tlinks);
mutex_unlock(&st_map->lock);
+ if (!err)
+ bpf_struct_ops_map_add_ksyms(st_map);
return err;
}
@@ -803,7 +918,7 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
BPF_STRUCT_OPS_STATE_TOBEFREE);
switch (prev_state) {
case BPF_STRUCT_OPS_STATE_INUSE:
- st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data);
+ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, NULL);
bpf_map_put(map);
return 0;
case BPF_STRUCT_OPS_STATE_TOBEFREE:
@@ -833,7 +948,7 @@ static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
btf_type_seq_show(st_map->btf,
map->btf_vmlinux_value_type_id,
value, m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
}
kfree(value);
@@ -845,7 +960,10 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map)
if (st_map->links)
bpf_struct_ops_map_put_progs(st_map);
+ if (st_map->ksyms)
+ bpf_struct_ops_map_free_ksyms(st_map);
bpf_map_area_free(st_map->links);
+ bpf_map_area_free(st_map->ksyms);
bpf_struct_ops_map_free_image(st_map);
bpf_map_area_free(st_map->uvalue);
bpf_map_area_free(st_map);
@@ -862,6 +980,8 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
if (btf_is_module(st_map->btf))
module_put(st_map->st_ops_desc->st_ops->owner);
+ bpf_struct_ops_map_del_ksyms(st_map);
+
/* The struct_ops's function may switch to another struct_ops.
*
* For example, bpf_tcp_cc_x->init() may switch to
@@ -891,6 +1011,19 @@ static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
return 0;
}
+static u32 count_func_ptrs(const struct btf *btf, const struct btf_type *t)
+{
+ int i;
+ u32 count;
+ const struct btf_member *member;
+
+ count = 0;
+ for_each_member(i, t, member)
+ if (btf_type_resolve_func_ptr(btf, member->type, NULL))
+ count++;
+ return count;
+}
+
static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
{
const struct bpf_struct_ops_desc *st_ops_desc;
@@ -957,11 +1090,15 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
map = &st_map->map;
st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
- st_map->links_cnt = btf_type_vlen(t);
+ st_map->funcs_cnt = count_func_ptrs(btf, t);
st_map->links =
- bpf_map_area_alloc(st_map->links_cnt * sizeof(struct bpf_links *),
+ bpf_map_area_alloc(st_map->funcs_cnt * sizeof(struct bpf_link *),
+ NUMA_NO_NODE);
+
+ st_map->ksyms =
+ bpf_map_area_alloc(st_map->funcs_cnt * sizeof(struct bpf_ksym *),
NUMA_NO_NODE);
- if (!st_map->uvalue || !st_map->links) {
+ if (!st_map->uvalue || !st_map->links || !st_map->ksyms) {
ret = -ENOMEM;
goto errout_free;
}
@@ -990,7 +1127,8 @@ static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map)
usage = sizeof(*st_map) +
vt->size - sizeof(struct bpf_struct_ops_value);
usage += vt->size;
- usage += btf_type_vlen(vt) * sizeof(struct bpf_links *);
+ usage += st_map->funcs_cnt * sizeof(struct bpf_link *);
+ usage += st_map->funcs_cnt * sizeof(struct bpf_ksym *);
usage += PAGE_SIZE;
return usage;
}
@@ -1055,10 +1193,7 @@ static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
st_map = (struct bpf_struct_ops_map *)
rcu_dereference_protected(st_link->map, true);
if (st_map) {
- /* st_link->map can be NULL if
- * bpf_struct_ops_link_create() fails to register.
- */
- st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data);
+ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link);
bpf_map_put(&st_map->map);
}
kfree(st_link);
@@ -1073,7 +1208,8 @@ static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
st_link = container_of(link, struct bpf_struct_ops_link, link);
rcu_read_lock();
map = rcu_dereference(st_link->map);
- seq_printf(seq, "map_id:\t%d\n", map->id);
+ if (map)
+ seq_printf(seq, "map_id:\t%d\n", map->id);
rcu_read_unlock();
}
@@ -1086,7 +1222,8 @@ static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
st_link = container_of(link, struct bpf_struct_ops_link, link);
rcu_read_lock();
map = rcu_dereference(st_link->map);
- info->struct_ops.map_id = map->id;
+ if (map)
+ info->struct_ops.map_id = map->id;
rcu_read_unlock();
return 0;
}
@@ -1111,6 +1248,10 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
mutex_lock(&update_mutex);
old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
+ if (!old_map) {
+ err = -ENOLINK;
+ goto err_out;
+ }
if (expected_old_map && old_map != expected_old_map) {
err = -EPERM;
goto err_out;
@@ -1123,7 +1264,7 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
goto err_out;
}
- err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data);
+ err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data, link);
if (err)
goto err_out;
@@ -1137,11 +1278,53 @@ err_out:
return err;
}
+static int bpf_struct_ops_map_link_detach(struct bpf_link *link)
+{
+ struct bpf_struct_ops_link *st_link = container_of(link, struct bpf_struct_ops_link, link);
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *map;
+
+ mutex_lock(&update_mutex);
+
+ map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
+ if (!map) {
+ mutex_unlock(&update_mutex);
+ return 0;
+ }
+ st_map = container_of(map, struct bpf_struct_ops_map, map);
+
+ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link);
+
+ RCU_INIT_POINTER(st_link->map, NULL);
+ /* Pair with bpf_map_get() in bpf_struct_ops_link_create() or
+ * bpf_map_inc() in bpf_struct_ops_map_link_update().
+ */
+ bpf_map_put(&st_map->map);
+
+ mutex_unlock(&update_mutex);
+
+ wake_up_interruptible_poll(&st_link->wait_hup, EPOLLHUP);
+
+ return 0;
+}
+
+static __poll_t bpf_struct_ops_map_link_poll(struct file *file,
+ struct poll_table_struct *pts)
+{
+ struct bpf_struct_ops_link *st_link = file->private_data;
+
+ poll_wait(file, &st_link->wait_hup, pts);
+
+ return rcu_access_pointer(st_link->map) ? 0 : EPOLLHUP;
+}
+
static const struct bpf_link_ops bpf_struct_ops_map_lops = {
.dealloc = bpf_struct_ops_map_link_dealloc,
+ .detach = bpf_struct_ops_map_link_detach,
.show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
.fill_link_info = bpf_struct_ops_map_link_fill_link_info,
.update_map = bpf_struct_ops_map_link_update,
+ .poll = bpf_struct_ops_map_link_poll,
};
int bpf_struct_ops_link_create(union bpf_attr *attr)
@@ -1174,13 +1357,21 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
if (err)
goto err_out;
- err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data);
+ init_waitqueue_head(&link->wait_hup);
+
+ /* Hold the update_mutex such that the subsystem cannot
+ * do link->ops->detach() before the link is fully initialized.
+ */
+ mutex_lock(&update_mutex);
+ err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data, &link->link);
if (err) {
+ mutex_unlock(&update_mutex);
bpf_link_cleanup(&link_primer);
link = NULL;
goto err_out;
}
RCU_INIT_POINTER(link->map, map);
+ mutex_unlock(&update_mutex);
return bpf_link_settle(&link_primer);
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index adf6dfe0ba68..1109475953c0 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -16,7 +16,6 @@
#include <linux/filter.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
-#include <linux/fdtable.h>
#include <linux/rcupdate_trace.h>
DEFINE_BPF_STORAGE_CACHE(task_cache);
@@ -25,22 +24,20 @@ static DEFINE_PER_CPU(int, bpf_task_storage_busy);
static void bpf_task_storage_lock(void)
{
- migrate_disable();
+ cant_migrate();
this_cpu_inc(bpf_task_storage_busy);
}
static void bpf_task_storage_unlock(void)
{
this_cpu_dec(bpf_task_storage_busy);
- migrate_enable();
}
static bool bpf_task_storage_trylock(void)
{
- migrate_disable();
+ cant_migrate();
if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
this_cpu_dec(bpf_task_storage_busy);
- migrate_enable();
return false;
}
return true;
@@ -73,18 +70,19 @@ void bpf_task_storage_free(struct task_struct *task)
{
struct bpf_local_storage *local_storage;
+ migrate_disable();
rcu_read_lock();
local_storage = rcu_dereference(task->bpf_storage);
- if (!local_storage) {
- rcu_read_unlock();
- return;
- }
+ if (!local_storage)
+ goto out;
bpf_task_storage_lock();
bpf_local_storage_destroy(local_storage);
bpf_task_storage_unlock();
+out:
rcu_read_unlock();
+ migrate_enable();
}
static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
@@ -129,6 +127,9 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
struct pid *pid;
int fd, err;
+ if ((map_flags & BPF_F_LOCK) && btf_record_has_field(map->record, BPF_UPTR))
+ return -EOPNOTSUPP;
+
fd = *(int *)key;
pid = pidfd_get_pid(fd, &f_flags);
if (IS_ERR(pid))
@@ -147,7 +148,7 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
bpf_task_storage_lock();
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value, map_flags,
- GFP_ATOMIC);
+ true, GFP_ATOMIC);
bpf_task_storage_unlock();
err = PTR_ERR_OR_ZERO(sdata);
@@ -219,7 +220,7 @@ static void *__bpf_task_storage_get(struct bpf_map *map,
(flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) {
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST, gfp_flags);
+ BPF_NOEXIST, false, gfp_flags);
return IS_ERR(sdata) ? NULL : sdata->data;
}
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 90c4a32d89ff..1d2cf898e21e 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -26,6 +26,7 @@
#include <linux/bsearch.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
+#include <linux/overflow.h>
#include <net/netfilter/nf_bpf_link.h>
@@ -212,12 +213,13 @@ enum btf_kfunc_hook {
BTF_KFUNC_HOOK_TRACING,
BTF_KFUNC_HOOK_SYSCALL,
BTF_KFUNC_HOOK_FMODRET,
- BTF_KFUNC_HOOK_CGROUP_SKB,
+ BTF_KFUNC_HOOK_CGROUP,
BTF_KFUNC_HOOK_SCHED_ACT,
BTF_KFUNC_HOOK_SK_SKB,
BTF_KFUNC_HOOK_SOCKET_FILTER,
BTF_KFUNC_HOOK_LWT,
BTF_KFUNC_HOOK_NETFILTER,
+ BTF_KFUNC_HOOK_KPROBE,
BTF_KFUNC_HOOK_MAX,
};
@@ -273,6 +275,7 @@ struct btf {
u32 start_str_off; /* first string offset (0 for base BTF) */
char name[MODULE_NAME_LEN];
bool kernel_btf;
+ __u32 *base_id_map; /* map from distilled base BTF -> vmlinux BTF ids */
};
enum verifier_phase {
@@ -413,7 +416,7 @@ const char *btf_type_str(const struct btf_type *t)
struct btf_show {
u64 flags;
void *target; /* target of show operation (seq file, buffer) */
- void (*showfn)(struct btf_show *show, const char *fmt, va_list args);
+ __printf(2, 0) void (*showfn)(struct btf_show *show, const char *fmt, va_list args);
const struct btf *btf;
/* below are used during iteration */
struct {
@@ -496,11 +499,6 @@ bool btf_type_is_void(const struct btf_type *t)
return t == &btf_void;
}
-static bool btf_type_is_fwd(const struct btf_type *t)
-{
- return BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
-}
-
static bool btf_type_is_datasec(const struct btf_type *t)
{
return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
@@ -529,6 +527,11 @@ static bool btf_type_is_decl_tag_target(const struct btf_type *t)
btf_type_is_var(t) || btf_type_is_typedef(t);
}
+bool btf_is_vmlinux(const struct btf *btf)
+{
+ return btf->kernel_btf && !btf->base_btf;
+}
+
u32 btf_nr_types(const struct btf *btf)
{
u32 total = 0;
@@ -604,6 +607,7 @@ s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p)
spin_unlock_bh(&btf_idr_lock);
return ret;
}
+EXPORT_SYMBOL_GPL(bpf_find_btf_id);
const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
u32 id, u32 *res_id)
@@ -771,7 +775,7 @@ static bool __btf_name_char_ok(char c, bool first)
return true;
}
-static const char *btf_str_by_offset(const struct btf *btf, u32 offset)
+const char *btf_str_by_offset(const struct btf *btf, u32 offset)
{
while (offset < btf->start_str_off)
btf = btf->base_btf;
@@ -783,7 +787,7 @@ static const char *btf_str_by_offset(const struct btf *btf, u32 offset)
return NULL;
}
-static bool __btf_name_valid(const struct btf *btf, u32 offset)
+static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
{
/* offset must be valid */
const char *src = btf_str_by_offset(btf, offset);
@@ -804,11 +808,6 @@ static bool __btf_name_valid(const struct btf *btf, u32 offset)
return !*src;
}
-static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
-{
- return __btf_name_valid(btf, offset);
-}
-
/* Allow any printable character in DATASEC names */
static bool btf_name_valid_section(const struct btf *btf, u32 offset)
{
@@ -816,9 +815,11 @@ static bool btf_name_valid_section(const struct btf *btf, u32 offset)
const char *src = btf_str_by_offset(btf, offset);
const char *src_limit;
+ if (!*src)
+ return false;
+
/* set a limit on identifier length */
src_limit = src + KSYM_NAME_LEN;
- src++;
while (*src && src < src_limit) {
if (!isprint(*src))
return false;
@@ -1669,14 +1670,8 @@ static void btf_free_kfunc_set_tab(struct btf *btf)
if (!tab)
return;
- /* For module BTF, we directly assign the sets being registered, so
- * there is nothing to free except kfunc_set_tab.
- */
- if (btf_is_module(btf))
- goto free_tab;
for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++)
kfree(tab->sets[hook]);
-free_tab:
kfree(tab);
btf->kfunc_set_tab = NULL;
}
@@ -1734,7 +1729,12 @@ static void btf_free(struct btf *btf)
kvfree(btf->types);
kvfree(btf->resolved_sizes);
kvfree(btf->resolved_ids);
- kvfree(btf->data);
+ /* vmlinux does not allocate btf->data, it simply points it at
+ * __start_BTF.
+ */
+ if (!btf_is_vmlinux(btf))
+ kvfree(btf->data);
+ kvfree(btf->base_id_map);
kfree(btf);
}
@@ -1763,6 +1763,23 @@ void btf_put(struct btf *btf)
}
}
+struct btf *btf_base_btf(const struct btf *btf)
+{
+ return btf->base_btf;
+}
+
+const struct btf_header *btf_header(const struct btf *btf)
+{
+ return &btf->hdr;
+}
+
+void btf_set_base_btf(struct btf *btf, const struct btf *base_btf)
+{
+ btf->base_btf = (struct btf *)base_btf;
+ btf->start_id = btf_nr_types(base_btf);
+ btf->start_str_off = base_btf->hdr.str_len;
+}
+
static int env_resolve_init(struct btf_verifier_env *env)
{
struct btf *btf = env->btf;
@@ -2560,7 +2577,7 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- if (btf_type_kflag(t)) {
+ if (btf_type_kflag(t) && !btf_type_is_type_tag(t)) {
btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
return -EINVAL;
}
@@ -2788,7 +2805,7 @@ static void btf_ref_type_log(struct btf_verifier_env *env,
btf_verifier_log(env, "type_id=%u", t->type);
}
-static struct btf_kind_operations modifier_ops = {
+static const struct btf_kind_operations modifier_ops = {
.check_meta = btf_ref_type_check_meta,
.resolve = btf_modifier_resolve,
.check_member = btf_modifier_check_member,
@@ -2797,7 +2814,7 @@ static struct btf_kind_operations modifier_ops = {
.show = btf_modifier_show,
};
-static struct btf_kind_operations ptr_ops = {
+static const struct btf_kind_operations ptr_ops = {
.check_meta = btf_ref_type_check_meta,
.resolve = btf_ptr_resolve,
.check_member = btf_ptr_check_member,
@@ -2838,7 +2855,7 @@ static void btf_fwd_type_log(struct btf_verifier_env *env,
btf_verifier_log(env, "%s", btf_type_kflag(t) ? "union" : "struct");
}
-static struct btf_kind_operations fwd_ops = {
+static const struct btf_kind_operations fwd_ops = {
.check_meta = btf_fwd_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_df_check_member,
@@ -3089,7 +3106,7 @@ static void btf_array_show(const struct btf *btf, const struct btf_type *t,
__btf_array_show(btf, t, type_id, data, bits_offset, show);
}
-static struct btf_kind_operations array_ops = {
+static const struct btf_kind_operations array_ops = {
.check_meta = btf_array_check_meta,
.resolve = btf_array_resolve,
.check_member = btf_array_check_member,
@@ -3314,9 +3331,11 @@ static int btf_find_struct(const struct btf *btf, const struct btf_type *t,
}
static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
- u32 off, int sz, struct btf_field_info *info)
+ u32 off, int sz, struct btf_field_info *info, u32 field_mask)
{
enum btf_field_type type;
+ const char *tag_value;
+ bool is_type_tag;
u32 res_id;
/* Permit modifiers on the pointer itself */
@@ -3326,21 +3345,27 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
if (!btf_type_is_ptr(t))
return BTF_FIELD_IGNORE;
t = btf_type_by_id(btf, t->type);
-
- if (!btf_type_is_type_tag(t))
+ is_type_tag = btf_type_is_type_tag(t) && !btf_type_kflag(t);
+ if (!is_type_tag)
return BTF_FIELD_IGNORE;
/* Reject extra tags */
if (btf_type_is_type_tag(btf_type_by_id(btf, t->type)))
return -EINVAL;
- if (!strcmp("kptr_untrusted", __btf_name_by_offset(btf, t->name_off)))
+ tag_value = __btf_name_by_offset(btf, t->name_off);
+ if (!strcmp("kptr_untrusted", tag_value))
type = BPF_KPTR_UNREF;
- else if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off)))
+ else if (!strcmp("kptr", tag_value))
type = BPF_KPTR_REF;
- else if (!strcmp("percpu_kptr", __btf_name_by_offset(btf, t->name_off)))
+ else if (!strcmp("percpu_kptr", tag_value))
type = BPF_KPTR_PERCPU;
+ else if (!strcmp("uptr", tag_value))
+ type = BPF_UPTR;
else
return -EINVAL;
+ if (!(type & field_mask))
+ return BTF_FIELD_IGNORE;
+
/* Get the base type */
t = btf_type_skip_modifiers(btf, t->type, &res_id);
/* Only pointer to struct is allowed */
@@ -3441,10 +3466,12 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt,
goto end; \
}
-static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
+static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_type,
+ u32 field_mask, u32 *seen_mask,
int *align, int *sz)
{
int type = 0;
+ const char *name = __btf_name_by_offset(btf, var_type->name_off);
if (field_mask & BPF_SPIN_LOCK) {
if (!strcmp(name, "bpf_spin_lock")) {
@@ -3455,6 +3482,15 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
goto end;
}
}
+ if (field_mask & BPF_RES_SPIN_LOCK) {
+ if (!strcmp(name, "bpf_res_spin_lock")) {
+ if (*seen_mask & BPF_RES_SPIN_LOCK)
+ return -E2BIG;
+ *seen_mask |= BPF_RES_SPIN_LOCK;
+ type = BPF_RES_SPIN_LOCK;
+ goto end;
+ }
+ }
if (field_mask & BPF_TIMER) {
if (!strcmp(name, "bpf_timer")) {
if (*seen_mask & BPF_TIMER)
@@ -3464,6 +3500,15 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
goto end;
}
}
+ if (field_mask & BPF_WORKQUEUE) {
+ if (!strcmp(name, "bpf_wq")) {
+ if (*seen_mask & BPF_WORKQUEUE)
+ return -E2BIG;
+ *seen_mask |= BPF_WORKQUEUE;
+ type = BPF_WORKQUEUE;
+ goto end;
+ }
+ }
field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head");
field_mask_test_name(BPF_LIST_NODE, "bpf_list_node");
field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root");
@@ -3471,7 +3516,7 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
field_mask_test_name(BPF_REFCOUNT, "bpf_refcount");
/* Only return BPF_KPTR when all other types with matchable names fail */
- if (field_mask & BPF_KPTR) {
+ if (field_mask & (BPF_KPTR | BPF_UPTR) && !__btf_type_is_struct(var_type)) {
type = BPF_KPTR_REF;
goto end;
}
@@ -3484,138 +3529,241 @@ end:
#undef field_mask_test_name
+/* Repeat a number of fields for a specified number of times.
+ *
+ * Copy the fields starting from the first field and repeat them for
+ * repeat_cnt times. The fields are repeated by adding the offset of each
+ * field with
+ * (i + 1) * elem_size
+ * where i is the repeat index and elem_size is the size of an element.
+ */
+static int btf_repeat_fields(struct btf_field_info *info, int info_cnt,
+ u32 field_cnt, u32 repeat_cnt, u32 elem_size)
+{
+ u32 i, j;
+ u32 cur;
+
+ /* Ensure not repeating fields that should not be repeated. */
+ for (i = 0; i < field_cnt; i++) {
+ switch (info[i].type) {
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
+ case BPF_LIST_HEAD:
+ case BPF_RB_ROOT:
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ /* The type of struct size or variable size is u32,
+ * so the multiplication will not overflow.
+ */
+ if (field_cnt * (repeat_cnt + 1) > info_cnt)
+ return -E2BIG;
+
+ cur = field_cnt;
+ for (i = 0; i < repeat_cnt; i++) {
+ memcpy(&info[cur], &info[0], field_cnt * sizeof(info[0]));
+ for (j = 0; j < field_cnt; j++)
+ info[cur++].off += (i + 1) * elem_size;
+ }
+
+ return 0;
+}
+
static int btf_find_struct_field(const struct btf *btf,
const struct btf_type *t, u32 field_mask,
- struct btf_field_info *info, int info_cnt)
+ struct btf_field_info *info, int info_cnt,
+ u32 level);
+
+/* Find special fields in the struct type of a field.
+ *
+ * This function is used to find fields of special types that is not a
+ * global variable or a direct field of a struct type. It also handles the
+ * repetition if it is the element type of an array.
+ */
+static int btf_find_nested_struct(const struct btf *btf, const struct btf_type *t,
+ u32 off, u32 nelems,
+ u32 field_mask, struct btf_field_info *info,
+ int info_cnt, u32 level)
{
- int ret, idx = 0, align, sz, field_type;
- const struct btf_member *member;
+ int ret, err, i;
+
+ level++;
+ if (level >= MAX_RESOLVE_DEPTH)
+ return -E2BIG;
+
+ ret = btf_find_struct_field(btf, t, field_mask, info, info_cnt, level);
+
+ if (ret <= 0)
+ return ret;
+
+ /* Shift the offsets of the nested struct fields to the offsets
+ * related to the container.
+ */
+ for (i = 0; i < ret; i++)
+ info[i].off += off;
+
+ if (nelems > 1) {
+ err = btf_repeat_fields(info, info_cnt, ret, nelems - 1, t->size);
+ if (err == 0)
+ ret *= nelems;
+ else
+ ret = err;
+ }
+
+ return ret;
+}
+
+static int btf_find_field_one(const struct btf *btf,
+ const struct btf_type *var,
+ const struct btf_type *var_type,
+ int var_idx,
+ u32 off, u32 expected_size,
+ u32 field_mask, u32 *seen_mask,
+ struct btf_field_info *info, int info_cnt,
+ u32 level)
+{
+ int ret, align, sz, field_type;
struct btf_field_info tmp;
+ const struct btf_array *array;
+ u32 i, nelems = 1;
+
+ /* Walk into array types to find the element type and the number of
+ * elements in the (flattened) array.
+ */
+ for (i = 0; i < MAX_RESOLVE_DEPTH && btf_type_is_array(var_type); i++) {
+ array = btf_array(var_type);
+ nelems *= array->nelems;
+ var_type = btf_type_by_id(btf, array->type);
+ }
+ if (i == MAX_RESOLVE_DEPTH)
+ return -E2BIG;
+ if (nelems == 0)
+ return 0;
+
+ field_type = btf_get_field_type(btf, var_type,
+ field_mask, seen_mask, &align, &sz);
+ /* Look into variables of struct types */
+ if (!field_type && __btf_type_is_struct(var_type)) {
+ sz = var_type->size;
+ if (expected_size && expected_size != sz * nelems)
+ return 0;
+ ret = btf_find_nested_struct(btf, var_type, off, nelems, field_mask,
+ &info[0], info_cnt, level);
+ return ret;
+ }
+
+ if (field_type == 0)
+ return 0;
+ if (field_type < 0)
+ return field_type;
+
+ if (expected_size && expected_size != sz * nelems)
+ return 0;
+ if (off % align)
+ return 0;
+
+ switch (field_type) {
+ case BPF_SPIN_LOCK:
+ case BPF_RES_SPIN_LOCK:
+ case BPF_TIMER:
+ case BPF_WORKQUEUE:
+ case BPF_LIST_NODE:
+ case BPF_RB_NODE:
+ case BPF_REFCOUNT:
+ ret = btf_find_struct(btf, var_type, off, sz, field_type,
+ info_cnt ? &info[0] : &tmp);
+ if (ret < 0)
+ return ret;
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
+ ret = btf_find_kptr(btf, var_type, off, sz,
+ info_cnt ? &info[0] : &tmp, field_mask);
+ if (ret < 0)
+ return ret;
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_RB_ROOT:
+ ret = btf_find_graph_root(btf, var, var_type,
+ var_idx, off, sz,
+ info_cnt ? &info[0] : &tmp,
+ field_type);
+ if (ret < 0)
+ return ret;
+ break;
+ default:
+ return -EFAULT;
+ }
+
+ if (ret == BTF_FIELD_IGNORE)
+ return 0;
+ if (!info_cnt)
+ return -E2BIG;
+ if (nelems > 1) {
+ ret = btf_repeat_fields(info, info_cnt, 1, nelems - 1, sz);
+ if (ret < 0)
+ return ret;
+ }
+ return nelems;
+}
+
+static int btf_find_struct_field(const struct btf *btf,
+ const struct btf_type *t, u32 field_mask,
+ struct btf_field_info *info, int info_cnt,
+ u32 level)
+{
+ int ret, idx = 0;
+ const struct btf_member *member;
u32 i, off, seen_mask = 0;
for_each_member(i, t, member) {
const struct btf_type *member_type = btf_type_by_id(btf,
member->type);
- field_type = btf_get_field_type(__btf_name_by_offset(btf, member_type->name_off),
- field_mask, &seen_mask, &align, &sz);
- if (field_type == 0)
- continue;
- if (field_type < 0)
- return field_type;
-
off = __btf_member_bit_offset(t, member);
if (off % 8)
/* valid C code cannot generate such BTF */
return -EINVAL;
off /= 8;
- if (off % align)
- continue;
-
- switch (field_type) {
- case BPF_SPIN_LOCK:
- case BPF_TIMER:
- case BPF_LIST_NODE:
- case BPF_RB_NODE:
- case BPF_REFCOUNT:
- ret = btf_find_struct(btf, member_type, off, sz, field_type,
- idx < info_cnt ? &info[idx] : &tmp);
- if (ret < 0)
- return ret;
- break;
- case BPF_KPTR_UNREF:
- case BPF_KPTR_REF:
- case BPF_KPTR_PERCPU:
- ret = btf_find_kptr(btf, member_type, off, sz,
- idx < info_cnt ? &info[idx] : &tmp);
- if (ret < 0)
- return ret;
- break;
- case BPF_LIST_HEAD:
- case BPF_RB_ROOT:
- ret = btf_find_graph_root(btf, t, member_type,
- i, off, sz,
- idx < info_cnt ? &info[idx] : &tmp,
- field_type);
- if (ret < 0)
- return ret;
- break;
- default:
- return -EFAULT;
- }
- if (ret == BTF_FIELD_IGNORE)
- continue;
- if (idx >= info_cnt)
- return -E2BIG;
- ++idx;
+ ret = btf_find_field_one(btf, t, member_type, i,
+ off, 0,
+ field_mask, &seen_mask,
+ &info[idx], info_cnt - idx, level);
+ if (ret < 0)
+ return ret;
+ idx += ret;
}
return idx;
}
static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
u32 field_mask, struct btf_field_info *info,
- int info_cnt)
+ int info_cnt, u32 level)
{
- int ret, idx = 0, align, sz, field_type;
+ int ret, idx = 0;
const struct btf_var_secinfo *vsi;
- struct btf_field_info tmp;
u32 i, off, seen_mask = 0;
for_each_vsi(i, t, vsi) {
const struct btf_type *var = btf_type_by_id(btf, vsi->type);
const struct btf_type *var_type = btf_type_by_id(btf, var->type);
- field_type = btf_get_field_type(__btf_name_by_offset(btf, var_type->name_off),
- field_mask, &seen_mask, &align, &sz);
- if (field_type == 0)
- continue;
- if (field_type < 0)
- return field_type;
-
off = vsi->offset;
- if (vsi->size != sz)
- continue;
- if (off % align)
- continue;
-
- switch (field_type) {
- case BPF_SPIN_LOCK:
- case BPF_TIMER:
- case BPF_LIST_NODE:
- case BPF_RB_NODE:
- case BPF_REFCOUNT:
- ret = btf_find_struct(btf, var_type, off, sz, field_type,
- idx < info_cnt ? &info[idx] : &tmp);
- if (ret < 0)
- return ret;
- break;
- case BPF_KPTR_UNREF:
- case BPF_KPTR_REF:
- case BPF_KPTR_PERCPU:
- ret = btf_find_kptr(btf, var_type, off, sz,
- idx < info_cnt ? &info[idx] : &tmp);
- if (ret < 0)
- return ret;
- break;
- case BPF_LIST_HEAD:
- case BPF_RB_ROOT:
- ret = btf_find_graph_root(btf, var, var_type,
- -1, off, sz,
- idx < info_cnt ? &info[idx] : &tmp,
- field_type);
- if (ret < 0)
- return ret;
- break;
- default:
- return -EFAULT;
- }
-
- if (ret == BTF_FIELD_IGNORE)
- continue;
- if (idx >= info_cnt)
- return -E2BIG;
- ++idx;
+ ret = btf_find_field_one(btf, var, var_type, -1, off, vsi->size,
+ field_mask, &seen_mask,
+ &info[idx], info_cnt - idx,
+ level);
+ if (ret < 0)
+ return ret;
+ idx += ret;
}
return idx;
}
@@ -3625,12 +3773,13 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t,
int info_cnt)
{
if (__btf_type_is_struct(t))
- return btf_find_struct_field(btf, t, field_mask, info, info_cnt);
+ return btf_find_struct_field(btf, t, field_mask, info, info_cnt, 0);
else if (btf_type_is_datasec(t))
- return btf_find_datasec_var(btf, t, field_mask, info, info_cnt);
+ return btf_find_datasec_var(btf, t, field_mask, info, info_cnt, 0);
return -EINVAL;
}
+/* Callers have to ensure the life cycle of btf if it is program BTF */
static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
struct btf_field_info *info)
{
@@ -3659,7 +3808,6 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
field->kptr.dtor = NULL;
id = info->kptr.type_id;
kptr_btf = (struct btf *)btf;
- btf_get(kptr_btf);
goto found_dtor;
}
if (id < 0)
@@ -3810,12 +3958,14 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
/* This needs to be kzalloc to zero out padding and unused fields, see
* comment in btf_record_equal.
*/
- rec = kzalloc(offsetof(struct btf_record, fields[cnt]), GFP_KERNEL | __GFP_NOWARN);
+ rec = kzalloc(struct_size(rec, fields, cnt), GFP_KERNEL | __GFP_NOWARN);
if (!rec)
return ERR_PTR(-ENOMEM);
rec->spin_lock_off = -EINVAL;
+ rec->res_spin_lock_off = -EINVAL;
rec->timer_off = -EINVAL;
+ rec->wq_off = -EINVAL;
rec->refcount_off = -EINVAL;
for (i = 0; i < cnt; i++) {
field_type_size = btf_field_type_size(info_arr[i].type);
@@ -3841,11 +3991,21 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
/* Cache offset for faster lookup at runtime */
rec->spin_lock_off = rec->fields[i].offset;
break;
+ case BPF_RES_SPIN_LOCK:
+ WARN_ON_ONCE(rec->spin_lock_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->res_spin_lock_off = rec->fields[i].offset;
+ break;
case BPF_TIMER:
WARN_ON_ONCE(rec->timer_off >= 0);
/* Cache offset for faster lookup at runtime */
rec->timer_off = rec->fields[i].offset;
break;
+ case BPF_WORKQUEUE:
+ WARN_ON_ONCE(rec->wq_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->wq_off = rec->fields[i].offset;
+ break;
case BPF_REFCOUNT:
WARN_ON_ONCE(rec->refcount_off >= 0);
/* Cache offset for faster lookup at runtime */
@@ -3854,6 +4014,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]);
if (ret < 0)
goto end;
@@ -3878,9 +4039,15 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
rec->cnt++;
}
+ if (rec->spin_lock_off >= 0 && rec->res_spin_lock_off >= 0) {
+ ret = -EINVAL;
+ goto end;
+ }
+
/* bpf_{list_head, rb_node} require bpf_spin_lock */
if ((btf_record_has_field(rec, BPF_LIST_HEAD) ||
- btf_record_has_field(rec, BPF_RB_ROOT)) && rec->spin_lock_off < 0) {
+ btf_record_has_field(rec, BPF_RB_ROOT)) &&
+ (rec->spin_lock_off < 0 && rec->res_spin_lock_off < 0)) {
ret = -EINVAL;
goto end;
}
@@ -3913,12 +4080,28 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
* Hence we only need to ensure that bpf_{list_head,rb_root} ownership
* does not form cycles.
*/
- if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_GRAPH_ROOT))
+ if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & (BPF_GRAPH_ROOT | BPF_UPTR)))
return 0;
for (i = 0; i < rec->cnt; i++) {
struct btf_struct_meta *meta;
+ const struct btf_type *t;
u32 btf_id;
+ if (rec->fields[i].type == BPF_UPTR) {
+ /* The uptr only supports pinning one page and cannot
+ * point to a kernel struct
+ */
+ if (btf_is_kernel(rec->fields[i].kptr.btf))
+ return -EINVAL;
+ t = btf_type_by_id(rec->fields[i].kptr.btf,
+ rec->fields[i].kptr.btf_id);
+ if (!t->size)
+ return -EINVAL;
+ if (t->size > PAGE_SIZE)
+ return -E2BIG;
+ continue;
+ }
+
if (!(rec->fields[i].type & BPF_GRAPH_ROOT))
continue;
btf_id = rec->fields[i].graph_root.value_btf_id;
@@ -4054,7 +4237,7 @@ static void btf_struct_show(const struct btf *btf, const struct btf_type *t,
__btf_struct_show(btf, t, type_id, data, bits_offset, show);
}
-static struct btf_kind_operations struct_ops = {
+static const struct btf_kind_operations struct_ops = {
.check_meta = btf_struct_check_meta,
.resolve = btf_struct_resolve,
.check_member = btf_struct_check_member,
@@ -4222,7 +4405,7 @@ static void btf_enum_show(const struct btf *btf, const struct btf_type *t,
btf_show_end_type(show);
}
-static struct btf_kind_operations enum_ops = {
+static const struct btf_kind_operations enum_ops = {
.check_meta = btf_enum_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_enum_check_member,
@@ -4325,7 +4508,7 @@ static void btf_enum64_show(const struct btf *btf, const struct btf_type *t,
btf_show_end_type(show);
}
-static struct btf_kind_operations enum64_ops = {
+static const struct btf_kind_operations enum64_ops = {
.check_meta = btf_enum64_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_enum_check_member,
@@ -4403,7 +4586,7 @@ done:
btf_verifier_log(env, ")");
}
-static struct btf_kind_operations func_proto_ops = {
+static const struct btf_kind_operations func_proto_ops = {
.check_meta = btf_func_proto_check_meta,
.resolve = btf_df_resolve,
/*
@@ -4461,7 +4644,7 @@ static int btf_func_resolve(struct btf_verifier_env *env,
return 0;
}
-static struct btf_kind_operations func_ops = {
+static const struct btf_kind_operations func_ops = {
.check_meta = btf_func_check_meta,
.resolve = btf_func_resolve,
.check_member = btf_df_check_member,
@@ -4495,7 +4678,7 @@ static s32 btf_var_check_meta(struct btf_verifier_env *env,
}
if (!t->name_off ||
- !__btf_name_valid(env->btf, t->name_off)) {
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
@@ -4788,11 +4971,6 @@ static s32 btf_decl_tag_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- if (btf_type_kflag(t)) {
- btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
- return -EINVAL;
- }
-
component_idx = btf_type_decl_tag(t)->component_idx;
if (component_idx < -1) {
btf_verifier_log_type(env, t, "Invalid component_idx");
@@ -5383,36 +5561,72 @@ static const char *alloc_obj_fields[] = {
static struct btf_struct_metas *
btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
{
- union {
- struct btf_id_set set;
- struct {
- u32 _cnt;
- u32 _ids[ARRAY_SIZE(alloc_obj_fields)];
- } _arr;
- } aof;
struct btf_struct_metas *tab = NULL;
+ struct btf_id_set *aof;
int i, n, id, ret;
BUILD_BUG_ON(offsetof(struct btf_id_set, cnt) != 0);
BUILD_BUG_ON(sizeof(struct btf_id_set) != sizeof(u32));
- memset(&aof, 0, sizeof(aof));
+ aof = kmalloc(sizeof(*aof), GFP_KERNEL | __GFP_NOWARN);
+ if (!aof)
+ return ERR_PTR(-ENOMEM);
+ aof->cnt = 0;
+
for (i = 0; i < ARRAY_SIZE(alloc_obj_fields); i++) {
/* Try to find whether this special type exists in user BTF, and
* if so remember its ID so we can easily find it among members
* of structs that we iterate in the next loop.
*/
+ struct btf_id_set *new_aof;
+
id = btf_find_by_name_kind(btf, alloc_obj_fields[i], BTF_KIND_STRUCT);
if (id < 0)
continue;
- aof.set.ids[aof.set.cnt++] = id;
+
+ new_aof = krealloc(aof, struct_size(new_aof, ids, aof->cnt + 1),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!new_aof) {
+ ret = -ENOMEM;
+ goto free_aof;
+ }
+ aof = new_aof;
+ aof->ids[aof->cnt++] = id;
+ }
+
+ n = btf_nr_types(btf);
+ for (i = 1; i < n; i++) {
+ /* Try to find if there are kptrs in user BTF and remember their ID */
+ struct btf_id_set *new_aof;
+ struct btf_field_info tmp;
+ const struct btf_type *t;
+
+ t = btf_type_by_id(btf, i);
+ if (!t) {
+ ret = -EINVAL;
+ goto free_aof;
+ }
+
+ ret = btf_find_kptr(btf, t, 0, 0, &tmp, BPF_KPTR);
+ if (ret != BTF_FIELD_FOUND)
+ continue;
+
+ new_aof = krealloc(aof, struct_size(new_aof, ids, aof->cnt + 1),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!new_aof) {
+ ret = -ENOMEM;
+ goto free_aof;
+ }
+ aof = new_aof;
+ aof->ids[aof->cnt++] = i;
}
- if (!aof.set.cnt)
+ if (!aof->cnt) {
+ kfree(aof);
return NULL;
- sort(&aof.set.ids, aof.set.cnt, sizeof(aof.set.ids[0]), btf_id_cmp_func, NULL);
+ }
+ sort(&aof->ids, aof->cnt, sizeof(aof->ids[0]), btf_id_cmp_func, NULL);
- n = btf_nr_types(btf);
for (i = 1; i < n; i++) {
struct btf_struct_metas *new_tab;
const struct btf_member *member;
@@ -5422,23 +5636,19 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
int j, tab_cnt;
t = btf_type_by_id(btf, i);
- if (!t) {
- ret = -EINVAL;
- goto free;
- }
if (!__btf_type_is_struct(t))
continue;
cond_resched();
for_each_member(j, t, member) {
- if (btf_id_set_contains(&aof.set, member->type))
+ if (btf_id_set_contains(aof, member->type))
goto parse;
}
continue;
parse:
tab_cnt = tab ? tab->cnt : 0;
- new_tab = krealloc(tab, offsetof(struct btf_struct_metas, types[tab_cnt + 1]),
+ new_tab = krealloc(tab, struct_size(new_tab, types, tab_cnt + 1),
GFP_KERNEL | __GFP_NOWARN);
if (!new_tab) {
ret = -ENOMEM;
@@ -5450,8 +5660,9 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
type = &tab->types[tab->cnt];
type->btf_id = i;
- record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
- BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT, t->size);
+ record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
+ BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT |
+ BPF_KPTR, t->size);
/* The record cannot be unset, treat it as an error if so */
if (IS_ERR_OR_NULL(record)) {
ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
@@ -5460,9 +5671,12 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
type->record = record;
tab->cnt++;
}
+ kfree(aof);
return tab;
free:
btf_struct_metas_free(tab);
+free_aof:
+ kfree(aof);
return ERR_PTR(ret);
}
@@ -5642,8 +5856,8 @@ errout_free:
return ERR_PTR(err);
}
-extern char __weak __start_BTF[];
-extern char __weak __stop_BTF[];
+extern char __start_BTF[];
+extern char __stop_BTF[];
extern struct btf *btf_vmlinux;
#define BPF_MAP_TYPE(_id, _ops)
@@ -5708,6 +5922,15 @@ static int find_kern_ctx_type_id(enum bpf_prog_type prog_type)
return ctx_type->type;
}
+bool btf_is_projection_of(const char *pname, const char *tname)
+{
+ if (strcmp(pname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0)
+ return true;
+ if (strcmp(pname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0)
+ return true;
+ return false;
+}
+
bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, enum bpf_prog_type prog_type,
int arg)
@@ -5770,9 +5993,7 @@ again:
* int socket_filter_bpf_prog(struct __sk_buff *skb)
* { // no fields of skb are ever used }
*/
- if (strcmp(ctx_tname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0)
- return true;
- if (strcmp(ctx_tname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0)
+ if (btf_is_projection_of(ctx_tname, tname))
return true;
if (strcmp(ctx_tname, tname)) {
/* bpf_user_pt_regs_t is a typedef, so resolve it to
@@ -5964,19 +6185,14 @@ int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_ty
BTF_ID_LIST(bpf_ctx_convert_btf_id)
BTF_ID(struct, bpf_ctx_convert)
-struct btf *btf_parse_vmlinux(void)
+static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name,
+ void *data, unsigned int data_size)
{
- struct btf_verifier_env *env = NULL;
- struct bpf_verifier_log *log;
struct btf *btf = NULL;
int err;
- env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
- if (!env)
- return ERR_PTR(-ENOMEM);
-
- log = &env->log;
- log->level = BPF_LOG_KERNEL;
+ if (!IS_ENABLED(CONFIG_DEBUG_INFO_BTF))
+ return ERR_PTR(-ENOENT);
btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
if (!btf) {
@@ -5985,10 +6201,10 @@ struct btf *btf_parse_vmlinux(void)
}
env->btf = btf;
- btf->data = __start_BTF;
- btf->data_size = __stop_BTF - __start_BTF;
+ btf->data = data;
+ btf->data_size = data_size;
btf->kernel_btf = true;
- snprintf(btf->name, sizeof(btf->name), "vmlinux");
+ snprintf(btf->name, sizeof(btf->name), "%s", name);
err = btf_parse_hdr(env);
if (err)
@@ -6008,20 +6224,11 @@ struct btf *btf_parse_vmlinux(void)
if (err)
goto errout;
- /* btf_parse_vmlinux() runs under bpf_verifier_lock */
- bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
-
refcount_set(&btf->refcnt, 1);
- err = btf_alloc_id(btf);
- if (err)
- goto errout;
-
- btf_verifier_env_free(env);
return btf;
errout:
- btf_verifier_env_free(env);
if (btf) {
kvfree(btf->types);
kfree(btf);
@@ -6029,19 +6236,61 @@ errout:
return ERR_PTR(err);
}
+struct btf *btf_parse_vmlinux(void)
+{
+ struct btf_verifier_env *env = NULL;
+ struct bpf_verifier_log *log;
+ struct btf *btf;
+ int err;
+
+ env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
+ if (!env)
+ return ERR_PTR(-ENOMEM);
+
+ log = &env->log;
+ log->level = BPF_LOG_KERNEL;
+ btf = btf_parse_base(env, "vmlinux", __start_BTF, __stop_BTF - __start_BTF);
+ if (IS_ERR(btf))
+ goto err_out;
+
+ /* btf_parse_vmlinux() runs under bpf_verifier_lock */
+ bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
+ err = btf_alloc_id(btf);
+ if (err) {
+ btf_free(btf);
+ btf = ERR_PTR(err);
+ }
+err_out:
+ btf_verifier_env_free(env);
+ return btf;
+}
+
+/* If .BTF_ids section was created with distilled base BTF, both base and
+ * split BTF ids will need to be mapped to actual base/split ids for
+ * BTF now that it has been relocated.
+ */
+static __u32 btf_relocate_id(const struct btf *btf, __u32 id)
+{
+ if (!btf->base_btf || !btf->base_id_map)
+ return id;
+ return btf->base_id_map[id];
+}
+
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
-static struct btf *btf_parse_module(const char *module_name, const void *data, unsigned int data_size)
+static struct btf *btf_parse_module(const char *module_name, const void *data,
+ unsigned int data_size, void *base_data,
+ unsigned int base_data_size)
{
+ struct btf *btf = NULL, *vmlinux_btf, *base_btf = NULL;
struct btf_verifier_env *env = NULL;
struct bpf_verifier_log *log;
- struct btf *btf = NULL, *base_btf;
- int err;
+ int err = 0;
- base_btf = bpf_get_btf_vmlinux();
- if (IS_ERR(base_btf))
- return base_btf;
- if (!base_btf)
+ vmlinux_btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(vmlinux_btf))
+ return vmlinux_btf;
+ if (!vmlinux_btf)
return ERR_PTR(-EINVAL);
env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
@@ -6051,6 +6300,16 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u
log = &env->log;
log->level = BPF_LOG_KERNEL;
+ if (base_data) {
+ base_btf = btf_parse_base(env, ".BTF.base", base_data, base_data_size);
+ if (IS_ERR(base_btf)) {
+ err = PTR_ERR(base_btf);
+ goto errout;
+ }
+ } else {
+ base_btf = vmlinux_btf;
+ }
+
btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
if (!btf) {
err = -ENOMEM;
@@ -6064,12 +6323,11 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u
btf->kernel_btf = true;
snprintf(btf->name, sizeof(btf->name), "%s", module_name);
- btf->data = kvmalloc(data_size, GFP_KERNEL | __GFP_NOWARN);
+ btf->data = kvmemdup(data, data_size, GFP_KERNEL | __GFP_NOWARN);
if (!btf->data) {
err = -ENOMEM;
goto errout;
}
- memcpy(btf->data, data, data_size);
btf->data_size = data_size;
err = btf_parse_hdr(env);
@@ -6090,12 +6348,22 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u
if (err)
goto errout;
+ if (base_btf != vmlinux_btf) {
+ err = btf_relocate(btf, vmlinux_btf, &btf->base_id_map);
+ if (err)
+ goto errout;
+ btf_free(base_btf);
+ base_btf = vmlinux_btf;
+ }
+
btf_verifier_env_free(env);
refcount_set(&btf->refcnt, 1);
return btf;
errout:
btf_verifier_env_free(env);
+ if (!IS_ERR(base_btf) && base_btf != vmlinux_btf)
+ btf_free(base_btf);
if (btf) {
kvfree(btf->data);
kvfree(btf->types);
@@ -6116,16 +6384,15 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
return prog->aux->attach_btf;
}
-static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
+static bool is_void_or_int_ptr(struct btf *btf, const struct btf_type *t)
{
/* skip modifiers */
t = btf_type_skip_modifiers(btf, t->type, NULL);
-
- return btf_type_is_int(t);
+ return btf_type_is_void(t) || btf_type_is_int(t);
}
-static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
- int off)
+u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
+ int off)
{
const struct btf_param *args;
const struct btf_type *t;
@@ -6188,6 +6455,203 @@ int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto,
return off;
}
+struct bpf_raw_tp_null_args {
+ const char *func;
+ u64 mask;
+};
+
+static const struct bpf_raw_tp_null_args raw_tp_null_args[] = {
+ /* sched */
+ { "sched_pi_setprio", 0x10 },
+ /* ... from sched_numa_pair_template event class */
+ { "sched_stick_numa", 0x100 },
+ { "sched_swap_numa", 0x100 },
+ /* afs */
+ { "afs_make_fs_call", 0x10 },
+ { "afs_make_fs_calli", 0x10 },
+ { "afs_make_fs_call1", 0x10 },
+ { "afs_make_fs_call2", 0x10 },
+ { "afs_protocol_error", 0x1 },
+ { "afs_flock_ev", 0x10 },
+ /* cachefiles */
+ { "cachefiles_lookup", 0x1 | 0x200 },
+ { "cachefiles_unlink", 0x1 },
+ { "cachefiles_rename", 0x1 },
+ { "cachefiles_prep_read", 0x1 },
+ { "cachefiles_mark_active", 0x1 },
+ { "cachefiles_mark_failed", 0x1 },
+ { "cachefiles_mark_inactive", 0x1 },
+ { "cachefiles_vfs_error", 0x1 },
+ { "cachefiles_io_error", 0x1 },
+ { "cachefiles_ondemand_open", 0x1 },
+ { "cachefiles_ondemand_copen", 0x1 },
+ { "cachefiles_ondemand_close", 0x1 },
+ { "cachefiles_ondemand_read", 0x1 },
+ { "cachefiles_ondemand_cread", 0x1 },
+ { "cachefiles_ondemand_fd_write", 0x1 },
+ { "cachefiles_ondemand_fd_release", 0x1 },
+ /* ext4, from ext4__mballoc event class */
+ { "ext4_mballoc_discard", 0x10 },
+ { "ext4_mballoc_free", 0x10 },
+ /* fib */
+ { "fib_table_lookup", 0x100 },
+ /* filelock */
+ /* ... from filelock_lock event class */
+ { "posix_lock_inode", 0x10 },
+ { "fcntl_setlk", 0x10 },
+ { "locks_remove_posix", 0x10 },
+ { "flock_lock_inode", 0x10 },
+ /* ... from filelock_lease event class */
+ { "break_lease_noblock", 0x10 },
+ { "break_lease_block", 0x10 },
+ { "break_lease_unblock", 0x10 },
+ { "generic_delete_lease", 0x10 },
+ { "time_out_leases", 0x10 },
+ /* host1x */
+ { "host1x_cdma_push_gather", 0x10000 },
+ /* huge_memory */
+ { "mm_khugepaged_scan_pmd", 0x10 },
+ { "mm_collapse_huge_page_isolate", 0x1 },
+ { "mm_khugepaged_scan_file", 0x10 },
+ { "mm_khugepaged_collapse_file", 0x10 },
+ /* kmem */
+ { "mm_page_alloc", 0x1 },
+ { "mm_page_pcpu_drain", 0x1 },
+ /* .. from mm_page event class */
+ { "mm_page_alloc_zone_locked", 0x1 },
+ /* netfs */
+ { "netfs_failure", 0x10 },
+ /* power */
+ { "device_pm_callback_start", 0x10 },
+ /* qdisc */
+ { "qdisc_dequeue", 0x1000 },
+ /* rxrpc */
+ { "rxrpc_recvdata", 0x1 },
+ { "rxrpc_resend", 0x10 },
+ { "rxrpc_tq", 0x10 },
+ { "rxrpc_client", 0x1 },
+ /* skb */
+ {"kfree_skb", 0x1000},
+ /* sunrpc */
+ { "xs_stream_read_data", 0x1 },
+ /* ... from xprt_cong_event event class */
+ { "xprt_reserve_cong", 0x10 },
+ { "xprt_release_cong", 0x10 },
+ { "xprt_get_cong", 0x10 },
+ { "xprt_put_cong", 0x10 },
+ /* tcp */
+ { "tcp_send_reset", 0x11 },
+ { "tcp_sendmsg_locked", 0x100 },
+ /* tegra_apb_dma */
+ { "tegra_dma_tx_status", 0x100 },
+ /* timer_migration */
+ { "tmigr_update_events", 0x1 },
+ /* writeback, from writeback_folio_template event class */
+ { "writeback_dirty_folio", 0x10 },
+ { "folio_wait_writeback", 0x10 },
+ /* rdma */
+ { "mr_integ_alloc", 0x2000 },
+ /* bpf_testmod */
+ { "bpf_testmod_test_read", 0x0 },
+ /* amdgpu */
+ { "amdgpu_vm_bo_map", 0x1 },
+ { "amdgpu_vm_bo_unmap", 0x1 },
+ /* netfs */
+ { "netfs_folioq", 0x1 },
+ /* xfs from xfs_defer_pending_class */
+ { "xfs_defer_create_intent", 0x1 },
+ { "xfs_defer_cancel_list", 0x1 },
+ { "xfs_defer_pending_finish", 0x1 },
+ { "xfs_defer_pending_abort", 0x1 },
+ { "xfs_defer_relog_intent", 0x1 },
+ { "xfs_defer_isolate_paused", 0x1 },
+ { "xfs_defer_item_pause", 0x1 },
+ { "xfs_defer_item_unpause", 0x1 },
+ /* xfs from xfs_defer_pending_item_class */
+ { "xfs_defer_add_item", 0x1 },
+ { "xfs_defer_cancel_item", 0x1 },
+ { "xfs_defer_finish_item", 0x1 },
+ /* xfs from xfs_icwalk_class */
+ { "xfs_ioc_free_eofblocks", 0x10 },
+ { "xfs_blockgc_free_space", 0x10 },
+ /* xfs from xfs_btree_cur_class */
+ { "xfs_btree_updkeys", 0x100 },
+ { "xfs_btree_overlapped_query_range", 0x100 },
+ /* xfs from xfs_imap_class*/
+ { "xfs_map_blocks_found", 0x10000 },
+ { "xfs_map_blocks_alloc", 0x10000 },
+ { "xfs_iomap_alloc", 0x1000 },
+ { "xfs_iomap_found", 0x1000 },
+ /* xfs from xfs_fs_class */
+ { "xfs_inodegc_flush", 0x1 },
+ { "xfs_inodegc_push", 0x1 },
+ { "xfs_inodegc_start", 0x1 },
+ { "xfs_inodegc_stop", 0x1 },
+ { "xfs_inodegc_queue", 0x1 },
+ { "xfs_inodegc_throttle", 0x1 },
+ { "xfs_fs_sync_fs", 0x1 },
+ { "xfs_blockgc_start", 0x1 },
+ { "xfs_blockgc_stop", 0x1 },
+ { "xfs_blockgc_worker", 0x1 },
+ { "xfs_blockgc_flush_all", 0x1 },
+ /* xfs_scrub */
+ { "xchk_nlinks_live_update", 0x10 },
+ /* xfs_scrub from xchk_metapath_class */
+ { "xchk_metapath_lookup", 0x100 },
+ /* nfsd */
+ { "nfsd_dirent", 0x1 },
+ { "nfsd_file_acquire", 0x1001 },
+ { "nfsd_file_insert_err", 0x1 },
+ { "nfsd_file_cons_err", 0x1 },
+ /* nfs4 */
+ { "nfs4_setup_sequence", 0x1 },
+ { "pnfs_update_layout", 0x10000 },
+ { "nfs4_inode_callback_event", 0x200 },
+ { "nfs4_inode_stateid_callback_event", 0x200 },
+ /* nfs from pnfs_layout_event */
+ { "pnfs_mds_fallback_pg_init_read", 0x10000 },
+ { "pnfs_mds_fallback_pg_init_write", 0x10000 },
+ { "pnfs_mds_fallback_pg_get_mirror_count", 0x10000 },
+ { "pnfs_mds_fallback_read_done", 0x10000 },
+ { "pnfs_mds_fallback_write_done", 0x10000 },
+ { "pnfs_mds_fallback_read_pagelist", 0x10000 },
+ { "pnfs_mds_fallback_write_pagelist", 0x10000 },
+ /* coda */
+ { "coda_dec_pic_run", 0x10 },
+ { "coda_dec_pic_done", 0x10 },
+ /* cfg80211 */
+ { "cfg80211_scan_done", 0x11 },
+ { "rdev_set_coalesce", 0x10 },
+ { "cfg80211_report_wowlan_wakeup", 0x100 },
+ { "cfg80211_inform_bss_frame", 0x100 },
+ { "cfg80211_michael_mic_failure", 0x10000 },
+ /* cfg80211 from wiphy_work_event */
+ { "wiphy_work_queue", 0x10 },
+ { "wiphy_work_run", 0x10 },
+ { "wiphy_work_cancel", 0x10 },
+ { "wiphy_work_flush", 0x10 },
+ /* hugetlbfs */
+ { "hugetlbfs_alloc_inode", 0x10 },
+ /* spufs */
+ { "spufs_context", 0x10 },
+ /* kvm_hv */
+ { "kvm_page_fault_enter", 0x100 },
+ /* dpu */
+ { "dpu_crtc_setup_mixer", 0x100 },
+ /* binder */
+ { "binder_transaction", 0x100 },
+ /* bcachefs */
+ { "btree_path_free", 0x100 },
+ /* hfi1_tx */
+ { "hfi1_sdma_progress", 0x1000 },
+ /* iptfs */
+ { "iptfs_ingress_postq_event", 0x1000 },
+ /* neigh */
+ { "neigh_update", 0x10 },
+ /* snd_firewire_lib */
+ { "amdtp_packet", 0x100 },
+};
+
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
@@ -6198,6 +6662,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const char *tname = prog->aux->attach_func_name;
struct bpf_verifier_log *log = info->log;
const struct btf_param *args;
+ bool ptr_err_raw_tp = false;
const char *tag_value;
u32 nr_args, arg;
int i, ret;
@@ -6207,7 +6672,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
tname, off);
return false;
}
- arg = get_ctx_arg_idx(btf, t, off);
+ arg = btf_ctx_arg_idx(btf, t, off);
args = (const struct btf_param *)(t + 1);
/* if (t == NULL) Fall back to default BPF prog with
* MAX_BPF_FUNC_REG_ARGS u64 arguments.
@@ -6227,8 +6692,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
if (arg == nr_args) {
switch (prog->expected_attach_type) {
- case BPF_LSM_CGROUP:
case BPF_LSM_MAC:
+ /* mark we are accessing the return value */
+ info->is_retval = true;
+ fallthrough;
+ case BPF_LSM_CGROUP:
case BPF_TRACE_FEXIT:
/* When LSM programs are attached to void LSM hooks
* they use FEXIT trampolines and when attached to
@@ -6289,6 +6757,12 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
return false;
}
+ if (size != sizeof(u64)) {
+ bpf_log(log, "func '%s' size %d must be 8\n",
+ tname, size);
+ return false;
+ }
+
/* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */
for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
@@ -6303,14 +6777,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
}
}
- if (t->type == 0)
- /* This is a pointer to void.
- * It is the same as scalar from the verifier safety pov.
- * No further pointer walking is allowed.
- */
- return true;
-
- if (is_int_ptr(btf, t))
+ /*
+ * If it's a pointer to void, it's the same as scalar from the verifier
+ * safety POV. Either way, no futher pointer walking is allowed.
+ */
+ if (is_void_or_int_ptr(btf, t))
return true;
/* this is a pointer to another type */
@@ -6326,6 +6797,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
info->reg_type = ctx_arg_info->reg_type;
info->btf = ctx_arg_info->btf ? : btf_vmlinux;
info->btf_id = ctx_arg_info->btf_id;
+ info->ref_obj_id = ctx_arg_info->ref_obj_id;
return true;
}
}
@@ -6334,6 +6806,42 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
if (prog_args_trusted(prog))
info->reg_type |= PTR_TRUSTED;
+ if (btf_param_match_suffix(btf, &args[arg], "__nullable"))
+ info->reg_type |= PTR_MAYBE_NULL;
+
+ if (prog->expected_attach_type == BPF_TRACE_RAW_TP) {
+ struct btf *btf = prog->aux->attach_btf;
+ const struct btf_type *t;
+ const char *tname;
+
+ /* BTF lookups cannot fail, return false on error */
+ t = btf_type_by_id(btf, prog->aux->attach_btf_id);
+ if (!t)
+ return false;
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (!tname)
+ return false;
+ /* Checked by bpf_check_attach_target */
+ tname += sizeof("btf_trace_") - 1;
+ for (i = 0; i < ARRAY_SIZE(raw_tp_null_args); i++) {
+ /* Is this a func with potential NULL args? */
+ if (strcmp(tname, raw_tp_null_args[i].func))
+ continue;
+ if (raw_tp_null_args[i].mask & (0x1ULL << (arg * 4)))
+ info->reg_type |= PTR_MAYBE_NULL;
+ /* Is the current arg IS_ERR? */
+ if (raw_tp_null_args[i].mask & (0x2ULL << (arg * 4)))
+ ptr_err_raw_tp = true;
+ break;
+ }
+ /* If we don't know NULL-ness specification and the tracepoint
+ * is coming from a loadable module, be conservative and mark
+ * argument as PTR_MAYBE_NULL.
+ */
+ if (i == ARRAY_SIZE(raw_tp_null_args) && btf_is_module(btf))
+ info->reg_type |= PTR_MAYBE_NULL;
+ }
+
if (tgt_prog) {
enum bpf_prog_type tgt_type;
@@ -6356,7 +6864,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
info->btf_id = t->type;
t = btf_type_by_id(btf, t->type);
- if (btf_type_is_type_tag(t)) {
+ if (btf_type_is_type_tag(t) && !btf_type_kflag(t)) {
tag_value = __btf_name_by_offset(btf, t->name_off);
if (strcmp(tag_value, "user") == 0)
info->reg_type |= MEM_USER;
@@ -6378,6 +6886,15 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n",
tname, arg, info->btf_id, btf_type_str(t),
__btf_name_by_offset(btf, t->name_off));
+
+ /* Perform all checks on the validity of type for this argument, but if
+ * we know it can be IS_ERR at runtime, scrub pointer type and mark as
+ * scalar.
+ */
+ if (ptr_err_raw_tp) {
+ bpf_log(log, "marking pointer arg%d as scalar as it may encode error", arg);
+ info->reg_type = SCALAR_VALUE;
+ }
return true;
}
EXPORT_SYMBOL_GPL(btf_ctx_access);
@@ -6606,7 +7123,7 @@ error:
/* check type tag */
t = btf_type_by_id(btf, mtype->type);
- if (btf_type_is_type_tag(t)) {
+ if (btf_type_is_type_tag(t) && !btf_type_kflag(t)) {
tag_value = __btf_name_by_offset(btf, t->name_off);
/* check __user tag */
if (strcmp(tag_value, "user") == 0)
@@ -6672,7 +7189,7 @@ int btf_struct_access(struct bpf_verifier_log *log,
for (i = 0; i < rec->cnt; i++) {
struct btf_field *field = &rec->fields[i];
u32 offset = field->offset;
- if (off < offset + btf_field_type_size(field->type) && offset < off + size) {
+ if (off < offset + field->size && offset < off + size) {
bpf_log(log,
"direct access to %s is disallowed\n",
btf_field_type_name(field->type));
@@ -7143,7 +7660,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
return 0;
if (!prog->aux->func_info) {
- bpf_log(log, "Verifier bug\n");
+ verifier_bug(env, "func_info undefined");
return -EFAULT;
}
@@ -7167,7 +7684,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
tname = btf_name_by_offset(btf, fn_t->name_off);
if (prog->aux->func_info_aux[subprog].unreliable) {
- bpf_log(log, "Verifier bug in function %s()\n", tname);
+ verifier_bug(env, "unreliable BTF for function %s()", tname);
return -EFAULT;
}
if (prog_type == BPF_PROG_TYPE_EXT)
@@ -7349,8 +7866,8 @@ static void btf_type_show(const struct btf *btf, u32 type_id, void *obj,
btf_type_ops(t)->show(btf, t, type_id, obj, 0, show);
}
-static void btf_seq_show(struct btf_show *show, const char *fmt,
- va_list args)
+__printf(2, 0) static void btf_seq_show(struct btf_show *show, const char *fmt,
+ va_list args)
{
seq_vprintf((struct seq_file *)show->target, fmt, args);
}
@@ -7383,8 +7900,8 @@ struct btf_show_snprintf {
int len; /* length we would have written */
};
-static void btf_snprintf_show(struct btf_show *show, const char *fmt,
- va_list args)
+__printf(2, 0) static void btf_snprintf_show(struct btf_show *show, const char *fmt,
+ va_list args)
{
struct btf_show_snprintf *ssnprintf = (struct btf_show_snprintf *)show;
int len;
@@ -7484,21 +8001,11 @@ int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
struct btf *btf_get_by_fd(int fd)
{
struct btf *btf;
- struct fd f;
-
- f = fdget(fd);
-
- if (!f.file)
- return ERR_PTR(-EBADF);
-
- if (f.file->f_op != &btf_fops) {
- fdput(f);
- return ERR_PTR(-EINVAL);
- }
+ CLASS(fd, f)(fd);
- btf = f.file->private_data;
- refcount_inc(&btf->refcnt);
- fdput(f);
+ btf = __btf_get_by_fd(f);
+ if (!IS_ERR(btf))
+ refcount_inc(&btf->refcnt);
return btf;
}
@@ -7615,17 +8122,6 @@ struct btf_module {
static LIST_HEAD(btf_modules);
static DEFINE_MUTEX(btf_module_mutex);
-static ssize_t
-btf_module_read(struct file *file, struct kobject *kobj,
- struct bin_attribute *bin_attr,
- char *buf, loff_t off, size_t len)
-{
- const struct btf *btf = bin_attr->private;
-
- memcpy(buf, btf->data + off, len);
- return len;
-}
-
static void purge_cand_cache(struct btf *btf);
static int btf_module_notify(struct notifier_block *nb, unsigned long op,
@@ -7648,7 +8144,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
err = -ENOMEM;
goto out;
}
- btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size);
+ btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size,
+ mod->btf_base_data, mod->btf_base_data_size);
if (IS_ERR(btf)) {
kfree(btf_mod);
if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) {
@@ -7685,8 +8182,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
attr->attr.name = btf->name;
attr->attr.mode = 0444;
attr->size = btf->data_size;
- attr->private = btf;
- attr->read = btf_module_read;
+ attr->private = btf->data;
+ attr->read_new = sysfs_bin_attr_simple_read;
err = sysfs_create_bin_file(btf_kobj, attr);
if (err) {
@@ -7859,15 +8356,44 @@ BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)
BTF_TRACING_TYPE_xxx
#undef BTF_TRACING_TYPE
+/* Validate well-formedness of iter argument type.
+ * On success, return positive BTF ID of iter state's STRUCT type.
+ * On error, negative error is returned.
+ */
+int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx)
+{
+ const struct btf_param *arg;
+ const struct btf_type *t;
+ const char *name;
+ int btf_id;
+
+ if (btf_type_vlen(func) <= arg_idx)
+ return -EINVAL;
+
+ arg = &btf_params(func)[arg_idx];
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!t || !btf_type_is_ptr(t))
+ return -EINVAL;
+ t = btf_type_skip_modifiers(btf, t->type, &btf_id);
+ if (!t || !__btf_type_is_struct(t))
+ return -EINVAL;
+
+ name = btf_name_by_offset(btf, t->name_off);
+ if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1))
+ return -EINVAL;
+
+ return btf_id;
+}
+
static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name,
const struct btf_type *func, u32 func_flags)
{
u32 flags = func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
- const char *name, *sfx, *iter_name;
- const struct btf_param *arg;
+ const char *sfx, *iter_name;
const struct btf_type *t;
char exp_name[128];
u32 nr_args;
+ int btf_id;
/* exactly one of KF_ITER_{NEW,NEXT,DESTROY} can be set */
if (!flags || (flags & (flags - 1)))
@@ -7878,28 +8404,21 @@ static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name,
if (nr_args < 1)
return -EINVAL;
- arg = &btf_params(func)[0];
- t = btf_type_skip_modifiers(btf, arg->type, NULL);
- if (!t || !btf_type_is_ptr(t))
- return -EINVAL;
- t = btf_type_skip_modifiers(btf, t->type, NULL);
- if (!t || !__btf_type_is_struct(t))
- return -EINVAL;
-
- name = btf_name_by_offset(btf, t->name_off);
- if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1))
- return -EINVAL;
+ btf_id = btf_check_iter_arg(btf, func, 0);
+ if (btf_id < 0)
+ return btf_id;
/* sizeof(struct bpf_iter_<type>) should be a multiple of 8 to
* fit nicely in stack slots
*/
+ t = btf_type_by_id(btf, btf_id);
if (t->size == 0 || (t->size % 8))
return -EINVAL;
/* validate bpf_iter_<type>_{new,next,destroy}(struct bpf_iter_<type> *)
* naming pattern
*/
- iter_name = name + sizeof(ITER_PREFIX) - 1;
+ iter_name = btf_name_by_offset(btf, t->name_off) + sizeof(ITER_PREFIX) - 1;
if (flags & KF_ITER_NEW)
sfx = "new";
else if (flags & KF_ITER_NEXT)
@@ -7972,7 +8491,7 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
bool add_filter = !!kset->filter;
struct btf_kfunc_set_tab *tab;
struct btf_id_set8 *set;
- u32 set_cnt;
+ u32 set_cnt, i;
int ret;
if (hook >= BTF_KFUNC_HOOK_MAX) {
@@ -8018,21 +8537,15 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
goto end;
}
- /* We don't need to allocate, concatenate, and sort module sets, because
- * only one is allowed per hook. Hence, we can directly assign the
- * pointer and return.
- */
- if (!vmlinux_set) {
- tab->sets[hook] = add_set;
- goto do_add_filter;
- }
-
/* In case of vmlinux sets, there may be more than one set being
* registered per hook. To create a unified set, we allocate a new set
* and concatenate all individual sets being registered. While each set
* is individually sorted, they may become unsorted when concatenated,
* hence re-sorting the final set again is required to make binary
* searching the set using btf_id_set8_contains function work.
+ *
+ * For module sets, we need to allocate as we may need to relocate
+ * BTF ids.
*/
set_cnt = set ? set->cnt : 0;
@@ -8048,7 +8561,7 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
/* Grow set */
set = krealloc(tab->sets[hook],
- offsetof(struct btf_id_set8, pairs[set_cnt + add_set->cnt]),
+ struct_size(set, pairs, set_cnt + add_set->cnt),
GFP_KERNEL | __GFP_NOWARN);
if (!set) {
ret = -ENOMEM;
@@ -8062,11 +8575,14 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
/* Concatenate the two sets */
memcpy(set->pairs + set->cnt, add_set->pairs, add_set->cnt * sizeof(set->pairs[0]));
+ /* Now that the set is copied, update with relocated BTF ids */
+ for (i = set->cnt; i < set->cnt + add_set->cnt; i++)
+ set->pairs[i].id = btf_relocate_id(btf, set->pairs[i].id);
+
set->cnt += add_set->cnt;
sort(set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func, NULL);
-do_add_filter:
if (add_filter) {
hook_filter = &tab->hook_filters[hook];
hook_filter->filters[hook_filter->nr_filters++] = kset->filter;
@@ -8117,13 +8633,20 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
case BPF_PROG_TYPE_STRUCT_OPS:
return BTF_KFUNC_HOOK_STRUCT_OPS;
case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_LSM:
return BTF_KFUNC_HOOK_TRACING;
case BPF_PROG_TYPE_SYSCALL:
return BTF_KFUNC_HOOK_SYSCALL;
case BPF_PROG_TYPE_CGROUP_SKB:
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
- return BTF_KFUNC_HOOK_CGROUP_SKB;
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ case BPF_PROG_TYPE_SOCK_OPS:
+ return BTF_KFUNC_HOOK_CGROUP;
case BPF_PROG_TYPE_SCHED_ACT:
return BTF_KFUNC_HOOK_SCHED_ACT;
case BPF_PROG_TYPE_SK_SKB:
@@ -8137,6 +8660,8 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
return BTF_KFUNC_HOOK_LWT;
case BPF_PROG_TYPE_NETFILTER:
return BTF_KFUNC_HOOK_NETFILTER;
+ case BPF_PROG_TYPE_KPROBE:
+ return BTF_KFUNC_HOOK_KPROBE;
default:
return BTF_KFUNC_HOOK_MAX;
}
@@ -8184,7 +8709,7 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
return PTR_ERR(btf);
for (i = 0; i < kset->set->cnt; i++) {
- ret = btf_check_kfunc_protos(btf, kset->set->pairs[i].id,
+ ret = btf_check_kfunc_protos(btf, btf_relocate_id(btf, kset->set->pairs[i].id),
kset->set->pairs[i].flags);
if (ret)
goto err_out;
@@ -8248,7 +8773,7 @@ static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc
u32 nr_args, i;
for (i = 0; i < cnt; i++) {
- dtor_btf_id = dtors[i].kfunc_btf_id;
+ dtor_btf_id = btf_relocate_id(btf, dtors[i].kfunc_btf_id);
dtor_func = btf_type_by_id(btf, dtor_btf_id);
if (!dtor_func || !btf_type_is_func(dtor_func))
@@ -8283,7 +8808,7 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c
{
struct btf_id_dtor_kfunc_tab *tab;
struct btf *btf;
- u32 tab_cnt;
+ u32 tab_cnt, i;
int ret;
btf = btf_get_module_btf(owner);
@@ -8322,7 +8847,7 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c
}
tab = krealloc(btf->dtor_kfunc_tab,
- offsetof(struct btf_id_dtor_kfunc_tab, dtors[tab_cnt + add_cnt]),
+ struct_size(tab, dtors, tab_cnt + add_cnt),
GFP_KERNEL | __GFP_NOWARN);
if (!tab) {
ret = -ENOMEM;
@@ -8334,6 +8859,13 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c
btf->dtor_kfunc_tab = tab;
memcpy(tab->dtors + tab->cnt, dtors, add_cnt * sizeof(tab->dtors[0]));
+
+ /* remap BTF ids based on BTF relocation (if any) */
+ for (i = tab_cnt; i < tab_cnt + add_cnt; i++) {
+ tab->dtors[i].btf_id = btf_relocate_id(btf, tab->dtors[i].btf_id);
+ tab->dtors[i].kfunc_btf_id = btf_relocate_id(btf, tab->dtors[i].kfunc_btf_id);
+ }
+
tab->cnt += add_cnt;
sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL);
@@ -8690,6 +9222,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
struct bpf_core_cand_list cands = {};
struct bpf_core_relo_res targ_res;
struct bpf_core_spec *specs;
+ const struct btf_type *type;
int err;
/* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5"
@@ -8699,6 +9232,14 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
if (!specs)
return -ENOMEM;
+ type = btf_type_by_id(ctx->btf, relo->type_id);
+ if (!type) {
+ bpf_log(ctx->log, "relo #%u: bad type id %u\n",
+ relo_idx, relo->type_id);
+ kfree(specs);
+ return -EINVAL;
+ }
+
if (need_cands) {
struct bpf_cand_cache *cc;
int i;
@@ -8864,8 +9405,7 @@ btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops,
tab = btf->struct_ops_tab;
if (!tab) {
- tab = kzalloc(offsetof(struct btf_struct_ops_tab, ops[4]),
- GFP_KERNEL);
+ tab = kzalloc(struct_size(tab, ops, 4), GFP_KERNEL);
if (!tab)
return -ENOMEM;
tab->capacity = 4;
@@ -8878,8 +9418,7 @@ btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops,
if (tab->cnt == tab->capacity) {
new_tab = krealloc(tab,
- offsetof(struct btf_struct_ops_tab,
- ops[tab->capacity * 2]),
+ struct_size(tab, ops, tab->capacity * 2),
GFP_KERNEL);
if (!new_tab)
return -ENOMEM;
diff --git a/kernel/bpf/btf_iter.c b/kernel/bpf/btf_iter.c
new file mode 100644
index 000000000000..0e2c66a52df9
--- /dev/null
+++ b/kernel/bpf/btf_iter.c
@@ -0,0 +1,2 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+#include "../../tools/lib/bpf/btf_iter.c"
diff --git a/kernel/bpf/btf_relocate.c b/kernel/bpf/btf_relocate.c
new file mode 100644
index 000000000000..c12ccbf66507
--- /dev/null
+++ b/kernel/bpf/btf_relocate.c
@@ -0,0 +1,2 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+#include "../../tools/lib/bpf/btf_relocate.c"
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 82243cb6c54d..9122c39870bf 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -24,6 +24,36 @@
DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
+/*
+ * cgroup bpf destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions. Use a separate workqueue so that cgroup bpf
+ * destruction work items don't end up filling up max_active of system_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_bpf_destroy_wq;
+
+static int __init cgroup_bpf_wq_init(void)
+{
+ cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1);
+ if (!cgroup_bpf_destroy_wq)
+ panic("Failed to alloc workqueue for cgroup bpf destroy.\n");
+ return 0;
+}
+core_initcall(cgroup_bpf_wq_init);
+
+static int cgroup_bpf_lifetime_notify(struct notifier_block *nb,
+ unsigned long action, void *data);
+
+static struct notifier_block cgroup_bpf_lifetime_nb = {
+ .notifier_call = cgroup_bpf_lifetime_notify,
+};
+
+void __init cgroup_bpf_lifetime_notifier_init(void)
+{
+ BUG_ON(blocking_notifier_chain_register(&cgroup_lifetime_notifier,
+ &cgroup_bpf_lifetime_nb));
+}
+
/* __always_inline is necessary to prevent indirect call through run_prog
* function pointer.
*/
@@ -189,7 +219,7 @@ bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
}
#endif /* CONFIG_BPF_LSM */
-void cgroup_bpf_offline(struct cgroup *cgrp)
+static void cgroup_bpf_offline(struct cgroup *cgrp)
{
cgroup_get(cgrp);
percpu_ref_kill(&cgrp->bpf.refcnt);
@@ -334,7 +364,7 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref)
struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
- queue_work(system_wq, &cgrp->bpf.release_work);
+ queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work);
}
/* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through
@@ -352,7 +382,7 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
/* count number of elements in the list.
* it's slow but the list cannot be long
*/
-static u32 prog_list_length(struct hlist_head *head)
+static u32 prog_list_length(struct hlist_head *head, int *preorder_cnt)
{
struct bpf_prog_list *pl;
u32 cnt = 0;
@@ -360,6 +390,8 @@ static u32 prog_list_length(struct hlist_head *head)
hlist_for_each_entry(pl, head, node) {
if (!prog_list_prog(pl))
continue;
+ if (preorder_cnt && (pl->flags & BPF_F_PREORDER))
+ (*preorder_cnt)++;
cnt++;
}
return cnt;
@@ -383,7 +415,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
if (flags & BPF_F_ALLOW_MULTI)
return true;
- cnt = prog_list_length(&p->bpf.progs[atype]);
+ cnt = prog_list_length(&p->bpf.progs[atype], NULL);
WARN_ON_ONCE(cnt > 1);
if (cnt == 1)
return !!(flags & BPF_F_ALLOW_OVERRIDE);
@@ -406,12 +438,12 @@ static int compute_effective_progs(struct cgroup *cgrp,
struct bpf_prog_array *progs;
struct bpf_prog_list *pl;
struct cgroup *p = cgrp;
- int cnt = 0;
+ int i, j, cnt = 0, preorder_cnt = 0, fstart, bstart, init_bstart;
/* count number of effective programs by walking parents */
do {
if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
- cnt += prog_list_length(&p->bpf.progs[atype]);
+ cnt += prog_list_length(&p->bpf.progs[atype], &preorder_cnt);
p = cgroup_parent(p);
} while (p);
@@ -422,20 +454,34 @@ static int compute_effective_progs(struct cgroup *cgrp,
/* populate the array with effective progs */
cnt = 0;
p = cgrp;
+ fstart = preorder_cnt;
+ bstart = preorder_cnt - 1;
do {
if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
continue;
+ init_bstart = bstart;
hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
if (!prog_list_prog(pl))
continue;
- item = &progs->items[cnt];
+ if (pl->flags & BPF_F_PREORDER) {
+ item = &progs->items[bstart];
+ bstart--;
+ } else {
+ item = &progs->items[fstart];
+ fstart++;
+ }
item->prog = prog_list_prog(pl);
bpf_cgroup_storages_assign(item->cgroup_storage,
pl->storage);
cnt++;
}
+
+ /* reverse pre-ordering progs at this cgroup level */
+ for (i = bstart + 1, j = init_bstart; i < j; i++, j--)
+ swap(progs->items[i], progs->items[j]);
+
} while ((p = cgroup_parent(p)));
*array = progs;
@@ -458,7 +504,7 @@ static void activate_effective_progs(struct cgroup *cgrp,
* cgroup_bpf_inherit() - inherit effective programs from parent
* @cgrp: the cgroup to modify
*/
-int cgroup_bpf_inherit(struct cgroup *cgrp)
+static int cgroup_bpf_inherit(struct cgroup *cgrp)
{
/* has to use marco instead of const int, since compiler thinks
* that array below is variable length
@@ -501,6 +547,27 @@ cleanup:
return -ENOMEM;
}
+static int cgroup_bpf_lifetime_notify(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct cgroup *cgrp = data;
+ int ret = 0;
+
+ if (cgrp->root != &cgrp_dfl_root)
+ return NOTIFY_OK;
+
+ switch (action) {
+ case CGROUP_LIFETIME_ONLINE:
+ ret = cgroup_bpf_inherit(cgrp);
+ break;
+ case CGROUP_LIFETIME_OFFLINE:
+ cgroup_bpf_offline(cgrp);
+ break;
+ }
+
+ return notifier_from_errno(ret);
+}
+
static int update_effective_progs(struct cgroup *cgrp,
enum cgroup_bpf_attach_type atype)
{
@@ -646,7 +713,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
*/
return -EPERM;
- if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
+ if (prog_list_length(progs, NULL) >= BPF_CGROUP_MAX_PROGS)
return -E2BIG;
pl = find_attach_entry(progs, prog, link, replace_prog,
@@ -681,6 +748,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
pl->prog = prog;
pl->link = link;
+ pl->flags = flags;
bpf_cgroup_storages_assign(pl->storage, storage);
cgrp->bpf.flags[atype] = saved_flags;
@@ -1056,7 +1124,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
lockdep_is_held(&cgroup_mutex));
total_cnt += bpf_prog_array_length(effective);
} else {
- total_cnt += prog_list_length(&cgrp->bpf.progs[atype]);
+ total_cnt += prog_list_length(&cgrp->bpf.progs[atype], NULL);
}
}
@@ -1088,7 +1156,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
u32 id;
progs = &cgrp->bpf.progs[atype];
- cnt = min_t(int, prog_list_length(progs), total_cnt);
+ cnt = min_t(int, prog_list_length(progs, NULL), total_cnt);
i = 0;
hlist_for_each_entry(pl, progs, node) {
prog = prog_list_prog(pl);
@@ -1619,10 +1687,6 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
if (func_proto)
return func_proto;
- func_proto = cgroup_current_func_proto(func_id, prog);
- if (func_proto)
- return func_proto;
-
switch (func_id) {
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
@@ -1691,7 +1755,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
* returned value != 1 during execution. In all other cases 0 is returned.
*/
int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
- struct ctl_table *table, int write,
+ const struct ctl_table *table, int write,
char **buf, size_t *pcount, loff_t *ppos,
enum cgroup_bpf_attach_type atype)
{
@@ -2170,10 +2234,6 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
if (func_proto)
return func_proto;
- func_proto = cgroup_current_func_proto(func_id, prog);
- if (func_proto)
- return func_proto;
-
switch (func_id) {
case BPF_FUNC_sysctl_get_name:
return &bpf_sysctl_get_name_proto;
@@ -2317,10 +2377,6 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
if (func_proto)
return func_proto;
- func_proto = cgroup_current_func_proto(func_id, prog);
- if (func_proto)
- return func_proto;
-
switch (func_id) {
#ifdef CONFIG_NET
case BPF_FUNC_get_netns_cookie:
@@ -2567,23 +2623,3 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return NULL;
}
}
-
-/* Common helpers for cgroup hooks with valid process context. */
-const struct bpf_func_proto *
-cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
-{
- switch (func_id) {
- case BPF_FUNC_get_current_uid_gid:
- return &bpf_get_current_uid_gid_proto;
- case BPF_FUNC_get_current_pid_tgid:
- return &bpf_get_current_pid_tgid_proto;
- case BPF_FUNC_get_current_comm:
- return &bpf_get_current_comm_proto;
-#ifdef CONFIG_CGROUP_NET_CLASSID
- case BPF_FUNC_get_cgroup_classid:
- return &bpf_get_cgroup_classid_curr_proto;
-#endif
- default:
- return NULL;
- }
-}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 696bc55de8e8..c20babbf998f 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -21,11 +21,11 @@
#include <linux/filter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
-#include <linux/random.h>
-#include <linux/moduleloader.h>
+#include <linux/prandom.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/objtool.h>
+#include <linux/overflow.h>
#include <linux/rbtree_latch.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
@@ -37,9 +37,10 @@
#include <linux/nospec.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/memcontrol.h>
+#include <linux/execmem.h>
#include <asm/barrier.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
/* Registers */
#define BPF_R0 regs[BPF_REG_0]
@@ -130,6 +131,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode);
#endif
mutex_init(&fp->aux->used_maps_mutex);
+ mutex_init(&fp->aux->ext_mutex);
mutex_init(&fp->aux->dst_mutex);
return fp;
@@ -537,6 +539,8 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt)
{
+ int err;
+
/* Branch offsets can't overflow when program is shrinking, no need
* to call bpf_adj_branches(..., true) here
*/
@@ -544,7 +548,9 @@ int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt)
sizeof(struct bpf_insn) * (prog->len - off - cnt));
prog->len -= cnt;
- return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false));
+ err = bpf_adj_branches(prog, off, off + cnt, off, false);
+ WARN_ON_ONCE(err);
+ return err;
}
static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
@@ -735,11 +741,11 @@ static struct bpf_ksym *bpf_ksym_find(unsigned long addr)
return n ? container_of(n, struct bpf_ksym, tnode) : NULL;
}
-const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
+int __bpf_address_lookup(unsigned long addr, unsigned long *size,
unsigned long *off, char *sym)
{
struct bpf_ksym *ksym;
- char *ret = NULL;
+ int ret = 0;
rcu_read_lock();
ksym = bpf_ksym_find(addr);
@@ -747,9 +753,8 @@ const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
unsigned long symbol_start = ksym->start;
unsigned long symbol_end = ksym->end;
- strncpy(sym, ksym->name, KSYM_NAME_LEN);
+ ret = strscpy(sym, ksym->name, KSYM_NAME_LEN);
- ret = sym;
if (size)
*size = symbol_end - symbol_start;
if (off)
@@ -813,7 +818,7 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
if (it++ != symnum)
continue;
- strncpy(sym, ksym->name, KSYM_NAME_LEN);
+ strscpy(sym, ksym->name, KSYM_NAME_LEN);
*value = ksym->start;
*type = BPF_SYM_ELF_TYPE;
@@ -849,7 +854,7 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
return -EINVAL;
}
- tab = krealloc(tab, size * sizeof(*poke), GFP_KERNEL);
+ tab = krealloc_array(tab, size, sizeof(*poke), GFP_KERNEL);
if (!tab)
return -ENOMEM;
@@ -908,23 +913,30 @@ static LIST_HEAD(pack_list);
static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
struct bpf_prog_pack *pack;
+ int err;
pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)),
GFP_KERNEL);
if (!pack)
return NULL;
pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE);
- if (!pack->ptr) {
- kfree(pack);
- return NULL;
- }
+ if (!pack->ptr)
+ goto out;
bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE);
bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE);
- list_add_tail(&pack->list, &pack_list);
set_vm_flush_reset_perms(pack->ptr);
- set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
+ err = set_memory_rox((unsigned long)pack->ptr,
+ BPF_PROG_PACK_SIZE / PAGE_SIZE);
+ if (err)
+ goto out;
+ list_add_tail(&pack->list, &pack_list);
return pack;
+
+out:
+ bpf_jit_free_exec(pack->ptr);
+ kfree(pack);
+ return NULL;
}
void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
@@ -939,9 +951,16 @@ void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
size = round_up(size, PAGE_SIZE);
ptr = bpf_jit_alloc_exec(size);
if (ptr) {
+ int err;
+
bpf_fill_ill_insns(ptr, size);
set_vm_flush_reset_perms(ptr);
- set_memory_rox((unsigned long)ptr, size / PAGE_SIZE);
+ err = set_memory_rox((unsigned long)ptr,
+ size / PAGE_SIZE);
+ if (err) {
+ bpf_jit_free_exec(ptr);
+ ptr = NULL;
+ }
}
goto out;
}
@@ -1050,12 +1069,12 @@ void bpf_jit_uncharge_modmem(u32 size)
void *__weak bpf_jit_alloc_exec(unsigned long size)
{
- return module_alloc(size);
+ return execmem_alloc(EXECMEM_BPF, size);
}
void __weak bpf_jit_free_exec(void *addr)
{
- module_memfree(addr);
+ execmem_free(addr);
}
struct bpf_binary_header *
@@ -1159,8 +1178,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
}
/* Copy JITed text from rw_header to its final location, the ro_header. */
-int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
- struct bpf_binary_header *ro_header,
+int bpf_jit_binary_pack_finalize(struct bpf_binary_header *ro_header,
struct bpf_binary_header *rw_header)
{
void *ptr;
@@ -1645,14 +1663,17 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
INSN_3(JMP, JSET, K), \
INSN_2(JMP, JA), \
INSN_2(JMP32, JA), \
+ /* Atomic operations. */ \
+ INSN_3(STX, ATOMIC, B), \
+ INSN_3(STX, ATOMIC, H), \
+ INSN_3(STX, ATOMIC, W), \
+ INSN_3(STX, ATOMIC, DW), \
/* Store instructions. */ \
/* Register based. */ \
INSN_3(STX, MEM, B), \
INSN_3(STX, MEM, H), \
INSN_3(STX, MEM, W), \
INSN_3(STX, MEM, DW), \
- INSN_3(STX, ATOMIC, W), \
- INSN_3(STX, ATOMIC, DW), \
/* Immediate based. */ \
INSN_3(ST, MEM, B), \
INSN_3(ST, MEM, H), \
@@ -2134,24 +2155,33 @@ out:
if (BPF_SIZE(insn->code) == BPF_W) \
atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \
(DST + insn->off)); \
- else \
+ else if (BPF_SIZE(insn->code) == BPF_DW) \
atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \
(DST + insn->off)); \
+ else \
+ goto default_label; \
break; \
case BOP | BPF_FETCH: \
if (BPF_SIZE(insn->code) == BPF_W) \
SRC = (u32) atomic_fetch_##KOP( \
(u32) SRC, \
(atomic_t *)(unsigned long) (DST + insn->off)); \
- else \
+ else if (BPF_SIZE(insn->code) == BPF_DW) \
SRC = (u64) atomic64_fetch_##KOP( \
(u64) SRC, \
(atomic64_t *)(unsigned long) (DST + insn->off)); \
+ else \
+ goto default_label; \
break;
STX_ATOMIC_DW:
STX_ATOMIC_W:
+ STX_ATOMIC_H:
+ STX_ATOMIC_B:
switch (IMM) {
+ /* Atomic read-modify-write instructions support only W and DW
+ * size modifiers.
+ */
ATOMIC_ALU_OP(BPF_ADD, add)
ATOMIC_ALU_OP(BPF_AND, and)
ATOMIC_ALU_OP(BPF_OR, or)
@@ -2163,20 +2193,63 @@ out:
SRC = (u32) atomic_xchg(
(atomic_t *)(unsigned long) (DST + insn->off),
(u32) SRC);
- else
+ else if (BPF_SIZE(insn->code) == BPF_DW)
SRC = (u64) atomic64_xchg(
(atomic64_t *)(unsigned long) (DST + insn->off),
(u64) SRC);
+ else
+ goto default_label;
break;
case BPF_CMPXCHG:
if (BPF_SIZE(insn->code) == BPF_W)
BPF_R0 = (u32) atomic_cmpxchg(
(atomic_t *)(unsigned long) (DST + insn->off),
(u32) BPF_R0, (u32) SRC);
- else
+ else if (BPF_SIZE(insn->code) == BPF_DW)
BPF_R0 = (u64) atomic64_cmpxchg(
(atomic64_t *)(unsigned long) (DST + insn->off),
(u64) BPF_R0, (u64) SRC);
+ else
+ goto default_label;
+ break;
+ /* Atomic load and store instructions support all size
+ * modifiers.
+ */
+ case BPF_LOAD_ACQ:
+ switch (BPF_SIZE(insn->code)) {
+#define LOAD_ACQUIRE(SIZEOP, SIZE) \
+ case BPF_##SIZEOP: \
+ DST = (SIZE)smp_load_acquire( \
+ (SIZE *)(unsigned long)(SRC + insn->off)); \
+ break;
+ LOAD_ACQUIRE(B, u8)
+ LOAD_ACQUIRE(H, u16)
+ LOAD_ACQUIRE(W, u32)
+#ifdef CONFIG_64BIT
+ LOAD_ACQUIRE(DW, u64)
+#endif
+#undef LOAD_ACQUIRE
+ default:
+ goto default_label;
+ }
+ break;
+ case BPF_STORE_REL:
+ switch (BPF_SIZE(insn->code)) {
+#define STORE_RELEASE(SIZEOP, SIZE) \
+ case BPF_##SIZEOP: \
+ smp_store_release( \
+ (SIZE *)(unsigned long)(DST + insn->off), (SIZE)SRC); \
+ break;
+ STORE_RELEASE(B, u8)
+ STORE_RELEASE(H, u16)
+ STORE_RELEASE(W, u32)
+#ifdef CONFIG_64BIT
+ STORE_RELEASE(DW, u64)
+#endif
+#undef STORE_RELEASE
+ default:
+ goto default_label;
+ }
break;
default:
@@ -2204,6 +2277,7 @@ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn
u64 stack[stack_size / sizeof(u64)]; \
u64 regs[MAX_BPF_EXT_REG] = {}; \
\
+ kmsan_unpoison_memory(stack, sizeof(stack)); \
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
ARG1 = (u64) (unsigned long) ctx; \
return ___bpf_prog_run(regs, insn); \
@@ -2217,6 +2291,7 @@ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \
u64 stack[stack_size / sizeof(u64)]; \
u64 regs[MAX_BPF_EXT_REG]; \
\
+ kmsan_unpoison_memory(stack, sizeof(stack)); \
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
BPF_R1 = r1; \
BPF_R2 = r2; \
@@ -2270,35 +2345,29 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
insn->code = BPF_JMP | BPF_CALL_ARGS;
}
#endif
-#else
+#endif
+
static unsigned int __bpf_prog_ret0_warn(const void *ctx,
const struct bpf_insn *insn)
{
/* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON
- * is not working properly, so warn about it!
+ * is not working properly, or interpreter is being used when
+ * prog->jit_requested is not 0, so warn about it!
*/
WARN_ON_ONCE(1);
return 0;
}
-#endif
-bool bpf_prog_map_compatible(struct bpf_map *map,
- const struct bpf_prog *fp)
+static bool __bpf_prog_map_compatible(struct bpf_map *map,
+ const struct bpf_prog *fp)
{
enum bpf_prog_type prog_type = resolve_prog_type(fp);
bool ret;
+ struct bpf_prog_aux *aux = fp->aux;
if (fp->kprobe_override)
return false;
- /* XDP programs inserted into maps are not guaranteed to run on
- * a particular netdev (and can run outside driver context entirely
- * in the case of devmap and cpumap). Until device checks
- * are implemented, prohibit adding dev-bound programs to program maps.
- */
- if (bpf_prog_is_dev_bound(fp->aux))
- return false;
-
spin_lock(&map->owner.lock);
if (!map->owner.type) {
/* There's no owner yet where we could check for
@@ -2306,18 +2375,45 @@ bool bpf_prog_map_compatible(struct bpf_map *map,
*/
map->owner.type = prog_type;
map->owner.jited = fp->jited;
- map->owner.xdp_has_frags = fp->aux->xdp_has_frags;
+ map->owner.xdp_has_frags = aux->xdp_has_frags;
+ map->owner.attach_func_proto = aux->attach_func_proto;
ret = true;
} else {
ret = map->owner.type == prog_type &&
map->owner.jited == fp->jited &&
- map->owner.xdp_has_frags == fp->aux->xdp_has_frags;
+ map->owner.xdp_has_frags == aux->xdp_has_frags;
+ if (ret &&
+ map->owner.attach_func_proto != aux->attach_func_proto) {
+ switch (prog_type) {
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_LSM:
+ case BPF_PROG_TYPE_EXT:
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ ret = false;
+ break;
+ default:
+ break;
+ }
+ }
}
spin_unlock(&map->owner.lock);
return ret;
}
+bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp)
+{
+ /* XDP programs inserted into maps are not guaranteed to run on
+ * a particular netdev (and can run outside driver context entirely
+ * in the case of devmap and cpumap). Until device checks
+ * are implemented, prohibit adding dev-bound programs to program maps.
+ */
+ if (bpf_prog_is_dev_bound(fp->aux))
+ return false;
+
+ return __bpf_prog_map_compatible(map, fp);
+}
+
static int bpf_check_tail_call(const struct bpf_prog *fp)
{
struct bpf_prog_aux *aux = fp->aux;
@@ -2330,7 +2426,7 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
if (!map_type_contains_progs(map))
continue;
- if (!bpf_prog_map_compatible(map, fp)) {
+ if (!__bpf_prog_map_compatible(map, fp)) {
ret = -EINVAL;
goto out;
}
@@ -2345,8 +2441,18 @@ static void bpf_prog_select_func(struct bpf_prog *fp)
{
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
+ u32 idx = (round_up(stack_depth, 32) / 32) - 1;
- fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
+ /* may_goto may cause stack size > 512, leading to idx out-of-bounds.
+ * But for non-JITed programs, we don't need bpf_func, so no bounds
+ * check needed.
+ */
+ if (!fp->jit_requested &&
+ !WARN_ON_ONCE(idx >= ARRAY_SIZE(interpreters))) {
+ fp->bpf_func = interpreters[idx];
+ } else {
+ fp->bpf_func = __bpf_prog_ret0_warn;
+ }
#else
fp->bpf_func = __bpf_prog_ret0_warn;
#endif
@@ -2368,7 +2474,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
/* In case of BPF to BPF calls, verifier did all the prep
* work with regards to JITing, etc.
*/
- bool jit_needed = false;
+ bool jit_needed = fp->jit_requested;
if (fp->bpf_func)
goto finalize;
@@ -2403,7 +2509,9 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
}
finalize:
- bpf_prog_lock_ro(fp);
+ *err = bpf_prog_lock_ro(fp);
+ if (*err)
+ return fp;
/* The tail call compatibility check can only be done at
* this late stage as we need to determine, if we deal
@@ -2437,13 +2545,14 @@ EXPORT_SYMBOL(bpf_empty_prog_array);
struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
{
+ struct bpf_prog_array *p;
+
if (prog_cnt)
- return kzalloc(sizeof(struct bpf_prog_array) +
- sizeof(struct bpf_prog_array_item) *
- (prog_cnt + 1),
- flags);
+ p = kzalloc(struct_size(p, items, prog_cnt + 1), flags);
+ else
+ p = &bpf_empty_prog_array.hdr;
- return &bpf_empty_prog_array.hdr;
+ return p;
}
void bpf_prog_array_free(struct bpf_prog_array *progs)
@@ -2723,8 +2832,7 @@ static void bpf_free_used_maps(struct bpf_prog_aux *aux)
kfree(aux->used_maps);
}
-void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
- struct btf_mod_pair *used_btfs, u32 len)
+void __bpf_free_used_btfs(struct btf_mod_pair *used_btfs, u32 len)
{
#ifdef CONFIG_BPF_SYSCALL
struct btf_mod_pair *btf_mod;
@@ -2741,7 +2849,7 @@ void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
static void bpf_free_used_btfs(struct bpf_prog_aux *aux)
{
- __bpf_free_used_btfs(aux, aux->used_btfs, aux->used_btf_cnt);
+ __bpf_free_used_btfs(aux->used_btfs, aux->used_btf_cnt);
kfree(aux->used_btfs);
}
@@ -2796,7 +2904,7 @@ void bpf_prog_free(struct bpf_prog *fp)
}
EXPORT_SYMBOL_GPL(bpf_prog_free);
-/* RNG for unpriviledged user space with separated state from prandom_u32(). */
+/* RNG for unprivileged user space with separated state from prandom_u32(). */
static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state);
void bpf_user_rnd_init_once(void)
@@ -2869,6 +2977,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void)
return NULL;
}
+const struct bpf_func_proto * __weak bpf_get_perf_event_read_value_proto(void)
+{
+ return NULL;
+}
+
u64 __weak
bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
@@ -2903,7 +3016,7 @@ void __weak bpf_jit_compile(struct bpf_prog *prog)
{
}
-bool __weak bpf_helper_changes_pkt_data(void *func)
+bool __weak bpf_helper_changes_pkt_data(enum bpf_func_id func_id)
{
return false;
}
@@ -2921,12 +3034,28 @@ bool __weak bpf_jit_needs_zext(void)
return false;
}
+/* Return true if the JIT inlines the call to the helper corresponding to
+ * the imm.
+ *
+ * The verifier will not patch the insn->imm for the call to the helper if
+ * this returns true.
+ */
+bool __weak bpf_jit_inlines_helper_call(s32 imm)
+{
+ return false;
+}
+
/* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */
bool __weak bpf_jit_supports_subprog_tailcalls(void)
{
return false;
}
+bool __weak bpf_jit_supports_percpu_insn(void)
+{
+ return false;
+}
+
bool __weak bpf_jit_supports_kfunc_call(void)
{
return false;
@@ -2942,6 +3071,20 @@ bool __weak bpf_jit_supports_arena(void)
return false;
}
+bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena)
+{
+ return false;
+}
+
+u64 __weak bpf_arch_uaddress_limit(void)
+{
+#if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE)
+ return TASK_SIZE;
+#else
+ return 0;
+#endif
+}
+
/* Return TRUE if the JIT backend satisfies the following two conditions:
* 1) JIT backend supports atomic_xchg() on pointer-sized words.
* 2) Under the specific arch, the implementation of xchg() is the same
@@ -2982,10 +3125,41 @@ bool __weak bpf_jit_supports_exceptions(void)
return false;
}
+bool __weak bpf_jit_supports_private_stack(void)
+{
+ return false;
+}
+
void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
{
}
+bool __weak bpf_jit_supports_timed_may_goto(void)
+{
+ return false;
+}
+
+u64 __weak arch_bpf_timed_may_goto(void)
+{
+ return 0;
+}
+
+u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p)
+{
+ u64 time = ktime_get_mono_fast_ns();
+
+ /* Populate the timestamp for this stack frame, and refresh count. */
+ if (!p->timestamp) {
+ p->timestamp = time;
+ return BPF_MAX_TIMED_LOOPS;
+ }
+ /* Check if we've exhausted our time slice, and zero count. */
+ if (time - p->timestamp >= (NSEC_PER_SEC / 4))
+ return 0;
+ /* Refresh the count for the stack frame. */
+ return BPF_MAX_TIMED_LOOPS;
+}
+
/* for configs without MMU or 32-bit */
__weak const struct bpf_map_ops arena_map_ops;
__weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index a8e34416e960..67e8a2fc1a99 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -33,8 +33,8 @@
#include <trace/events/xdp.h>
#include <linux/btf_ids.h>
-#include <linux/netdevice.h> /* netif_receive_skb_list */
-#include <linux/etherdevice.h> /* eth_type_trans */
+#include <linux/netdevice.h>
+#include <net/gro.h>
/* General idea: XDP packets getting XDP redirected to another CPU,
* will maximum be stored/queued for one driver ->poll() call. It is
@@ -68,6 +68,7 @@ struct bpf_cpu_map_entry {
struct bpf_cpumap_val value;
struct bpf_prog *prog;
+ struct gro_node gro;
struct completion kthread_running;
struct rcu_work free_work;
@@ -79,8 +80,6 @@ struct bpf_cpu_map {
struct bpf_cpu_map_entry __rcu **cpu_map;
};
-static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list);
-
static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
u32 value_size = attr->value_size;
@@ -135,22 +134,23 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
}
}
-static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
- struct list_head *listp,
- struct xdp_cpumap_stats *stats)
+static u32 cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
+ void **skbs, u32 skb_n,
+ struct xdp_cpumap_stats *stats)
{
- struct sk_buff *skb, *tmp;
struct xdp_buff xdp;
- u32 act;
+ u32 act, pass = 0;
int err;
- list_for_each_entry_safe(skb, tmp, listp, list) {
+ for (u32 i = 0; i < skb_n; i++) {
+ struct sk_buff *skb = skbs[i];
+
act = bpf_prog_run_generic_xdp(skb, &xdp, rcpu->prog);
switch (act) {
case XDP_PASS:
+ skbs[pass++] = skb;
break;
case XDP_REDIRECT:
- skb_list_del_init(skb);
err = xdp_do_generic_redirect(skb->dev, skb, &xdp,
rcpu->prog);
if (unlikely(err)) {
@@ -159,7 +159,7 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
} else {
stats->redirect++;
}
- return;
+ break;
default:
bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
fallthrough;
@@ -167,12 +167,15 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
trace_xdp_exception(skb->dev, rcpu->prog, act);
fallthrough;
case XDP_DROP:
- skb_list_del_init(skb);
- kfree_skb(skb);
+ napi_consume_skb(skb, true);
stats->drop++;
- return;
+ break;
}
}
+
+ stats->pass += pass;
+
+ return pass;
}
static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
@@ -192,7 +195,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
int err;
rxq.dev = xdpf->dev_rx;
- rxq.mem = xdpf->mem;
+ rxq.mem.type = xdpf->mem_type;
/* TODO: report queue_index to xdp_rxq_info */
xdp_convert_frame_to_buff(xdpf, &xdp);
@@ -206,7 +209,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
stats->drop++;
} else {
frames[nframes++] = xdpf;
- stats->pass++;
}
break;
case XDP_REDIRECT:
@@ -230,40 +232,65 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
}
xdp_clear_return_frame_no_direct();
+ stats->pass += nframes;
return nframes;
}
#define CPUMAP_BATCH 8
-static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
- int xdp_n, struct xdp_cpumap_stats *stats,
- struct list_head *list)
+struct cpu_map_ret {
+ u32 xdp_n;
+ u32 skb_n;
+};
+
+static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
+ void **skbs, struct cpu_map_ret *ret,
+ struct xdp_cpumap_stats *stats)
{
- int nframes;
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
if (!rcpu->prog)
- return xdp_n;
+ goto out;
- rcu_read_lock_bh();
+ rcu_read_lock();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
- nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats);
+ ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats);
+ if (unlikely(ret->skb_n))
+ ret->skb_n = cpu_map_bpf_prog_run_skb(rcpu, skbs, ret->skb_n,
+ stats);
if (stats->redirect)
xdp_do_flush();
- if (unlikely(!list_empty(list)))
- cpu_map_bpf_prog_run_skb(rcpu, list, stats);
+ bpf_net_ctx_clear(bpf_net_ctx);
+ rcu_read_unlock();
- rcu_read_unlock_bh(); /* resched point, may call do_softirq() */
+out:
+ if (unlikely(ret->skb_n) && ret->xdp_n)
+ memmove(&skbs[ret->xdp_n], skbs, ret->skb_n * sizeof(*skbs));
+}
- return nframes;
+static void cpu_map_gro_flush(struct bpf_cpu_map_entry *rcpu, bool empty)
+{
+ /*
+ * If the ring is not empty, there'll be a new iteration soon, and we
+ * only need to do a full flush if a tick is long (> 1 ms).
+ * If the ring is empty, to not hold GRO packets in the stack for too
+ * long, do a full flush.
+ * This is equivalent to how NAPI decides whether to perform a full
+ * flush.
+ */
+ gro_flush(&rcpu->gro, !empty && HZ >= 1000);
+ gro_normal_list(&rcpu->gro);
}
static int cpu_map_kthread_run(void *data)
{
struct bpf_cpu_map_entry *rcpu = data;
unsigned long last_qs = jiffies;
+ u32 packets = 0;
complete(&rcpu->kthread_running);
set_current_state(TASK_INTERRUPTIBLE);
@@ -276,11 +303,11 @@ static int cpu_map_kthread_run(void *data)
while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
struct xdp_cpumap_stats stats = {}; /* zero stats */
unsigned int kmem_alloc_drops = 0, sched = 0;
- gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
- int i, n, m, nframes, xdp_n;
+ struct cpu_map_ret ret = { };
void *frames[CPUMAP_BATCH];
void *skbs[CPUMAP_BATCH];
- LIST_HEAD(list);
+ u32 i, n, m;
+ bool empty;
/* Release CPU reschedule checks */
if (__ptr_ring_empty(rcpu->queue)) {
@@ -305,7 +332,7 @@ static int cpu_map_kthread_run(void *data)
*/
n = __ptr_ring_consume_batched(rcpu->queue, frames,
CPUMAP_BATCH);
- for (i = 0, xdp_n = 0; i < n; i++) {
+ for (i = 0; i < n; i++) {
void *f = frames[i];
struct page *page;
@@ -313,11 +340,11 @@ static int cpu_map_kthread_run(void *data)
struct sk_buff *skb = f;
__ptr_clear_bit(0, &skb);
- list_add_tail(&skb->list, &list);
+ skbs[ret.skb_n++] = skb;
continue;
}
- frames[xdp_n++] = f;
+ frames[ret.xdp_n++] = f;
page = virt_to_page(f);
/* Bring struct page memory area to curr CPU. Read by
@@ -327,38 +354,51 @@ static int cpu_map_kthread_run(void *data)
prefetchw(page);
}
+ local_bh_disable();
+
/* Support running another XDP prog on this CPU */
- nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list);
- if (nframes) {
- m = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
- gfp, nframes, skbs);
- if (unlikely(m == 0)) {
- for (i = 0; i < nframes; i++)
- skbs[i] = NULL; /* effect: xdp_return_frame */
- kmem_alloc_drops += nframes;
- }
+ cpu_map_bpf_prog_run(rcpu, frames, skbs, &ret, &stats);
+ if (!ret.xdp_n)
+ goto stats;
+
+ m = napi_skb_cache_get_bulk(skbs, ret.xdp_n);
+ if (unlikely(m < ret.xdp_n)) {
+ for (i = m; i < ret.xdp_n; i++)
+ xdp_return_frame(frames[i]);
+
+ if (ret.skb_n)
+ memmove(&skbs[m], &skbs[ret.xdp_n],
+ ret.skb_n * sizeof(*skbs));
+
+ kmem_alloc_drops += ret.xdp_n - m;
+ ret.xdp_n = m;
}
- local_bh_disable();
- for (i = 0; i < nframes; i++) {
+ for (i = 0; i < ret.xdp_n; i++) {
struct xdp_frame *xdpf = frames[i];
- struct sk_buff *skb = skbs[i];
-
- skb = __xdp_build_skb_from_frame(xdpf, skb,
- xdpf->dev_rx);
- if (!skb) {
- xdp_return_frame(xdpf);
- continue;
- }
- list_add_tail(&skb->list, &list);
+ /* Can fail only when !skb -- already handled above */
+ __xdp_build_skb_from_frame(xdpf, skbs[i], xdpf->dev_rx);
}
- netif_receive_skb_list(&list);
- /* Feedback loop via tracepoint */
+stats:
+ /* Feedback loop via tracepoint.
+ * NB: keep before recv to allow measuring enqueue/dequeue latency.
+ */
trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops,
sched, &stats);
+ for (i = 0; i < ret.xdp_n + ret.skb_n; i++)
+ gro_receive_skb(&rcpu->gro, skbs[i]);
+
+ /* Flush either every 64 packets or in case of empty ring */
+ packets += n;
+ empty = __ptr_ring_empty(rcpu->queue);
+ if (packets >= NAPI_POLL_WEIGHT || empty) {
+ cpu_map_gro_flush(rcpu, empty);
+ packets = 0;
+ }
+
local_bh_enable(); /* resched point, may call do_softirq() */
}
__set_current_state(TASK_RUNNING);
@@ -427,6 +467,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
rcpu->cpu = cpu;
rcpu->map_id = map->id;
rcpu->value.qsize = value->qsize;
+ gro_init(&rcpu->gro);
if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd))
goto free_ptr_ring;
@@ -455,6 +496,7 @@ free_prog:
if (rcpu->prog)
bpf_prog_put(rcpu->prog);
free_ptr_ring:
+ gro_cleanup(&rcpu->gro);
ptr_ring_cleanup(rcpu->queue, NULL);
free_queue:
kfree(rcpu->queue);
@@ -484,6 +526,7 @@ static void __cpu_map_entry_free(struct work_struct *work)
if (rcpu->prog)
bpf_prog_put(rcpu->prog);
+ gro_cleanup(&rcpu->gro);
/* The queue should be empty at this point */
__cpu_map_ring_cleanup(rcpu->queue);
ptr_ring_cleanup(rcpu->queue, NULL);
@@ -706,7 +749,6 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
*/
static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
- struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
@@ -723,8 +765,11 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
*/
bq->q[bq->count++] = xdpf;
- if (!bq->flush_node.prev)
+ if (!bq->flush_node.prev) {
+ struct list_head *flush_list = bpf_net_ctx_get_cpu_map_flush_list();
+
list_add(&bq->flush_node, flush_list);
+ }
}
int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
@@ -756,9 +801,8 @@ trace:
return ret;
}
-void __cpu_map_flush(void)
+void __cpu_map_flush(struct list_head *flush_list)
{
- struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
struct xdp_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
@@ -768,24 +812,3 @@ void __cpu_map_flush(void)
wake_up_process(bq->obj->kthread);
}
}
-
-#ifdef CONFIG_DEBUG_NET
-bool cpu_map_check_flush(void)
-{
- if (list_empty(this_cpu_ptr(&cpu_map_flush_list)))
- return false;
- __cpu_map_flush();
- return true;
-}
-#endif
-
-static int __init cpu_map_init(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu));
- return 0;
-}
-
-subsys_initcall(cpu_map_init);
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index dad0fb1c8e87..9876c5fe6c2a 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -45,6 +45,10 @@ __bpf_kfunc_start_defs();
*
* bpf_cpumask_create() allocates memory using the BPF memory allocator, and
* will not block. It may return NULL if no memory is available.
+ *
+ * Return:
+ * * A pointer to a new struct bpf_cpumask instance on success.
+ * * NULL if the BPF memory allocator is out of memory.
*/
__bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void)
{
@@ -71,6 +75,10 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void)
* Acquires a reference to a BPF cpumask. The cpumask returned by this function
* must either be embedded in a map as a kptr, or freed with
* bpf_cpumask_release().
+ *
+ * Return:
+ * * The struct bpf_cpumask pointer passed to the function.
+ *
*/
__bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask)
{
@@ -91,9 +99,7 @@ __bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask)
if (!refcount_dec_and_test(&cpumask->usage))
return;
- migrate_disable();
bpf_mem_cache_free_rcu(&bpf_cpumask_ma, cpumask);
- migrate_enable();
}
__bpf_kfunc void bpf_cpumask_release_dtor(void *cpumask)
@@ -108,6 +114,9 @@ CFI_NOSEAL(bpf_cpumask_release_dtor);
*
* Find the index of the first nonzero bit of the cpumask. A struct bpf_cpumask
* pointer may be safely passed to this function.
+ *
+ * Return:
+ * * The index of the first nonzero bit in the struct cpumask.
*/
__bpf_kfunc u32 bpf_cpumask_first(const struct cpumask *cpumask)
{
@@ -121,6 +130,9 @@ __bpf_kfunc u32 bpf_cpumask_first(const struct cpumask *cpumask)
*
* Find the index of the first unset bit of the cpumask. A struct bpf_cpumask
* pointer may be safely passed to this function.
+ *
+ * Return:
+ * * The index of the first zero bit in the struct cpumask.
*/
__bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask)
{
@@ -135,6 +147,9 @@ __bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask)
*
* Find the index of the first nonzero bit of the AND of two cpumasks.
* struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ *
+ * Return:
+ * * The index of the first bit that is nonzero in both cpumask instances.
*/
__bpf_kfunc u32 bpf_cpumask_first_and(const struct cpumask *src1,
const struct cpumask *src2)
@@ -416,12 +431,47 @@ __bpf_kfunc u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1,
* @cpumask: The cpumask being queried.
*
* Count the number of set bits in the given cpumask.
+ *
+ * Return:
+ * * The number of bits set in the mask.
*/
__bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask)
{
return cpumask_weight(cpumask);
}
+/**
+ * bpf_cpumask_populate() - Populate the CPU mask from the contents of
+ * a BPF memory region.
+ *
+ * @cpumask: The cpumask being populated.
+ * @src: The BPF memory holding the bit pattern.
+ * @src__sz: Length of the BPF memory region in bytes.
+ *
+ * Return:
+ * * 0 if the struct cpumask * instance was populated successfully.
+ * * -EACCES if the memory region is too small to populate the cpumask.
+ * * -EINVAL if the memory region is not aligned to the size of a long
+ * and the architecture does not support efficient unaligned accesses.
+ */
+__bpf_kfunc int bpf_cpumask_populate(struct cpumask *cpumask, void *src, size_t src__sz)
+{
+ unsigned long source = (unsigned long)src;
+
+ /* The memory region must be large enough to populate the entire CPU mask. */
+ if (src__sz < bitmap_size(nr_cpu_ids))
+ return -EACCES;
+
+ /* If avoiding unaligned accesses, the input region must be aligned to the nearest long. */
+ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
+ !IS_ALIGNED(source, sizeof(long)))
+ return -EINVAL;
+
+ bitmap_copy(cpumask_bits(cpumask), src, nr_cpu_ids);
+
+ return 0;
+}
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(cpumask_kfunc_btf_ids)
@@ -450,6 +500,7 @@ BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_populate, KF_RCU)
BTF_KFUNCS_END(cpumask_kfunc_btf_ids)
static const struct btf_kfunc_id_set cpumask_kfunc_set = {
@@ -474,6 +525,7 @@ static int __init cpumask_kfunc_init(void)
ret = bpf_mem_alloc_init(&bpf_cpumask_ma, sizeof(struct bpf_cpumask), false);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &cpumask_kfunc_set);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &cpumask_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &cpumask_kfunc_set);
return ret ?: register_btf_id_dtor_kfuncs(cpumask_dtors,
ARRAY_SIZE(cpumask_dtors),
THIS_MODULE);
diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c
new file mode 100644
index 000000000000..94854cd9c4cc
--- /dev/null
+++ b/kernel/bpf/crypto.c
@@ -0,0 +1,393 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Meta, Inc */
+#include <linux/bpf.h>
+#include <linux/bpf_crypto.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/filter.h>
+#include <linux/scatterlist.h>
+#include <linux/skbuff.h>
+#include <crypto/skcipher.h>
+
+struct bpf_crypto_type_list {
+ const struct bpf_crypto_type *type;
+ struct list_head list;
+};
+
+/* BPF crypto initialization parameters struct */
+/**
+ * struct bpf_crypto_params - BPF crypto initialization parameters structure
+ * @type: The string of crypto operation type.
+ * @reserved: Reserved member, will be reused for more options in future
+ * Values:
+ * 0
+ * @algo: The string of algorithm to initialize.
+ * @key: The cipher key used to init crypto algorithm.
+ * @key_len: The length of cipher key.
+ * @authsize: The length of authentication tag used by algorithm.
+ */
+struct bpf_crypto_params {
+ char type[14];
+ u8 reserved[2];
+ char algo[128];
+ u8 key[256];
+ u32 key_len;
+ u32 authsize;
+};
+
+static LIST_HEAD(bpf_crypto_types);
+static DECLARE_RWSEM(bpf_crypto_types_sem);
+
+/**
+ * struct bpf_crypto_ctx - refcounted BPF crypto context structure
+ * @type: The pointer to bpf crypto type
+ * @tfm: The pointer to instance of crypto API struct.
+ * @siv_len: Size of IV and state storage for cipher
+ * @rcu: The RCU head used to free the crypto context with RCU safety.
+ * @usage: Object reference counter. When the refcount goes to 0, the
+ * memory is released back to the BPF allocator, which provides
+ * RCU safety.
+ */
+struct bpf_crypto_ctx {
+ const struct bpf_crypto_type *type;
+ void *tfm;
+ u32 siv_len;
+ struct rcu_head rcu;
+ refcount_t usage;
+};
+
+int bpf_crypto_register_type(const struct bpf_crypto_type *type)
+{
+ struct bpf_crypto_type_list *node;
+ int err = -EEXIST;
+
+ down_write(&bpf_crypto_types_sem);
+ list_for_each_entry(node, &bpf_crypto_types, list) {
+ if (!strcmp(node->type->name, type->name))
+ goto unlock;
+ }
+
+ node = kmalloc(sizeof(*node), GFP_KERNEL);
+ err = -ENOMEM;
+ if (!node)
+ goto unlock;
+
+ node->type = type;
+ list_add(&node->list, &bpf_crypto_types);
+ err = 0;
+
+unlock:
+ up_write(&bpf_crypto_types_sem);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(bpf_crypto_register_type);
+
+int bpf_crypto_unregister_type(const struct bpf_crypto_type *type)
+{
+ struct bpf_crypto_type_list *node;
+ int err = -ENOENT;
+
+ down_write(&bpf_crypto_types_sem);
+ list_for_each_entry(node, &bpf_crypto_types, list) {
+ if (strcmp(node->type->name, type->name))
+ continue;
+
+ list_del(&node->list);
+ kfree(node);
+ err = 0;
+ break;
+ }
+ up_write(&bpf_crypto_types_sem);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(bpf_crypto_unregister_type);
+
+static const struct bpf_crypto_type *bpf_crypto_get_type(const char *name)
+{
+ const struct bpf_crypto_type *type = ERR_PTR(-ENOENT);
+ struct bpf_crypto_type_list *node;
+
+ down_read(&bpf_crypto_types_sem);
+ list_for_each_entry(node, &bpf_crypto_types, list) {
+ if (strcmp(node->type->name, name))
+ continue;
+
+ if (try_module_get(node->type->owner))
+ type = node->type;
+ break;
+ }
+ up_read(&bpf_crypto_types_sem);
+
+ return type;
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_crypto_ctx_create() - Create a mutable BPF crypto context.
+ *
+ * Allocates a crypto context that can be used, acquired, and released by
+ * a BPF program. The crypto context returned by this function must either
+ * be embedded in a map as a kptr, or freed with bpf_crypto_ctx_release().
+ * As crypto API functions use GFP_KERNEL allocations, this function can
+ * only be used in sleepable BPF programs.
+ *
+ * bpf_crypto_ctx_create() allocates memory for crypto context.
+ * It may return NULL if no memory is available.
+ * @params: pointer to struct bpf_crypto_params which contains all the
+ * details needed to initialise crypto context.
+ * @params__sz: size of steuct bpf_crypto_params usef by bpf program
+ * @err: integer to store error code when NULL is returned.
+ */
+__bpf_kfunc struct bpf_crypto_ctx *
+bpf_crypto_ctx_create(const struct bpf_crypto_params *params, u32 params__sz,
+ int *err)
+{
+ const struct bpf_crypto_type *type;
+ struct bpf_crypto_ctx *ctx;
+
+ if (!params || params->reserved[0] || params->reserved[1] ||
+ params__sz != sizeof(struct bpf_crypto_params)) {
+ *err = -EINVAL;
+ return NULL;
+ }
+
+ type = bpf_crypto_get_type(params->type);
+ if (IS_ERR(type)) {
+ *err = PTR_ERR(type);
+ return NULL;
+ }
+
+ if (!type->has_algo(params->algo)) {
+ *err = -EOPNOTSUPP;
+ goto err_module_put;
+ }
+
+ if (!!params->authsize ^ !!type->setauthsize) {
+ *err = -EOPNOTSUPP;
+ goto err_module_put;
+ }
+
+ if (!params->key_len || params->key_len > sizeof(params->key)) {
+ *err = -EINVAL;
+ goto err_module_put;
+ }
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx) {
+ *err = -ENOMEM;
+ goto err_module_put;
+ }
+
+ ctx->type = type;
+ ctx->tfm = type->alloc_tfm(params->algo);
+ if (IS_ERR(ctx->tfm)) {
+ *err = PTR_ERR(ctx->tfm);
+ goto err_free_ctx;
+ }
+
+ if (params->authsize) {
+ *err = type->setauthsize(ctx->tfm, params->authsize);
+ if (*err)
+ goto err_free_tfm;
+ }
+
+ *err = type->setkey(ctx->tfm, params->key, params->key_len);
+ if (*err)
+ goto err_free_tfm;
+
+ if (type->get_flags(ctx->tfm) & CRYPTO_TFM_NEED_KEY) {
+ *err = -EINVAL;
+ goto err_free_tfm;
+ }
+
+ ctx->siv_len = type->ivsize(ctx->tfm) + type->statesize(ctx->tfm);
+
+ refcount_set(&ctx->usage, 1);
+
+ return ctx;
+
+err_free_tfm:
+ type->free_tfm(ctx->tfm);
+err_free_ctx:
+ kfree(ctx);
+err_module_put:
+ module_put(type->owner);
+
+ return NULL;
+}
+
+static void crypto_free_cb(struct rcu_head *head)
+{
+ struct bpf_crypto_ctx *ctx;
+
+ ctx = container_of(head, struct bpf_crypto_ctx, rcu);
+ ctx->type->free_tfm(ctx->tfm);
+ module_put(ctx->type->owner);
+ kfree(ctx);
+}
+
+/**
+ * bpf_crypto_ctx_acquire() - Acquire a reference to a BPF crypto context.
+ * @ctx: The BPF crypto context being acquired. The ctx must be a trusted
+ * pointer.
+ *
+ * Acquires a reference to a BPF crypto context. The context returned by this function
+ * must either be embedded in a map as a kptr, or freed with
+ * bpf_crypto_ctx_release().
+ */
+__bpf_kfunc struct bpf_crypto_ctx *
+bpf_crypto_ctx_acquire(struct bpf_crypto_ctx *ctx)
+{
+ if (!refcount_inc_not_zero(&ctx->usage))
+ return NULL;
+ return ctx;
+}
+
+/**
+ * bpf_crypto_ctx_release() - Release a previously acquired BPF crypto context.
+ * @ctx: The crypto context being released.
+ *
+ * Releases a previously acquired reference to a BPF crypto context. When the final
+ * reference of the BPF crypto context has been released, its memory
+ * will be released.
+ */
+__bpf_kfunc void bpf_crypto_ctx_release(struct bpf_crypto_ctx *ctx)
+{
+ if (refcount_dec_and_test(&ctx->usage))
+ call_rcu(&ctx->rcu, crypto_free_cb);
+}
+
+static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx,
+ const struct bpf_dynptr_kern *src,
+ const struct bpf_dynptr_kern *dst,
+ const struct bpf_dynptr_kern *siv,
+ bool decrypt)
+{
+ u32 src_len, dst_len, siv_len;
+ const u8 *psrc;
+ u8 *pdst, *piv;
+ int err;
+
+ if (__bpf_dynptr_is_rdonly(dst))
+ return -EINVAL;
+
+ siv_len = siv ? __bpf_dynptr_size(siv) : 0;
+ src_len = __bpf_dynptr_size(src);
+ dst_len = __bpf_dynptr_size(dst);
+ if (!src_len || !dst_len)
+ return -EINVAL;
+
+ if (siv_len != ctx->siv_len)
+ return -EINVAL;
+
+ psrc = __bpf_dynptr_data(src, src_len);
+ if (!psrc)
+ return -EINVAL;
+ pdst = __bpf_dynptr_data_rw(dst, dst_len);
+ if (!pdst)
+ return -EINVAL;
+
+ piv = siv_len ? __bpf_dynptr_data_rw(siv, siv_len) : NULL;
+ if (siv_len && !piv)
+ return -EINVAL;
+
+ err = decrypt ? ctx->type->decrypt(ctx->tfm, psrc, pdst, src_len, piv)
+ : ctx->type->encrypt(ctx->tfm, psrc, pdst, src_len, piv);
+
+ return err;
+}
+
+/**
+ * bpf_crypto_decrypt() - Decrypt buffer using configured context and IV provided.
+ * @ctx: The crypto context being used. The ctx must be a trusted pointer.
+ * @src: bpf_dynptr to the encrypted data. Must be a trusted pointer.
+ * @dst: bpf_dynptr to the buffer where to store the result. Must be a trusted pointer.
+ * @siv__nullable: bpf_dynptr to IV data and state data to be used by decryptor. May be NULL.
+ *
+ * Decrypts provided buffer using IV data and the crypto context. Crypto context must be configured.
+ */
+__bpf_kfunc int bpf_crypto_decrypt(struct bpf_crypto_ctx *ctx,
+ const struct bpf_dynptr *src,
+ const struct bpf_dynptr *dst,
+ const struct bpf_dynptr *siv__nullable)
+{
+ const struct bpf_dynptr_kern *src_kern = (struct bpf_dynptr_kern *)src;
+ const struct bpf_dynptr_kern *dst_kern = (struct bpf_dynptr_kern *)dst;
+ const struct bpf_dynptr_kern *siv_kern = (struct bpf_dynptr_kern *)siv__nullable;
+
+ return bpf_crypto_crypt(ctx, src_kern, dst_kern, siv_kern, true);
+}
+
+/**
+ * bpf_crypto_encrypt() - Encrypt buffer using configured context and IV provided.
+ * @ctx: The crypto context being used. The ctx must be a trusted pointer.
+ * @src: bpf_dynptr to the plain data. Must be a trusted pointer.
+ * @dst: bpf_dynptr to the buffer where to store the result. Must be a trusted pointer.
+ * @siv__nullable: bpf_dynptr to IV data and state data to be used by decryptor. May be NULL.
+ *
+ * Encrypts provided buffer using IV data and the crypto context. Crypto context must be configured.
+ */
+__bpf_kfunc int bpf_crypto_encrypt(struct bpf_crypto_ctx *ctx,
+ const struct bpf_dynptr *src,
+ const struct bpf_dynptr *dst,
+ const struct bpf_dynptr *siv__nullable)
+{
+ const struct bpf_dynptr_kern *src_kern = (struct bpf_dynptr_kern *)src;
+ const struct bpf_dynptr_kern *dst_kern = (struct bpf_dynptr_kern *)dst;
+ const struct bpf_dynptr_kern *siv_kern = (struct bpf_dynptr_kern *)siv__nullable;
+
+ return bpf_crypto_crypt(ctx, src_kern, dst_kern, siv_kern, false);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(crypt_init_kfunc_btf_ids)
+BTF_ID_FLAGS(func, bpf_crypto_ctx_create, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_crypto_ctx_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_crypto_ctx_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+BTF_KFUNCS_END(crypt_init_kfunc_btf_ids)
+
+static const struct btf_kfunc_id_set crypt_init_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &crypt_init_kfunc_btf_ids,
+};
+
+BTF_KFUNCS_START(crypt_kfunc_btf_ids)
+BTF_ID_FLAGS(func, bpf_crypto_decrypt, KF_RCU)
+BTF_ID_FLAGS(func, bpf_crypto_encrypt, KF_RCU)
+BTF_KFUNCS_END(crypt_kfunc_btf_ids)
+
+static const struct btf_kfunc_id_set crypt_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &crypt_kfunc_btf_ids,
+};
+
+BTF_ID_LIST(bpf_crypto_dtor_ids)
+BTF_ID(struct, bpf_crypto_ctx)
+BTF_ID(func, bpf_crypto_ctx_release)
+
+static int __init crypto_kfunc_init(void)
+{
+ int ret;
+ const struct btf_id_dtor_kfunc bpf_crypto_dtors[] = {
+ {
+ .btf_id = bpf_crypto_dtor_ids[0],
+ .kfunc_btf_id = bpf_crypto_dtor_ids[1]
+ },
+ };
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &crypt_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &crypt_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &crypt_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
+ &crypt_init_kfunc_set);
+ return ret ?: register_btf_id_dtor_kfuncs(bpf_crypto_dtors,
+ ARRAY_SIZE(bpf_crypto_dtors),
+ THIS_MODULE);
+}
+
+late_initcall(crypto_kfunc_init);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 4e2cdbb5629f..482d284a1553 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -83,7 +83,6 @@ struct bpf_dtab {
u32 n_buckets;
};
-static DEFINE_PER_CPU(struct list_head, dev_flush_list);
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);
@@ -107,7 +106,7 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)];
}
-static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
+static int dev_map_alloc_check(union bpf_attr *attr)
{
u32 valsize = attr->value_size;
@@ -121,23 +120,28 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
attr->map_flags & ~DEV_CREATE_FLAG_MASK)
return -EINVAL;
+ if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+ /* Hash table size must be power of 2; roundup_pow_of_two()
+ * can overflow into UB on 32-bit arches
+ */
+ if (attr->max_entries > 1UL << 31)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
+{
/* Lookup returns a pointer straight to dev->ifindex, so make sure the
* verifier prevents writes from the BPF side
*/
attr->map_flags |= BPF_F_RDONLY_PROG;
-
-
bpf_map_init_from_attr(&dtab->map, attr);
if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
- /* hash table size must be power of 2; roundup_pow_of_two() can
- * overflow into UB on 32-bit arches, so check that first
- */
- if (dtab->map.max_entries > 1UL << 31)
- return -EINVAL;
-
+ /* Hash table size must be power of 2 */
dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
-
dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets,
dtab->map.numa_node);
if (!dtab->dev_index_head)
@@ -180,7 +184,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
static void dev_map_free(struct bpf_map *map)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- int i;
+ u32 i;
/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
* so the programs (can be more than one that used this map) were
@@ -196,7 +200,14 @@ static void dev_map_free(struct bpf_map *map)
list_del_rcu(&dtab->list);
spin_unlock(&dev_map_lock);
- bpf_clear_redirect_map(map);
+ /* bpf_redirect_info->map is assigned in __bpf_xdp_redirect_map()
+ * during NAPI callback and cleared after the XDP redirect. There is no
+ * explicit RCU read section which protects bpf_redirect_info->map but
+ * local_bh_disable() also marks the beginning an RCU section. This
+ * makes the complete softirq callback RCU protected. Thus after
+ * following synchronize_rcu() there no bpf_redirect_info->map == map
+ * assignment.
+ */
synchronize_rcu();
/* Make sure prior __dev_map_entry_free() have completed. */
@@ -322,9 +333,11 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
struct xdp_frame **frames, int n,
- struct net_device *dev)
+ struct net_device *tx_dev,
+ struct net_device *rx_dev)
{
- struct xdp_txq_info txq = { .dev = dev };
+ struct xdp_txq_info txq = { .dev = tx_dev };
+ struct xdp_rxq_info rxq = { .dev = rx_dev };
struct xdp_buff xdp;
int i, nframes = 0;
@@ -335,6 +348,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
xdp_convert_frame_to_buff(xdpf, &xdp);
xdp.txq = &txq;
+ xdp.rxq = &rxq;
act = bpf_prog_run_xdp(xdp_prog, &xdp);
switch (act) {
@@ -349,7 +363,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
fallthrough;
case XDP_ABORTED:
- trace_xdp_exception(dev, xdp_prog, act);
+ trace_xdp_exception(tx_dev, xdp_prog, act);
fallthrough;
case XDP_DROP:
xdp_return_frame_rx_napi(xdpf);
@@ -377,7 +391,7 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
}
if (bq->xdp_prog) {
- to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev);
+ to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx);
if (!to_send)
goto out;
}
@@ -406,9 +420,8 @@ out:
* driver before returning from its napi->poll() routine. See the comment above
* xdp_do_flush() in filter.c.
*/
-void __dev_flush(void)
+void __dev_flush(struct list_head *flush_list)
{
- struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct xdp_dev_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
@@ -419,16 +432,6 @@ void __dev_flush(void)
}
}
-#ifdef CONFIG_DEBUG_NET
-bool dev_check_flush(void)
-{
- if (list_empty(this_cpu_ptr(&dev_flush_list)))
- return false;
- __dev_flush();
- return true;
-}
-#endif
-
/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
* by local_bh_disable() (from XDP calls inside NAPI). The
* rcu_read_lock_bh_held() below makes lockdep accept both.
@@ -453,7 +456,6 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx, struct bpf_prog *xdp_prog)
{
- struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
@@ -467,6 +469,8 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
* are only ever modified together.
*/
if (!bq->dev_rx) {
+ struct list_head *flush_list = bpf_net_ctx_get_dev_flush_list();
+
bq->dev_rx = dev_rx;
bq->xdp_prog = xdp_prog;
list_add(&bq->flush_node, flush_list);
@@ -674,7 +678,7 @@ int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
}
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
int err;
@@ -697,7 +701,7 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
struct sk_buff *skb,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
struct sk_buff *nskb;
int err;
@@ -716,8 +720,8 @@ static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
}
int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
- struct bpf_prog *xdp_prog, struct bpf_map *map,
- bool exclude_ingress)
+ const struct bpf_prog *xdp_prog,
+ struct bpf_map *map, bool exclude_ingress)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dst, *last_dst = NULL;
@@ -760,9 +764,6 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
for (i = 0; i < dtab->n_buckets; i++) {
head = dev_map_index_hash(dtab, i);
hlist_for_each_entry_safe(dst, next, head, index_hlist) {
- if (!dst)
- continue;
-
if (is_ifindex_excluded(excluded_devices, num_excluded,
dst->dev->ifindex))
continue;
@@ -820,7 +821,7 @@ static long dev_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *old_dev;
- int k = *(u32 *)key;
+ u32 k = *(u32 *)key;
if (k >= map->max_entries)
return -EINVAL;
@@ -837,7 +838,7 @@ static long dev_map_hash_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *old_dev;
- int k = *(u32 *)key;
+ u32 k = *(u32 *)key;
unsigned long flags;
int ret = -ENOENT;
@@ -1043,6 +1044,7 @@ static u64 dev_map_mem_usage(const struct bpf_map *map)
BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab)
const struct bpf_map_ops dev_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = dev_map_alloc_check,
.map_alloc = dev_map_alloc,
.map_free = dev_map_free,
.map_get_next_key = dev_map_get_next_key,
@@ -1057,6 +1059,7 @@ const struct bpf_map_ops dev_map_ops = {
const struct bpf_map_ops dev_map_hash_ops = {
.map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = dev_map_alloc_check,
.map_alloc = dev_map_alloc,
.map_free = dev_map_free,
.map_get_next_key = dev_map_hash_get_next_key,
@@ -1156,15 +1159,11 @@ static struct notifier_block dev_map_notifier = {
static int __init dev_map_init(void)
{
- int cpu;
-
/* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
offsetof(struct _bpf_dtab_netdev, dev));
register_netdevice_notifier(&dev_map_notifier);
- for_each_possible_cpu(cpu)
- INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu));
return 0;
}
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index bd2e2dd04740..20883c6b1546 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -172,6 +172,17 @@ static bool is_addr_space_cast(const struct bpf_insn *insn)
insn->off == BPF_ADDR_SPACE_CAST;
}
+/* Special (internal-only) form of mov, used to resolve per-CPU addrs:
+ * dst_reg = src_reg + <percpu_base_off>
+ * BPF_ADDR_PERCPU is used as a special insn->off value.
+ */
+#define BPF_ADDR_PERCPU (-1)
+
+static inline bool is_mov_percpu_addr(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_PERCPU;
+}
+
void print_bpf_insn(const struct bpf_insn_cbs *cbs,
const struct bpf_insn *insn,
bool allow_ptr_leaks)
@@ -191,9 +202,12 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
insn->dst_reg, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg);
} else if (is_addr_space_cast(insn)) {
- verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %d, %d)\n",
+ verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %u, %u)\n",
insn->code, insn->dst_reg,
insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm);
+ } else if (is_mov_percpu_addr(insn)) {
+ verbose(cbs->private_data, "(%02x) r%d = &(void __percpu *)(r%d)\n",
+ insn->code, insn->dst_reg, insn->src_reg);
} else if (BPF_SRC(insn->code) == BPF_X) {
verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n",
insn->code, class == BPF_ALU ? 'w' : 'r',
@@ -253,6 +267,18 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
BPF_SIZE(insn->code) == BPF_DW ? "64" : "",
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->dst_reg, insn->off, insn->src_reg);
+ } else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_LOAD_ACQ) {
+ verbose(cbs->private_data, "(%02x) r%d = load_acquire((%s *)(r%d %+d))\n",
+ insn->code, insn->dst_reg,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->off);
+ } else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_STORE_REL) {
+ verbose(cbs->private_data, "(%02x) store_release((%s *)(r%d %+d), r%d)\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off, insn->src_reg);
} else {
verbose(cbs->private_data, "BUG_%02x\n", insn->code);
}
@@ -355,7 +381,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
insn->code, class == BPF_JMP32 ? 'w' : 'r',
insn->dst_reg,
bpf_jmp_string[BPF_OP(insn->code) >> 4],
- insn->imm, insn->off);
+ (u32)insn->imm, insn->off);
}
} else {
verbose(cbs->private_data, "(%02x) %s\n",
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
index 70fb82bf1637..b77db7413f8c 100644
--- a/kernel/bpf/dispatcher.c
+++ b/kernel/bpf/dispatcher.c
@@ -154,7 +154,8 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
d->image = NULL;
goto out;
}
- bpf_image_ksym_add(d->image, PAGE_SIZE, &d->ksym);
+ bpf_image_ksym_init(d->image, PAGE_SIZE, &d->ksym);
+ bpf_image_ksym_add(&d->ksym);
}
prev_num_progs = d->num_progs;
diff --git a/kernel/bpf/dmabuf_iter.c b/kernel/bpf/dmabuf_iter.c
new file mode 100644
index 000000000000..4dd7ef7c145c
--- /dev/null
+++ b/kernel/bpf/dmabuf_iter.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Google LLC */
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/dma-buf.h>
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+
+static void *dmabuf_iter_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ if (*pos)
+ return NULL;
+
+ return dma_buf_iter_begin();
+}
+
+static void *dmabuf_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct dma_buf *dmabuf = v;
+
+ ++*pos;
+
+ return dma_buf_iter_next(dmabuf);
+}
+
+struct bpf_iter__dmabuf {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct dma_buf *, dmabuf);
+};
+
+static int __dmabuf_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+ struct bpf_iter_meta meta = {
+ .seq = seq,
+ };
+ struct bpf_iter__dmabuf ctx = {
+ .meta = &meta,
+ .dmabuf = v,
+ };
+ struct bpf_prog *prog = bpf_iter_get_info(&meta, in_stop);
+
+ if (prog)
+ return bpf_iter_run_prog(prog, &ctx);
+
+ return 0;
+}
+
+static int dmabuf_iter_seq_show(struct seq_file *seq, void *v)
+{
+ return __dmabuf_seq_show(seq, v, false);
+}
+
+static void dmabuf_iter_seq_stop(struct seq_file *seq, void *v)
+{
+ struct dma_buf *dmabuf = v;
+
+ if (dmabuf)
+ dma_buf_put(dmabuf);
+}
+
+static const struct seq_operations dmabuf_iter_seq_ops = {
+ .start = dmabuf_iter_seq_start,
+ .next = dmabuf_iter_seq_next,
+ .stop = dmabuf_iter_seq_stop,
+ .show = dmabuf_iter_seq_show,
+};
+
+static void bpf_iter_dmabuf_show_fdinfo(const struct bpf_iter_aux_info *aux,
+ struct seq_file *seq)
+{
+ seq_puts(seq, "dmabuf iter\n");
+}
+
+static const struct bpf_iter_seq_info dmabuf_iter_seq_info = {
+ .seq_ops = &dmabuf_iter_seq_ops,
+ .init_seq_private = NULL,
+ .fini_seq_private = NULL,
+ .seq_priv_size = 0,
+};
+
+static struct bpf_iter_reg bpf_dmabuf_reg_info = {
+ .target = "dmabuf",
+ .feature = BPF_ITER_RESCHED,
+ .show_fdinfo = bpf_iter_dmabuf_show_fdinfo,
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__dmabuf, dmabuf),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &dmabuf_iter_seq_info,
+};
+
+DEFINE_BPF_ITER_FUNC(dmabuf, struct bpf_iter_meta *meta, struct dma_buf *dmabuf)
+BTF_ID_LIST_SINGLE(bpf_dmabuf_btf_id, struct, dma_buf)
+
+static int __init dmabuf_iter_init(void)
+{
+ bpf_dmabuf_reg_info.ctx_arg_info[0].btf_id = bpf_dmabuf_btf_id[0];
+ return bpf_iter_reg_target(&bpf_dmabuf_reg_info);
+}
+
+late_initcall(dmabuf_iter_init);
+
+struct bpf_iter_dmabuf {
+ /*
+ * opaque iterator state; having __u64 here allows to preserve correct
+ * alignment requirements in vmlinux.h, generated from BTF
+ */
+ __u64 __opaque[1];
+} __aligned(8);
+
+/* Non-opaque version of bpf_iter_dmabuf */
+struct bpf_iter_dmabuf_kern {
+ struct dma_buf *dmabuf;
+} __aligned(8);
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_dmabuf_new(struct bpf_iter_dmabuf *it)
+{
+ struct bpf_iter_dmabuf_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(*kit) > sizeof(*it));
+ BUILD_BUG_ON(__alignof__(*kit) != __alignof__(*it));
+
+ kit->dmabuf = NULL;
+ return 0;
+}
+
+__bpf_kfunc struct dma_buf *bpf_iter_dmabuf_next(struct bpf_iter_dmabuf *it)
+{
+ struct bpf_iter_dmabuf_kern *kit = (void *)it;
+
+ if (kit->dmabuf)
+ kit->dmabuf = dma_buf_iter_next(kit->dmabuf);
+ else
+ kit->dmabuf = dma_buf_iter_begin();
+
+ return kit->dmabuf;
+}
+
+__bpf_kfunc void bpf_iter_dmabuf_destroy(struct bpf_iter_dmabuf *it)
+{
+ struct bpf_iter_dmabuf_kern *kit = (void *)it;
+
+ if (kit->dmabuf)
+ dma_buf_put(kit->dmabuf);
+}
+
+__bpf_kfunc_end_defs();
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3a088a5349bc..71f9931ac64c 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -16,6 +16,7 @@
#include "bpf_lru_list.h"
#include "map_in_map.h"
#include <linux/bpf_mem_alloc.h>
+#include <asm/rqspinlock.h>
#define HTAB_CREATE_FLAG_MASK \
(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
@@ -78,7 +79,7 @@
*/
struct bucket {
struct hlist_nulls_head head;
- raw_spinlock_t raw_lock;
+ rqspinlock_t raw_lock;
};
#define HASHTAB_MAP_LOCK_COUNT 8
@@ -104,8 +105,6 @@ struct bpf_htab {
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
u32 hashrnd;
- struct lock_class_key lockdep_key;
- int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
};
/* each htab element is struct htab_elem + key + value */
@@ -140,45 +139,26 @@ static void htab_init_buckets(struct bpf_htab *htab)
for (i = 0; i < htab->n_buckets; i++) {
INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
- raw_spin_lock_init(&htab->buckets[i].raw_lock);
- lockdep_set_class(&htab->buckets[i].raw_lock,
- &htab->lockdep_key);
+ raw_res_spin_lock_init(&htab->buckets[i].raw_lock);
cond_resched();
}
}
-static inline int htab_lock_bucket(const struct bpf_htab *htab,
- struct bucket *b, u32 hash,
- unsigned long *pflags)
+static inline int htab_lock_bucket(struct bucket *b, unsigned long *pflags)
{
unsigned long flags;
+ int ret;
- hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);
-
- preempt_disable();
- local_irq_save(flags);
- if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
- __this_cpu_dec(*(htab->map_locked[hash]));
- local_irq_restore(flags);
- preempt_enable();
- return -EBUSY;
- }
-
- raw_spin_lock(&b->raw_lock);
+ ret = raw_res_spin_lock_irqsave(&b->raw_lock, flags);
+ if (ret)
+ return ret;
*pflags = flags;
-
return 0;
}
-static inline void htab_unlock_bucket(const struct bpf_htab *htab,
- struct bucket *b, u32 hash,
- unsigned long flags)
+static inline void htab_unlock_bucket(struct bucket *b, unsigned long flags)
{
- hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);
- raw_spin_unlock(&b->raw_lock);
- __this_cpu_dec(*(htab->map_locked[hash]));
- local_irq_restore(flags);
- preempt_enable();
+ raw_res_spin_unlock_irqrestore(&b->raw_lock, flags);
}
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
@@ -195,20 +175,30 @@ static bool htab_is_percpu(const struct bpf_htab *htab)
htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}
+static inline bool is_fd_htab(const struct bpf_htab *htab)
+{
+ return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS;
+}
+
+static inline void *htab_elem_value(struct htab_elem *l, u32 key_size)
+{
+ return l->key + round_up(key_size, 8);
+}
+
static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
void __percpu *pptr)
{
- *(void __percpu **)(l->key + key_size) = pptr;
+ *(void __percpu **)htab_elem_value(l, key_size) = pptr;
}
static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
{
- return *(void __percpu **)(l->key + key_size);
+ return *(void __percpu **)htab_elem_value(l, key_size);
}
static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
{
- return *(void **)(l->key + roundup(map->key_size, 8));
+ return *(void **)htab_elem_value(l, map->key_size);
}
static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
@@ -216,18 +206,20 @@ static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
}
+/* Both percpu and fd htab support in-place update, so no need for
+ * extra elem. LRU itself can remove the least used element, so
+ * there is no need for an extra elem during map_update.
+ */
static bool htab_has_extra_elems(struct bpf_htab *htab)
{
- return !htab_is_percpu(htab) && !htab_is_lru(htab);
+ return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab);
}
-static void htab_free_prealloced_timers(struct bpf_htab *htab)
+static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab)
{
u32 num_entries = htab->map.max_entries;
int i;
- if (!btf_record_has_field(htab->map.record, BPF_TIMER))
- return;
if (htab_has_extra_elems(htab))
num_entries += num_possible_cpus();
@@ -235,7 +227,12 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab)
struct htab_elem *elem;
elem = get_htab_elem(htab, i);
- bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
+ if (btf_record_has_field(htab->map.record, BPF_TIMER))
+ bpf_obj_free_timer(htab->map.record,
+ htab_elem_value(elem, htab->map.key_size));
+ if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
+ bpf_obj_free_workqueue(htab->map.record,
+ htab_elem_value(elem, htab->map.key_size));
cond_resched();
}
}
@@ -262,7 +259,8 @@ static void htab_free_prealloced_fields(struct bpf_htab *htab)
cond_resched();
}
} else {
- bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
+ bpf_obj_free_fields(htab->map.record,
+ htab_elem_value(elem, htab->map.key_size));
cond_resched();
}
cond_resched();
@@ -459,6 +457,9 @@ static int htab_map_alloc_check(union bpf_attr *attr)
* kmalloc-able later in htab_map_update_elem()
*/
return -E2BIG;
+ /* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */
+ if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE)
+ return -E2BIG;
return 0;
}
@@ -467,8 +468,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
- bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
- attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
/* percpu_lru means each cpu has its own LRU list.
* it is different from BPF_MAP_TYPE_PERCPU_HASH where
* the map's value itself is percpu. percpu_lru has
@@ -477,14 +476,12 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
struct bpf_htab *htab;
- int err, i;
+ int err;
htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
if (!htab)
return ERR_PTR(-ENOMEM);
- lockdep_register_key(&htab->lockdep_key);
-
bpf_map_init_from_attr(&htab->map, attr);
if (percpu_lru) {
@@ -530,15 +527,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (!htab->buckets)
goto free_elem_count;
- for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) {
- htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map,
- sizeof(int),
- sizeof(int),
- GFP_USER);
- if (!htab->map_locked[i])
- goto free_map_locked;
- }
-
if (htab->map.map_flags & BPF_F_ZERO_SEED)
htab->hashrnd = 0;
else
@@ -574,10 +562,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (err)
goto free_map_locked;
- if (!percpu && !lru) {
- /* lru itself can remove the least used element, so
- * there is no need for an extra elem during map_update.
- */
+ if (htab_has_extra_elems(htab)) {
err = alloc_extra_elems(htab);
if (err)
goto free_prealloc;
@@ -601,15 +586,12 @@ free_prealloc:
free_map_locked:
if (htab->use_percpu_counter)
percpu_counter_destroy(&htab->pcount);
- for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
- free_percpu(htab->map_locked[i]);
bpf_map_area_free(htab->buckets);
bpf_mem_alloc_destroy(&htab->pcpu_ma);
bpf_mem_alloc_destroy(&htab->ma);
free_elem_count:
bpf_map_free_elem_count(&htab->map);
free_htab:
- lockdep_unregister_key(&htab->lockdep_key);
bpf_map_area_free(htab);
return ERR_PTR(err);
}
@@ -698,7 +680,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
struct htab_elem *l = __htab_map_lookup_elem(map, key);
if (l)
- return l->key + round_up(map->key_size, 8);
+ return htab_elem_value(l, map->key_size);
return NULL;
}
@@ -737,7 +719,7 @@ static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
if (l) {
if (mark)
bpf_lru_node_set_ref(&l->lru_node);
- return l->key + round_up(map->key_size, 8);
+ return htab_elem_value(l, map->key_size);
}
return NULL;
@@ -781,6 +763,9 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map,
static void check_and_free_fields(struct bpf_htab *htab,
struct htab_elem *elem)
{
+ if (IS_ERR_OR_NULL(htab->map.record))
+ return;
+
if (htab_is_percpu(htab)) {
void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
int cpu;
@@ -788,7 +773,7 @@ static void check_and_free_fields(struct bpf_htab *htab,
for_each_possible_cpu(cpu)
bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
} else {
- void *map_value = elem->key + round_up(htab->map.key_size, 8);
+ void *map_value = htab_elem_value(elem, htab->map.key_size);
bpf_obj_free_fields(htab->map.record, map_value);
}
@@ -811,20 +796,21 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
b = __select_bucket(htab, tgt_l->hash);
head = &b->head;
- ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags);
+ ret = htab_lock_bucket(b, &flags);
if (ret)
return false;
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
if (l == tgt_l) {
hlist_nulls_del_rcu(&l->hash_node);
- check_and_free_fields(htab, l);
bpf_map_dec_elem_count(&htab->map);
break;
}
- htab_unlock_bucket(htab, b, tgt_l->hash, flags);
+ htab_unlock_bucket(b, flags);
+ if (l == tgt_l)
+ check_and_free_fields(htab, l);
return l == tgt_l;
}
@@ -890,6 +876,7 @@ find_first_elem:
static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
check_and_free_fields(htab, l);
+
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
bpf_mem_cache_free(&htab->ma, l);
@@ -942,7 +929,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
if (htab_is_prealloc(htab)) {
bpf_map_dec_elem_count(&htab->map);
check_and_free_fields(htab, l);
- __pcpu_freelist_push(&htab->freelist, &l->fnode);
+ pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
dec_elem_count(htab);
htab_elem_free(htab, l);
@@ -991,8 +978,7 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
{
- return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
- BITS_PER_LONG == 64;
+ return is_fd_htab(htab) && BITS_PER_LONG == 64;
}
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
@@ -1012,7 +998,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
*/
pl_new = this_cpu_ptr(htab->extra_elems);
l_new = *pl_new;
- htab_put_fd_value(htab, old_elem);
*pl_new = old_elem;
} else {
struct pcpu_freelist_node *l;
@@ -1046,14 +1031,15 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
pptr = htab_elem_get_ptr(l_new, key_size);
} else {
/* alloc_percpu zero-fills */
- pptr = bpf_mem_cache_alloc(&htab->pcpu_ma);
- if (!pptr) {
+ void *ptr = bpf_mem_cache_alloc(&htab->pcpu_ma);
+
+ if (!ptr) {
bpf_mem_cache_free(&htab->ma, l_new);
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
}
- l_new->ptr_to_pptr = pptr;
- pptr = *(void **)pptr;
+ l_new->ptr_to_pptr = ptr;
+ pptr = *(void __percpu **)ptr;
}
pcpu_init_value(htab, pptr, value, onallcpus);
@@ -1062,11 +1048,9 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
htab_elem_set_ptr(l_new, key_size, pptr);
} else if (fd_htab_map_needs_adjust(htab)) {
size = round_up(size, 8);
- memcpy(l_new->key + round_up(key_size, 8), value, size);
+ memcpy(htab_elem_value(l_new, key_size), value, size);
} else {
- copy_map_value(&htab->map,
- l_new->key + round_up(key_size, 8),
- value);
+ copy_map_value(&htab->map, htab_elem_value(l_new, key_size), value);
}
l_new->hash = hash;
@@ -1095,7 +1079,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct htab_elem *l_new = NULL, *l_old;
+ struct htab_elem *l_new, *l_old;
struct hlist_nulls_head *head;
unsigned long flags;
struct bucket *b;
@@ -1128,7 +1112,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
if (l_old) {
/* grab the element lock and update value in place */
copy_map_value_locked(map,
- l_old->key + round_up(key_size, 8),
+ htab_elem_value(l_old, key_size),
value, false);
return 0;
}
@@ -1138,7 +1122,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
*/
}
- ret = htab_lock_bucket(htab, b, hash, &flags);
+ ret = htab_lock_bucket(b, &flags);
if (ret)
return ret;
@@ -1156,7 +1140,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
* and update element in place
*/
copy_map_value_locked(map,
- l_old->key + round_up(key_size, 8),
+ htab_elem_value(l_old, key_size),
value, false);
ret = 0;
goto err;
@@ -1176,14 +1160,19 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
hlist_nulls_add_head_rcu(&l_new->hash_node, head);
if (l_old) {
hlist_nulls_del_rcu(&l_old->hash_node);
- if (!htab_is_prealloc(htab))
- free_htab_elem(htab, l_old);
- else
+
+ /* l_old has already been stashed in htab->extra_elems, free
+ * its special fields before it is available for reuse.
+ */
+ if (htab_is_prealloc(htab))
check_and_free_fields(htab, l_old);
}
- ret = 0;
+ htab_unlock_bucket(b, flags);
+ if (l_old && !htab_is_prealloc(htab))
+ free_htab_elem(htab, l_old);
+ return 0;
err:
- htab_unlock_bucket(htab, b, hash, flags);
+ htab_unlock_bucket(b, flags);
return ret;
}
@@ -1227,10 +1216,9 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
l_new = prealloc_lru_pop(htab, key, hash);
if (!l_new)
return -ENOMEM;
- copy_map_value(&htab->map,
- l_new->key + round_up(map->key_size, 8), value);
+ copy_map_value(&htab->map, htab_elem_value(l_new, map->key_size), value);
- ret = htab_lock_bucket(htab, b, hash, &flags);
+ ret = htab_lock_bucket(b, &flags);
if (ret)
goto err_lock_bucket;
@@ -1251,7 +1239,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
ret = 0;
err:
- htab_unlock_bucket(htab, b, hash, flags);
+ htab_unlock_bucket(b, flags);
err_lock_bucket:
if (ret)
@@ -1262,13 +1250,14 @@ err_lock_bucket:
return ret;
}
-static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
void *value, u64 map_flags,
- bool onallcpus)
+ bool percpu, bool onallcpus)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct htab_elem *l_new = NULL, *l_old;
+ struct htab_elem *l_new, *l_old;
struct hlist_nulls_head *head;
+ void *old_map_ptr = NULL;
unsigned long flags;
struct bucket *b;
u32 key_size, hash;
@@ -1288,7 +1277,7 @@ static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
b = __select_bucket(htab, hash);
head = &b->head;
- ret = htab_lock_bucket(htab, b, hash, &flags);
+ ret = htab_lock_bucket(b, &flags);
if (ret)
return ret;
@@ -1299,21 +1288,29 @@ static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
goto err;
if (l_old) {
- /* per-cpu hash map can update value in-place */
- pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
- value, onallcpus);
+ /* Update value in-place */
+ if (percpu) {
+ pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
+ value, onallcpus);
+ } else {
+ void **inner_map_pptr = htab_elem_value(l_old, key_size);
+
+ old_map_ptr = *inner_map_pptr;
+ WRITE_ONCE(*inner_map_pptr, *(void **)value);
+ }
} else {
l_new = alloc_htab_elem(htab, key, value, key_size,
- hash, true, onallcpus, NULL);
+ hash, percpu, onallcpus, NULL);
if (IS_ERR(l_new)) {
ret = PTR_ERR(l_new);
goto err;
}
hlist_nulls_add_head_rcu(&l_new->hash_node, head);
}
- ret = 0;
err:
- htab_unlock_bucket(htab, b, hash, flags);
+ htab_unlock_bucket(b, flags);
+ if (old_map_ptr)
+ map->ops->map_fd_put_ptr(map, old_map_ptr, true);
return ret;
}
@@ -1354,7 +1351,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
return -ENOMEM;
}
- ret = htab_lock_bucket(htab, b, hash, &flags);
+ ret = htab_lock_bucket(b, &flags);
if (ret)
goto err_lock_bucket;
@@ -1378,7 +1375,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
}
ret = 0;
err:
- htab_unlock_bucket(htab, b, hash, flags);
+ htab_unlock_bucket(b, flags);
err_lock_bucket:
if (l_new) {
bpf_map_dec_elem_count(&htab->map);
@@ -1390,7 +1387,7 @@ err_lock_bucket:
static long htab_percpu_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags)
{
- return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
+ return htab_map_update_elem_in_place(map, key, value, map_flags, true, false);
}
static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
@@ -1420,20 +1417,20 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key)
b = __select_bucket(htab, hash);
head = &b->head;
- ret = htab_lock_bucket(htab, b, hash, &flags);
+ ret = htab_lock_bucket(b, &flags);
if (ret)
return ret;
l = lookup_elem_raw(head, hash, key, key_size);
-
- if (l) {
+ if (l)
hlist_nulls_del_rcu(&l->hash_node);
- free_htab_elem(htab, l);
- } else {
+ else
ret = -ENOENT;
- }
- htab_unlock_bucket(htab, b, hash, flags);
+ htab_unlock_bucket(b, flags);
+
+ if (l)
+ free_htab_elem(htab, l);
return ret;
}
@@ -1456,7 +1453,7 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
b = __select_bucket(htab, hash);
head = &b->head;
- ret = htab_lock_bucket(htab, b, hash, &flags);
+ ret = htab_lock_bucket(b, &flags);
if (ret)
return ret;
@@ -1467,7 +1464,7 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
else
ret = -ENOENT;
- htab_unlock_bucket(htab, b, hash, flags);
+ htab_unlock_bucket(b, flags);
if (l)
htab_lru_push_free(htab, l);
return ret;
@@ -1477,10 +1474,9 @@ static void delete_all_elements(struct bpf_htab *htab)
{
int i;
- /* It's called from a worker thread, so disable migration here,
- * since bpf_mem_cache_free() relies on that.
+ /* It's called from a worker thread and migration has been disabled,
+ * therefore, it is OK to invoke bpf_mem_cache_free() directly.
*/
- migrate_disable();
for (i = 0; i < htab->n_buckets; i++) {
struct hlist_nulls_head *head = select_bucket(htab, i);
struct hlist_nulls_node *n;
@@ -1490,11 +1486,11 @@ static void delete_all_elements(struct bpf_htab *htab)
hlist_nulls_del_rcu(&l->hash_node);
htab_elem_free(htab, l);
}
+ cond_resched();
}
- migrate_enable();
}
-static void htab_free_malloced_timers(struct bpf_htab *htab)
+static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab)
{
int i;
@@ -1506,31 +1502,35 @@ static void htab_free_malloced_timers(struct bpf_htab *htab)
hlist_nulls_for_each_entry(l, n, head, hash_node) {
/* We only free timer on uref dropping to zero */
- bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8));
+ if (btf_record_has_field(htab->map.record, BPF_TIMER))
+ bpf_obj_free_timer(htab->map.record,
+ htab_elem_value(l, htab->map.key_size));
+ if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
+ bpf_obj_free_workqueue(htab->map.record,
+ htab_elem_value(l, htab->map.key_size));
}
cond_resched_rcu();
}
rcu_read_unlock();
}
-static void htab_map_free_timers(struct bpf_map *map)
+static void htab_map_free_timers_and_wq(struct bpf_map *map)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- /* We only free timer on uref dropping to zero */
- if (!btf_record_has_field(htab->map.record, BPF_TIMER))
- return;
- if (!htab_is_prealloc(htab))
- htab_free_malloced_timers(htab);
- else
- htab_free_prealloced_timers(htab);
+ /* We only free timer and workqueue on uref dropping to zero */
+ if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE)) {
+ if (!htab_is_prealloc(htab))
+ htab_free_malloced_timers_and_wq(htab);
+ else
+ htab_free_prealloced_timers_and_wq(htab);
+ }
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- int i;
/* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
* bpf_free_used_maps() is called after bpf prog is no longer executing.
@@ -1538,7 +1538,7 @@ static void htab_map_free(struct bpf_map *map)
*/
/* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
- * underneath and is reponsible for waiting for callbacks to finish
+ * underneath and is responsible for waiting for callbacks to finish
* during bpf_mem_alloc_destroy().
*/
if (!htab_is_prealloc(htab)) {
@@ -1555,9 +1555,6 @@ static void htab_map_free(struct bpf_map *map)
bpf_mem_alloc_destroy(&htab->ma);
if (htab->use_percpu_counter)
percpu_counter_destroy(&htab->pcount);
- for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
- free_percpu(htab->map_locked[i]);
- lockdep_unregister_key(&htab->lockdep_key);
bpf_map_area_free(htab);
}
@@ -1577,7 +1574,7 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
seq_puts(m, ": ");
btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
rcu_read_unlock();
}
@@ -1600,48 +1597,48 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
b = __select_bucket(htab, hash);
head = &b->head;
- ret = htab_lock_bucket(htab, b, hash, &bflags);
+ ret = htab_lock_bucket(b, &bflags);
if (ret)
return ret;
l = lookup_elem_raw(head, hash, key, key_size);
if (!l) {
ret = -ENOENT;
- } else {
- if (is_percpu) {
- u32 roundup_value_size = round_up(map->value_size, 8);
- void __percpu *pptr;
- int off = 0, cpu;
+ goto out_unlock;
+ }
- pptr = htab_elem_get_ptr(l, key_size);
- for_each_possible_cpu(cpu) {
- copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
- check_and_init_map_value(&htab->map, value + off);
- off += roundup_value_size;
- }
- } else {
- u32 roundup_key_size = round_up(map->key_size, 8);
+ if (is_percpu) {
+ u32 roundup_value_size = round_up(map->value_size, 8);
+ void __percpu *pptr;
+ int off = 0, cpu;
- if (flags & BPF_F_LOCK)
- copy_map_value_locked(map, value, l->key +
- roundup_key_size,
- true);
- else
- copy_map_value(map, value, l->key +
- roundup_key_size);
- /* Zeroing special fields in the temp buffer */
- check_and_init_map_value(map, value);
+ pptr = htab_elem_get_ptr(l, key_size);
+ for_each_possible_cpu(cpu) {
+ copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(&htab->map, value + off);
+ off += roundup_value_size;
}
+ } else {
+ void *src = htab_elem_value(l, map->key_size);
- hlist_nulls_del_rcu(&l->hash_node);
- if (!is_lru_map)
- free_htab_elem(htab, l);
+ if (flags & BPF_F_LOCK)
+ copy_map_value_locked(map, value, src, true);
+ else
+ copy_map_value(map, value, src);
+ /* Zeroing special fields in the temp buffer */
+ check_and_init_map_value(map, value);
}
+ hlist_nulls_del_rcu(&l->hash_node);
- htab_unlock_bucket(htab, b, hash, bflags);
+out_unlock:
+ htab_unlock_bucket(b, bflags);
- if (is_lru_map && l)
- htab_lru_push_free(htab, l);
+ if (l) {
+ if (is_lru_map)
+ htab_lru_push_free(htab, l);
+ else
+ free_htab_elem(htab, l);
+ }
return ret;
}
@@ -1684,12 +1681,12 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
bool is_percpu)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
void __user *uvalues = u64_to_user_ptr(attr->batch.values);
void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
u32 batch, max_count, size, bucket_size, map_id;
+ u32 bucket_cnt, total, key_size, value_size;
struct htab_elem *node_to_free = NULL;
u64 elem_map_flags, map_flags;
struct hlist_nulls_head *head;
@@ -1724,7 +1721,6 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
return -ENOENT;
key_size = htab->map.key_size;
- roundup_key_size = round_up(htab->map.key_size, 8);
value_size = htab->map.value_size;
size = round_up(value_size, 8);
if (is_percpu)
@@ -1756,7 +1752,7 @@ again_nocopy:
head = &b->head;
/* do not grab the lock unless need it (bucket_cnt > 0). */
if (locked) {
- ret = htab_lock_bucket(htab, b, batch, &flags);
+ ret = htab_lock_bucket(b, &flags);
if (ret) {
rcu_read_unlock();
bpf_enable_instrumentation();
@@ -1779,7 +1775,7 @@ again_nocopy:
/* Note that since bucket_cnt > 0 here, it is implicit
* that the locked was grabbed, so release it.
*/
- htab_unlock_bucket(htab, b, batch, flags);
+ htab_unlock_bucket(b, flags);
rcu_read_unlock();
bpf_enable_instrumentation();
goto after_loop;
@@ -1790,7 +1786,7 @@ again_nocopy:
/* Note that since bucket_cnt > 0 here, it is implicit
* that the locked was grabbed, so release it.
*/
- htab_unlock_bucket(htab, b, batch, flags);
+ htab_unlock_bucket(b, flags);
rcu_read_unlock();
bpf_enable_instrumentation();
kvfree(keys);
@@ -1816,8 +1812,8 @@ again_nocopy:
off += size;
}
} else {
- value = l->key + roundup_key_size;
- if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+ value = htab_elem_value(l, key_size);
+ if (is_fd_htab(htab)) {
struct bpf_map **inner_map = value;
/* Actual value is the id of the inner map */
@@ -1840,25 +1836,29 @@ again_nocopy:
* may cause deadlock. See comments in function
* prealloc_lru_pop(). Let us do bpf_lru_push_free()
* after releasing the bucket lock.
+ *
+ * For htab of maps, htab_put_fd_value() in
+ * free_htab_elem() may acquire a spinlock with bucket
+ * lock being held and it violates the lock rule, so
+ * invoke free_htab_elem() after unlock as well.
*/
- if (is_lru_map) {
- l->batch_flink = node_to_free;
- node_to_free = l;
- } else {
- free_htab_elem(htab, l);
- }
+ l->batch_flink = node_to_free;
+ node_to_free = l;
}
dst_key += key_size;
dst_val += value_size;
}
- htab_unlock_bucket(htab, b, batch, flags);
+ htab_unlock_bucket(b, flags);
locked = false;
while (node_to_free) {
l = node_to_free;
node_to_free = node_to_free->batch_flink;
- htab_lru_push_free(htab, l);
+ if (is_lru_map)
+ htab_lru_push_free(htab, l);
+ else
+ free_htab_elem(htab, l);
}
next_batch:
@@ -2063,11 +2063,11 @@ static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
{
struct bpf_iter_seq_hash_map_info *info = seq->private;
- u32 roundup_key_size, roundup_value_size;
struct bpf_iter__bpf_map_elem ctx = {};
struct bpf_map *map = info->map;
struct bpf_iter_meta meta;
int ret = 0, off = 0, cpu;
+ u32 roundup_value_size;
struct bpf_prog *prog;
void __percpu *pptr;
@@ -2077,10 +2077,9 @@ static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
ctx.meta = &meta;
ctx.map = info->map;
if (elem) {
- roundup_key_size = round_up(map->key_size, 8);
ctx.key = elem->key;
if (!info->percpu_value_buf) {
- ctx.value = elem->key + roundup_key_size;
+ ctx.value = htab_elem_value(elem, map->key_size);
} else {
roundup_value_size = round_up(map->value_size, 8);
pptr = htab_elem_get_ptr(elem, map->key_size);
@@ -2165,7 +2164,6 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_
struct hlist_nulls_head *head;
struct hlist_nulls_node *n;
struct htab_elem *elem;
- u32 roundup_key_size;
int i, num_elems = 0;
void __percpu *pptr;
struct bucket *b;
@@ -2173,29 +2171,29 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_
bool is_percpu;
u64 ret = 0;
+ cant_migrate();
+
if (flags != 0)
return -EINVAL;
is_percpu = htab_is_percpu(htab);
- roundup_key_size = round_up(map->key_size, 8);
- /* disable migration so percpu value prepared here will be the
- * same as the one seen by the bpf program with bpf_map_lookup_elem().
+ /* migration has been disabled, so percpu value prepared here will be
+ * the same as the one seen by the bpf program with
+ * bpf_map_lookup_elem().
*/
- if (is_percpu)
- migrate_disable();
for (i = 0; i < htab->n_buckets; i++) {
b = &htab->buckets[i];
rcu_read_lock();
head = &b->head;
- hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
+ hlist_nulls_for_each_entry_safe(elem, n, head, hash_node) {
key = elem->key;
if (is_percpu) {
/* current cpu value for percpu map */
pptr = htab_elem_get_ptr(elem, map->key_size);
val = this_cpu_ptr(pptr);
} else {
- val = elem->key + roundup_key_size;
+ val = htab_elem_value(elem, map->key_size);
}
num_elems++;
ret = callback_fn((u64)(long)map, (u64)(long)key,
@@ -2209,8 +2207,6 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_
rcu_read_unlock();
}
out:
- if (is_percpu)
- migrate_enable();
return num_elems;
}
@@ -2259,7 +2255,7 @@ const struct bpf_map_ops htab_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
- .map_release_uref = htab_map_free_timers,
+ .map_release_uref = htab_map_free_timers_and_wq,
.map_lookup_elem = htab_map_lookup_elem,
.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
.map_update_elem = htab_map_update_elem,
@@ -2280,7 +2276,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
- .map_release_uref = htab_map_free_timers,
+ .map_release_uref = htab_map_free_timers_and_wq,
.map_lookup_elem = htab_lru_map_lookup_elem,
.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
@@ -2307,6 +2303,26 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
return NULL;
}
+/* inline bpf_map_lookup_elem() call for per-CPU hashmap */
+static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ if (!bpf_jit_supports_percpu_insn())
+ return -EOPNOTSUPP;
+
+ BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
+ (void *(*)(struct bpf_map *map, void *key))NULL));
+ *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3);
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0,
+ offsetof(struct htab_elem, key) + roundup(map->key_size, 8));
+ *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
+ *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
+
+ return insn - insn_buf;
+}
+
static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
{
struct htab_elem *l;
@@ -2392,8 +2408,8 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
ret = __htab_lru_percpu_map_update_elem(map, key, value,
map_flags, true);
else
- ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
- true);
+ ret = htab_map_update_elem_in_place(map, key, value, map_flags,
+ true, true);
rcu_read_unlock();
return ret;
@@ -2421,7 +2437,7 @@ static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
seq_printf(m, "\tcpu%d: ", cpu);
btf_type_seq_show(map->btf, map->btf_value_type_id,
per_cpu_ptr(pptr, cpu), m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
}
seq_puts(m, "}\n");
@@ -2435,6 +2451,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_lookup_elem = htab_percpu_map_lookup_elem,
+ .map_gen_lookup = htab_percpu_map_gen_lookup,
.map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
.map_update_elem = htab_percpu_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
@@ -2516,24 +2533,23 @@ int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
return ret;
}
-/* only called from syscall */
+/* Only called from syscall */
int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
void *key, void *value, u64 map_flags)
{
void *ptr;
int ret;
- u32 ufd = *(u32 *)value;
- ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
+ ptr = map->ops->map_fd_get_ptr(map, map_file, *(int *)value);
if (IS_ERR(ptr))
return PTR_ERR(ptr);
/* The htab bucket lock is always held during update operations in fd
* htab map, and the following rcu_read_lock() is only used to avoid
- * the WARN_ON_ONCE in htab_map_update_elem().
+ * the WARN_ON_ONCE in htab_map_update_elem_in_place().
*/
rcu_read_lock();
- ret = htab_map_update_elem(map, key, &ptr, map_flags);
+ ret = htab_map_update_elem_in_place(map, key, &ptr, map_flags, false, false);
rcu_read_unlock();
if (ret)
map->ops->map_fd_put_ptr(map, ptr, false);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 449b9a5d3fe3..b71e428ad936 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -23,6 +23,7 @@
#include <linux/btf_ids.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/kasan.h>
+#include <linux/bpf_verifier.h>
#include "../../lib/kstrtox.h"
@@ -111,7 +112,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT,
+ .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE,
};
BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value)
@@ -124,12 +125,13 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT,
+ .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE,
};
BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu)
{
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu);
}
@@ -158,6 +160,7 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
.func = bpf_get_smp_processor_id,
.gpl_only = false,
.ret_type = RET_INTEGER,
+ .allow_fastcall = true,
};
BPF_CALL_0(bpf_get_numa_node_id)
@@ -517,16 +520,15 @@ static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags,
}
BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags,
- long *, res)
+ s64 *, res)
{
long long _res;
int err;
+ *res = 0;
err = __bpf_strtoll(buf, buf_len, flags, &_res);
if (err < 0)
return err;
- if (_res != (long)_res)
- return -ERANGE;
*res = _res;
return err;
}
@@ -538,23 +540,23 @@ const struct bpf_func_proto bpf_strtol_proto = {
.arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_LONG,
+ .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
+ .arg4_size = sizeof(s64),
};
BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags,
- unsigned long *, res)
+ u64 *, res)
{
unsigned long long _res;
bool is_negative;
int err;
+ *res = 0;
err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative);
if (err < 0)
return err;
if (is_negative)
return -EINVAL;
- if (_res != (unsigned long)_res)
- return -ERANGE;
*res = _res;
return err;
}
@@ -566,7 +568,8 @@ const struct bpf_func_proto bpf_strtoul_proto = {
.arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_LONG,
+ .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
+ .arg4_size = sizeof(u64),
};
BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2)
@@ -714,7 +717,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
if (cpu >= nr_cpu_ids)
return (unsigned long)NULL;
- return (unsigned long)per_cpu_ptr((const void __percpu *)ptr, cpu);
+ return (unsigned long)per_cpu_ptr((const void __percpu *)(const uintptr_t)ptr, cpu);
}
const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
@@ -727,7 +730,7 @@ const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr)
{
- return (unsigned long)this_cpu_ptr((const void __percpu *)percpu_ptr);
+ return (unsigned long)this_cpu_ptr((const void __percpu *)(const uintptr_t)percpu_ptr);
}
const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
@@ -1079,11 +1082,23 @@ const struct bpf_func_proto bpf_snprintf_proto = {
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
+struct bpf_async_cb {
+ struct bpf_map *map;
+ struct bpf_prog *prog;
+ void __rcu *callback_fn;
+ void *value;
+ union {
+ struct rcu_head rcu;
+ struct work_struct delete_work;
+ };
+ u64 flags;
+};
+
/* BPF map elements can contain 'struct bpf_timer'.
* Such map owns all of its BPF timers.
* 'struct bpf_timer' is allocated as part of map element allocation
* and it's zero initialized.
- * That space is used to keep 'struct bpf_timer_kern'.
+ * That space is used to keep 'struct bpf_async_kern'.
* bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and
* remembers 'struct bpf_map *' pointer it's part of.
* bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn.
@@ -1096,17 +1111,24 @@ const struct bpf_func_proto bpf_snprintf_proto = {
* freeing the timers when inner map is replaced or deleted by user space.
*/
struct bpf_hrtimer {
+ struct bpf_async_cb cb;
struct hrtimer timer;
- struct bpf_map *map;
- struct bpf_prog *prog;
- void __rcu *callback_fn;
- void *value;
- struct rcu_head rcu;
+ atomic_t cancelling;
};
-/* the actual struct hidden inside uapi struct bpf_timer */
-struct bpf_timer_kern {
- struct bpf_hrtimer *timer;
+struct bpf_work {
+ struct bpf_async_cb cb;
+ struct work_struct work;
+ struct work_struct delete_work;
+};
+
+/* the actual struct hidden inside uapi struct bpf_timer and bpf_wq */
+struct bpf_async_kern {
+ union {
+ struct bpf_async_cb *cb;
+ struct bpf_hrtimer *timer;
+ struct bpf_work *work;
+ };
/* bpf_spin_lock is used here instead of spinlock_t to make
* sure that it always fits into space reserved by struct bpf_timer
* regardless of LOCKDEP and spinlock debug flags.
@@ -1114,19 +1136,24 @@ struct bpf_timer_kern {
struct bpf_spin_lock lock;
} __attribute__((aligned(8)));
+enum bpf_async_type {
+ BPF_ASYNC_TYPE_TIMER = 0,
+ BPF_ASYNC_TYPE_WQ,
+};
+
static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
{
struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
- struct bpf_map *map = t->map;
- void *value = t->value;
+ struct bpf_map *map = t->cb.map;
+ void *value = t->cb.value;
bpf_callback_t callback_fn;
void *key;
u32 idx;
BTF_TYPE_EMIT(struct bpf_timer);
- callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held());
+ callback_fn = rcu_dereference_check(t->cb.callback_fn, rcu_read_lock_bh_held());
if (!callback_fn)
goto out;
@@ -1155,46 +1182,128 @@ out:
return HRTIMER_NORESTART;
}
-BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map,
- u64, flags)
+static void bpf_wq_work(struct work_struct *work)
{
- clockid_t clockid = flags & (MAX_CLOCKS - 1);
+ struct bpf_work *w = container_of(work, struct bpf_work, work);
+ struct bpf_async_cb *cb = &w->cb;
+ struct bpf_map *map = cb->map;
+ bpf_callback_t callback_fn;
+ void *value = cb->value;
+ void *key;
+ u32 idx;
+
+ BTF_TYPE_EMIT(struct bpf_wq);
+
+ callback_fn = READ_ONCE(cb->callback_fn);
+ if (!callback_fn)
+ return;
+
+ if (map->map_type == BPF_MAP_TYPE_ARRAY) {
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ /* compute the key */
+ idx = ((char *)value - array->value) / array->elem_size;
+ key = &idx;
+ } else { /* hash or lru */
+ key = value - round_up(map->key_size, 8);
+ }
+
+ rcu_read_lock_trace();
+ migrate_disable();
+
+ callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
+
+ migrate_enable();
+ rcu_read_unlock_trace();
+}
+
+static void bpf_wq_delete_work(struct work_struct *work)
+{
+ struct bpf_work *w = container_of(work, struct bpf_work, delete_work);
+
+ cancel_work_sync(&w->work);
+
+ kfree_rcu(w, cb.rcu);
+}
+
+static void bpf_timer_delete_work(struct work_struct *work)
+{
+ struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work);
+
+ /* Cancel the timer and wait for callback to complete if it was running.
+ * If hrtimer_cancel() can be safely called it's safe to call
+ * kfree_rcu(t) right after for both preallocated and non-preallocated
+ * maps. The async->cb = NULL was already done and no code path can see
+ * address 't' anymore. Timer if armed for existing bpf_hrtimer before
+ * bpf_timer_cancel_and_free will have been cancelled.
+ */
+ hrtimer_cancel(&t->timer);
+ kfree_rcu(t, cb.rcu);
+}
+
+static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
+ enum bpf_async_type type)
+{
+ struct bpf_async_cb *cb;
struct bpf_hrtimer *t;
+ struct bpf_work *w;
+ clockid_t clockid;
+ size_t size;
int ret = 0;
- BUILD_BUG_ON(MAX_CLOCKS != 16);
- BUILD_BUG_ON(sizeof(struct bpf_timer_kern) > sizeof(struct bpf_timer));
- BUILD_BUG_ON(__alignof__(struct bpf_timer_kern) != __alignof__(struct bpf_timer));
-
if (in_nmi())
return -EOPNOTSUPP;
- if (flags >= MAX_CLOCKS ||
- /* similar to timerfd except _ALARM variants are not supported */
- (clockid != CLOCK_MONOTONIC &&
- clockid != CLOCK_REALTIME &&
- clockid != CLOCK_BOOTTIME))
+ switch (type) {
+ case BPF_ASYNC_TYPE_TIMER:
+ size = sizeof(struct bpf_hrtimer);
+ break;
+ case BPF_ASYNC_TYPE_WQ:
+ size = sizeof(struct bpf_work);
+ break;
+ default:
return -EINVAL;
- __bpf_spin_lock_irqsave(&timer->lock);
- t = timer->timer;
+ }
+
+ __bpf_spin_lock_irqsave(&async->lock);
+ t = async->timer;
if (t) {
ret = -EBUSY;
goto out;
}
+
/* allocate hrtimer via map_kmalloc to use memcg accounting */
- t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node);
- if (!t) {
+ cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
+ if (!cb) {
ret = -ENOMEM;
goto out;
}
- t->value = (void *)timer - map->record->timer_off;
- t->map = map;
- t->prog = NULL;
- rcu_assign_pointer(t->callback_fn, NULL);
- hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
- t->timer.function = bpf_timer_cb;
- WRITE_ONCE(timer->timer, t);
- /* Guarantee the order between timer->timer and map->usercnt. So
+
+ switch (type) {
+ case BPF_ASYNC_TYPE_TIMER:
+ clockid = flags & (MAX_CLOCKS - 1);
+ t = (struct bpf_hrtimer *)cb;
+
+ atomic_set(&t->cancelling, 0);
+ INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work);
+ hrtimer_setup(&t->timer, bpf_timer_cb, clockid, HRTIMER_MODE_REL_SOFT);
+ cb->value = (void *)async - map->record->timer_off;
+ break;
+ case BPF_ASYNC_TYPE_WQ:
+ w = (struct bpf_work *)cb;
+
+ INIT_WORK(&w->work, bpf_wq_work);
+ INIT_WORK(&w->delete_work, bpf_wq_delete_work);
+ cb->value = (void *)async - map->record->wq_off;
+ break;
+ }
+ cb->map = map;
+ cb->prog = NULL;
+ cb->flags = flags;
+ rcu_assign_pointer(cb->callback_fn, NULL);
+
+ WRITE_ONCE(async->cb, cb);
+ /* Guarantee the order between async->cb and map->usercnt. So
* when there are concurrent uref release and bpf timer init, either
* bpf_timer_cancel_and_free() called by uref release reads a no-NULL
* timer or atomic64_read() below returns a zero usercnt.
@@ -1204,15 +1313,34 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map
/* maps with timers must be either held by user space
* or pinned in bpffs.
*/
- WRITE_ONCE(timer->timer, NULL);
- kfree(t);
+ WRITE_ONCE(async->cb, NULL);
+ kfree(cb);
ret = -EPERM;
}
out:
- __bpf_spin_unlock_irqrestore(&timer->lock);
+ __bpf_spin_unlock_irqrestore(&async->lock);
return ret;
}
+BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map,
+ u64, flags)
+{
+ clock_t clockid = flags & (MAX_CLOCKS - 1);
+
+ BUILD_BUG_ON(MAX_CLOCKS != 16);
+ BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer));
+ BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer));
+
+ if (flags >= MAX_CLOCKS ||
+ /* similar to timerfd except _ALARM variants are not supported */
+ (clockid != CLOCK_MONOTONIC &&
+ clockid != CLOCK_REALTIME &&
+ clockid != CLOCK_BOOTTIME))
+ return -EINVAL;
+
+ return __bpf_async_init(timer, map, flags, BPF_ASYNC_TYPE_TIMER);
+}
+
static const struct bpf_func_proto bpf_timer_init_proto = {
.func = bpf_timer_init,
.gpl_only = true,
@@ -1222,22 +1350,23 @@ static const struct bpf_func_proto bpf_timer_init_proto = {
.arg3_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callback_fn,
- struct bpf_prog_aux *, aux)
+static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn,
+ struct bpf_prog_aux *aux, unsigned int flags,
+ enum bpf_async_type type)
{
struct bpf_prog *prev, *prog = aux->prog;
- struct bpf_hrtimer *t;
+ struct bpf_async_cb *cb;
int ret = 0;
if (in_nmi())
return -EOPNOTSUPP;
- __bpf_spin_lock_irqsave(&timer->lock);
- t = timer->timer;
- if (!t) {
+ __bpf_spin_lock_irqsave(&async->lock);
+ cb = async->cb;
+ if (!cb) {
ret = -EINVAL;
goto out;
}
- if (!atomic64_read(&t->map->usercnt)) {
+ if (!atomic64_read(&cb->map->usercnt)) {
/* maps with timers must be either held by user space
* or pinned in bpffs. Otherwise timer might still be
* running even when bpf prog is detached and user space
@@ -1246,7 +1375,7 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb
ret = -EPERM;
goto out;
}
- prev = t->prog;
+ prev = cb->prog;
if (prev != prog) {
/* Bump prog refcnt once. Every bpf_timer_set_callback()
* can pick different callback_fn-s within the same prog.
@@ -1259,14 +1388,20 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb
if (prev)
/* Drop prev prog refcnt when swapping with new prog */
bpf_prog_put(prev);
- t->prog = prog;
+ cb->prog = prog;
}
- rcu_assign_pointer(t->callback_fn, callback_fn);
+ rcu_assign_pointer(cb->callback_fn, callback_fn);
out:
- __bpf_spin_unlock_irqrestore(&timer->lock);
+ __bpf_spin_unlock_irqrestore(&async->lock);
return ret;
}
+BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn,
+ struct bpf_prog_aux *, aux)
+{
+ return __bpf_async_set_callback(timer, callback_fn, aux, 0, BPF_ASYNC_TYPE_TIMER);
+}
+
static const struct bpf_func_proto bpf_timer_set_callback_proto = {
.func = bpf_timer_set_callback,
.gpl_only = true,
@@ -1275,7 +1410,7 @@ static const struct bpf_func_proto bpf_timer_set_callback_proto = {
.arg2_type = ARG_PTR_TO_FUNC,
};
-BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, flags)
+BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, flags)
{
struct bpf_hrtimer *t;
int ret = 0;
@@ -1287,7 +1422,7 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla
return -EINVAL;
__bpf_spin_lock_irqsave(&timer->lock);
t = timer->timer;
- if (!t || !t->prog) {
+ if (!t || !t->cb.prog) {
ret = -EINVAL;
goto out;
}
@@ -1315,20 +1450,21 @@ static const struct bpf_func_proto bpf_timer_start_proto = {
.arg3_type = ARG_ANYTHING,
};
-static void drop_prog_refcnt(struct bpf_hrtimer *t)
+static void drop_prog_refcnt(struct bpf_async_cb *async)
{
- struct bpf_prog *prog = t->prog;
+ struct bpf_prog *prog = async->prog;
if (prog) {
bpf_prog_put(prog);
- t->prog = NULL;
- rcu_assign_pointer(t->callback_fn, NULL);
+ async->prog = NULL;
+ rcu_assign_pointer(async->callback_fn, NULL);
}
}
-BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
+BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
{
- struct bpf_hrtimer *t;
+ struct bpf_hrtimer *t, *cur_t;
+ bool inc = false;
int ret = 0;
if (in_nmi())
@@ -1340,21 +1476,50 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
ret = -EINVAL;
goto out;
}
- if (this_cpu_read(hrtimer_running) == t) {
+
+ cur_t = this_cpu_read(hrtimer_running);
+ if (cur_t == t) {
/* If bpf callback_fn is trying to bpf_timer_cancel()
* its own timer the hrtimer_cancel() will deadlock
- * since it waits for callback_fn to finish
+ * since it waits for callback_fn to finish.
+ */
+ ret = -EDEADLK;
+ goto out;
+ }
+
+ /* Only account in-flight cancellations when invoked from a timer
+ * callback, since we want to avoid waiting only if other _callbacks_
+ * are waiting on us, to avoid introducing lockups. Non-callback paths
+ * are ok, since nobody would synchronously wait for their completion.
+ */
+ if (!cur_t)
+ goto drop;
+ atomic_inc(&t->cancelling);
+ /* Need full barrier after relaxed atomic_inc */
+ smp_mb__after_atomic();
+ inc = true;
+ if (atomic_read(&cur_t->cancelling)) {
+ /* We're cancelling timer t, while some other timer callback is
+ * attempting to cancel us. In such a case, it might be possible
+ * that timer t belongs to the other callback, or some other
+ * callback waiting upon it (creating transitive dependencies
+ * upon us), and we will enter a deadlock if we continue
+ * cancelling and waiting for it synchronously, since it might
+ * do the same. Bail!
*/
ret = -EDEADLK;
goto out;
}
- drop_prog_refcnt(t);
+drop:
+ drop_prog_refcnt(&t->cb);
out:
__bpf_spin_unlock_irqrestore(&timer->lock);
/* Cancel the timer and wait for associated callback to finish
* if it was running.
*/
ret = ret ?: hrtimer_cancel(&t->timer);
+ if (inc)
+ atomic_dec(&t->cancelling);
rcu_read_unlock();
return ret;
}
@@ -1366,56 +1531,112 @@ static const struct bpf_func_proto bpf_timer_cancel_proto = {
.arg1_type = ARG_PTR_TO_TIMER,
};
-/* This function is called by map_delete/update_elem for individual element and
- * by ops->map_release_uref when the user space reference to a map reaches zero.
- */
-void bpf_timer_cancel_and_free(void *val)
+static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *async)
{
- struct bpf_timer_kern *timer = val;
- struct bpf_hrtimer *t;
+ struct bpf_async_cb *cb;
- /* Performance optimization: read timer->timer without lock first. */
- if (!READ_ONCE(timer->timer))
- return;
+ /* Performance optimization: read async->cb without lock first. */
+ if (!READ_ONCE(async->cb))
+ return NULL;
- __bpf_spin_lock_irqsave(&timer->lock);
+ __bpf_spin_lock_irqsave(&async->lock);
/* re-read it under lock */
- t = timer->timer;
- if (!t)
+ cb = async->cb;
+ if (!cb)
goto out;
- drop_prog_refcnt(t);
+ drop_prog_refcnt(cb);
/* The subsequent bpf_timer_start/cancel() helpers won't be able to use
* this timer, since it won't be initialized.
*/
- WRITE_ONCE(timer->timer, NULL);
+ WRITE_ONCE(async->cb, NULL);
out:
- __bpf_spin_unlock_irqrestore(&timer->lock);
+ __bpf_spin_unlock_irqrestore(&async->lock);
+ return cb;
+}
+
+/* This function is called by map_delete/update_elem for individual element and
+ * by ops->map_release_uref when the user space reference to a map reaches zero.
+ */
+void bpf_timer_cancel_and_free(void *val)
+{
+ struct bpf_hrtimer *t;
+
+ t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val);
+
if (!t)
return;
- /* Cancel the timer and wait for callback to complete if it was running.
- * If hrtimer_cancel() can be safely called it's safe to call kfree(t)
- * right after for both preallocated and non-preallocated maps.
- * The timer->timer = NULL was already done and no code path can
- * see address 't' anymore.
- *
- * Check that bpf_map_delete/update_elem() wasn't called from timer
- * callback_fn. In such case don't call hrtimer_cancel() (since it will
- * deadlock) and don't call hrtimer_try_to_cancel() (since it will just
- * return -1). Though callback_fn is still running on this cpu it's
+ /* We check that bpf_map_delete/update_elem() was called from timer
+ * callback_fn. In such case we don't call hrtimer_cancel() (since it
+ * will deadlock) and don't call hrtimer_try_to_cancel() (since it will
+ * just return -1). Though callback_fn is still running on this cpu it's
* safe to do kfree(t) because bpf_timer_cb() read everything it needed
* from 't'. The bpf subprog callback_fn won't be able to access 't',
- * since timer->timer = NULL was already done. The timer will be
+ * since async->cb = NULL was already done. The timer will be
* effectively cancelled because bpf_timer_cb() will return
* HRTIMER_NORESTART.
+ *
+ * However, it is possible the timer callback_fn calling us armed the
+ * timer _before_ calling us, such that failing to cancel it here will
+ * cause it to possibly use struct hrtimer after freeing bpf_hrtimer.
+ * Therefore, we _need_ to cancel any outstanding timers before we do
+ * kfree_rcu, even though no more timers can be armed.
+ *
+ * Moreover, we need to schedule work even if timer does not belong to
+ * the calling callback_fn, as on two different CPUs, we can end up in a
+ * situation where both sides run in parallel, try to cancel one
+ * another, and we end up waiting on both sides in hrtimer_cancel
+ * without making forward progress, since timer1 depends on time2
+ * callback to finish, and vice versa.
+ *
+ * CPU 1 (timer1_cb) CPU 2 (timer2_cb)
+ * bpf_timer_cancel_and_free(timer2) bpf_timer_cancel_and_free(timer1)
+ *
+ * To avoid these issues, punt to workqueue context when we are in a
+ * timer callback.
*/
- if (this_cpu_read(hrtimer_running) != t)
- hrtimer_cancel(&t->timer);
- kfree_rcu(t, rcu);
+ if (this_cpu_read(hrtimer_running)) {
+ queue_work(system_unbound_wq, &t->cb.delete_work);
+ return;
+ }
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ /* If the timer is running on other CPU, also use a kworker to
+ * wait for the completion of the timer instead of trying to
+ * acquire a sleepable lock in hrtimer_cancel() to wait for its
+ * completion.
+ */
+ if (hrtimer_try_to_cancel(&t->timer) >= 0)
+ kfree_rcu(t, cb.rcu);
+ else
+ queue_work(system_unbound_wq, &t->cb.delete_work);
+ } else {
+ bpf_timer_delete_work(&t->cb.delete_work);
+ }
}
-BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
+/* This function is called by map_delete/update_elem for individual element and
+ * by ops->map_release_uref when the user space reference to a map reaches zero.
+ */
+void bpf_wq_cancel_and_free(void *val)
{
- unsigned long *kptr = map_value;
+ struct bpf_work *work;
+
+ BTF_TYPE_EMIT(struct bpf_wq);
+
+ work = (struct bpf_work *)__bpf_async_cancel_and_free(val);
+ if (!work)
+ return;
+ /* Trigger cancel of the sleepable work, but *do not* wait for
+ * it to finish if it was running as we might not be in a
+ * sleepable context.
+ * kfree will be called once the work has finished.
+ */
+ schedule_work(&work->delete_work);
+}
+
+BPF_CALL_2(bpf_kptr_xchg, void *, dst, void *, ptr)
+{
+ unsigned long *kptr = dst;
/* This helper may be inlined by verifier. */
return xchg(kptr, (unsigned long)ptr);
@@ -1430,7 +1651,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = {
.gpl_only = false,
.ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
.ret_btf_id = BPF_PTR_POISON,
- .arg1_type = ARG_PTR_TO_KPTR,
+ .arg1_type = ARG_KPTR_XCHG_DEST,
.arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE,
.arg2_btf_id = BPF_PTR_POISON,
};
@@ -1443,7 +1664,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = {
#define DYNPTR_SIZE_MASK 0xFFFFFF
#define DYNPTR_RDONLY_BIT BIT(31)
-static bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
+bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
{
return ptr->size & DYNPTR_RDONLY_BIT;
}
@@ -1494,16 +1715,6 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
memset(ptr, 0, sizeof(*ptr));
}
-static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len)
-{
- u32 size = __bpf_dynptr_size(ptr);
-
- if (len > size || offset > size - len)
- return -E2BIG;
-
- return 0;
-}
-
BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr)
{
int err;
@@ -1536,11 +1747,11 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT,
+ .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE,
};
-BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
- u32, offset, u64, flags)
+static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *src,
+ u32 offset, u64 flags)
{
enum bpf_dynptr_type type;
int err;
@@ -1573,6 +1784,12 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern
}
}
+BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
+ u32, offset, u64, flags)
+{
+ return __bpf_dynptr_read(dst, len, src, offset, flags);
+}
+
static const struct bpf_func_proto bpf_dynptr_read_proto = {
.func = bpf_dynptr_read,
.gpl_only = false,
@@ -1584,8 +1801,8 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = {
.arg5_type = ARG_ANYTHING,
};
-BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
- u32, len, u64, flags)
+int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src,
+ u32 len, u64 flags)
{
enum bpf_dynptr_type type;
int err;
@@ -1623,6 +1840,12 @@ BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, v
}
}
+BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
+ u32, len, u64, flags)
+{
+ return __bpf_dynptr_write(dst, offset, src, len, flags);
+}
+
static const struct bpf_func_proto bpf_dynptr_write_proto = {
.func = bpf_dynptr_write,
.gpl_only = false,
@@ -1681,6 +1904,12 @@ const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
const struct bpf_func_proto bpf_probe_read_kernel_proto __weak;
const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
const struct bpf_func_proto bpf_task_pt_regs_proto __weak;
+const struct bpf_func_proto bpf_perf_event_read_proto __weak;
+const struct bpf_func_proto bpf_send_signal_proto __weak;
+const struct bpf_func_proto bpf_send_signal_thread_proto __weak;
+const struct bpf_func_proto bpf_get_task_stack_sleepable_proto __weak;
+const struct bpf_func_proto bpf_get_task_stack_proto __weak;
+const struct bpf_func_proto bpf_get_branch_snapshot_proto __weak;
const struct bpf_func_proto *
bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
@@ -1730,6 +1959,12 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_strtol_proto;
case BPF_FUNC_strtoul:
return &bpf_strtoul_proto;
+ case BPF_FUNC_get_current_pid_tgid:
+ return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_get_ns_current_pid_tgid:
+ return &bpf_get_ns_current_pid_tgid_proto;
+ case BPF_FUNC_get_current_uid_gid:
+ return &bpf_get_current_uid_gid_proto;
default:
break;
}
@@ -1787,7 +2022,21 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_current_cgroup_id_proto;
case BPF_FUNC_get_current_ancestor_cgroup_id:
return &bpf_get_current_ancestor_cgroup_id_proto;
+ case BPF_FUNC_current_task_under_cgroup:
+ return &bpf_current_task_under_cgroup_proto;
+#endif
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_curr_proto;
#endif
+ case BPF_FUNC_task_storage_get:
+ if (bpf_prog_check_recur(prog))
+ return &bpf_task_storage_get_recur_proto;
+ return &bpf_task_storage_get_proto;
+ case BPF_FUNC_task_storage_delete:
+ if (bpf_prog_check_recur(prog))
+ return &bpf_task_storage_delete_recur_proto;
+ return &bpf_task_storage_delete_proto;
default:
break;
}
@@ -1802,6 +2051,8 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_current_task_proto;
case BPF_FUNC_get_current_task_btf:
return &bpf_get_current_task_btf_proto;
+ case BPF_FUNC_get_current_comm:
+ return &bpf_get_current_comm_proto;
case BPF_FUNC_probe_read_user:
return &bpf_probe_read_user_proto;
case BPF_FUNC_probe_read_kernel:
@@ -1812,6 +2063,10 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_probe_read_kernel_str:
return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_kernel_str_proto;
+ case BPF_FUNC_copy_from_user:
+ return &bpf_copy_from_user_proto;
+ case BPF_FUNC_copy_from_user_task:
+ return &bpf_copy_from_user_task_proto;
case BPF_FUNC_snprintf_btf:
return &bpf_snprintf_btf_proto;
case BPF_FUNC_snprintf:
@@ -1820,10 +2075,26 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_task_pt_regs_proto;
case BPF_FUNC_trace_vprintk:
return bpf_get_trace_vprintk_proto();
+ case BPF_FUNC_perf_event_read_value:
+ return bpf_get_perf_event_read_value_proto();
+ case BPF_FUNC_perf_event_read:
+ return &bpf_perf_event_read_proto;
+ case BPF_FUNC_send_signal:
+ return &bpf_send_signal_proto;
+ case BPF_FUNC_send_signal_thread:
+ return &bpf_send_signal_thread_proto;
+ case BPF_FUNC_get_task_stack:
+ return prog->sleepable ? &bpf_get_task_stack_sleepable_proto
+ : &bpf_get_task_stack_proto;
+ case BPF_FUNC_get_branch_snapshot:
+ return &bpf_get_branch_snapshot_proto;
+ case BPF_FUNC_find_vma:
+ return &bpf_find_vma_proto;
default:
return NULL;
}
}
+EXPORT_SYMBOL_GPL(bpf_base_func_proto);
void bpf_list_head_free(const struct btf_field *field, void *list_head,
struct bpf_spin_lock *spin_lock)
@@ -1855,9 +2126,7 @@ unlock:
/* The contained type can also have resources, including a
* bpf_list_head which needs to be freed.
*/
- migrate_disable();
__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
- migrate_enable();
}
}
@@ -1894,9 +2163,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
obj -= field->graph_root.node_offset;
- migrate_disable();
__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
- migrate_enable();
}
}
@@ -2059,6 +2326,26 @@ __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
return __bpf_list_del(head, true);
}
+__bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head)
+{
+ struct list_head *h = (struct list_head *)head;
+
+ if (list_empty(h) || unlikely(!h->next))
+ return NULL;
+
+ return (struct bpf_list_node *)h->next;
+}
+
+__bpf_kfunc struct bpf_list_node *bpf_list_back(struct bpf_list_head *head)
+{
+ struct list_head *h = (struct list_head *)head;
+
+ if (list_empty(h) || unlikely(!h->next))
+ return NULL;
+
+ return (struct bpf_list_node *)h->prev;
+}
+
__bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
struct bpf_rb_node *node)
{
@@ -2132,6 +2419,33 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
return (struct bpf_rb_node *)rb_first_cached(r);
}
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_root(struct bpf_rb_root *root)
+{
+ struct rb_root_cached *r = (struct rb_root_cached *)root;
+
+ return (struct bpf_rb_node *)r->rb_root.rb_node;
+}
+
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_left(struct bpf_rb_root *root, struct bpf_rb_node *node)
+{
+ struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
+
+ if (READ_ONCE(node_internal->owner) != root)
+ return NULL;
+
+ return (struct bpf_rb_node *)node_internal->rb_node.rb_left;
+}
+
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_right(struct bpf_rb_root *root, struct bpf_rb_node *node)
+{
+ struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
+
+ if (READ_ONCE(node_internal->owner) != root)
+ return NULL;
+
+ return (struct bpf_rb_node *)node_internal->rb_node.rb_right;
+}
+
/**
* bpf_task_acquire - Acquire a reference to a task. A task acquired by this
* kfunc which is not stored in a map as a kptr, must be released by calling
@@ -2248,6 +2562,29 @@ __bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task,
return ret;
}
+BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct cgroup *cgrp;
+
+ if (unlikely(idx >= array->map.max_entries))
+ return -E2BIG;
+
+ cgrp = READ_ONCE(array->ptrs[idx]);
+ if (unlikely(!cgrp))
+ return -EAGAIN;
+
+ return task_under_cgroup_hierarchy(current, cgrp);
+}
+
+const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
+ .func = bpf_current_task_under_cgroup,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+};
+
/**
* bpf_task_get_cgroup1 - Acquires the associated cgroup of a task within a
* specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its
@@ -2288,8 +2625,27 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
}
/**
+ * bpf_task_from_vpid - Find a struct task_struct from its vpid by looking it up
+ * in the pid namespace of the current task. If a task is returned, it must
+ * either be stored in a map, or released with bpf_task_release().
+ * @vpid: The vpid of the task being looked up.
+ */
+__bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid)
+{
+ struct task_struct *p;
+
+ rcu_read_lock();
+ p = find_task_by_vpid(vpid);
+ if (p)
+ p = bpf_task_acquire(p);
+ rcu_read_unlock();
+
+ return p;
+}
+
+/**
* bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data.
- * @ptr: The dynptr whose data slice to retrieve
+ * @p: The dynptr whose data slice to retrieve
* @offset: Offset into the dynptr
* @buffer__opt: User-provided buffer to copy contents into. May be NULL
* @buffer__szk: Size (in bytes) of the buffer if present. This is the
@@ -2315,9 +2671,10 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
* provided buffer, with its contents containing the data, if unable to obtain
* direct pointer)
*/
-__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset,
+__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset,
void *buffer__opt, u32 buffer__szk)
{
+ const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
enum bpf_dynptr_type type;
u32 len = buffer__szk;
int err;
@@ -2359,7 +2716,7 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
/**
* bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data.
- * @ptr: The dynptr whose data slice to retrieve
+ * @p: The dynptr whose data slice to retrieve
* @offset: Offset into the dynptr
* @buffer__opt: User-provided buffer to copy contents into. May be NULL
* @buffer__szk: Size (in bytes) of the buffer if present. This is the
@@ -2399,16 +2756,18 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
* provided buffer, with its contents containing the data, if unable to obtain
* direct pointer)
*/
-__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset,
+__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset,
void *buffer__opt, u32 buffer__szk)
{
+ const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
if (!ptr->data || __bpf_dynptr_is_rdonly(ptr))
return NULL;
/* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice.
*
* For skb-type dynptrs, it is safe to write into the returned pointer
- * if the bpf program allows skb data writes. There are two possiblities
+ * if the bpf program allows skb data writes. There are two possibilities
* that may occur when calling bpf_dynptr_slice_rdwr:
*
* 1) The requested slice is in the head of the skb. In this case, the
@@ -2427,11 +2786,12 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 o
* will be copied out into the buffer and the user will need to call
* bpf_dynptr_write() to commit changes.
*/
- return bpf_dynptr_slice(ptr, offset, buffer__opt, buffer__szk);
+ return bpf_dynptr_slice(p, offset, buffer__opt, buffer__szk);
}
-__bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 end)
+__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u32 start, u32 end)
{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
u32 size;
if (!ptr->data || start > end)
@@ -2448,40 +2808,104 @@ __bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 en
return 0;
}
-__bpf_kfunc bool bpf_dynptr_is_null(struct bpf_dynptr_kern *ptr)
+__bpf_kfunc bool bpf_dynptr_is_null(const struct bpf_dynptr *p)
{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
return !ptr->data;
}
-__bpf_kfunc bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr)
+__bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p)
{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
if (!ptr->data)
return false;
return __bpf_dynptr_is_rdonly(ptr);
}
-__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
+__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr *p)
{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
if (!ptr->data)
return -EINVAL;
return __bpf_dynptr_size(ptr);
}
-__bpf_kfunc int bpf_dynptr_clone(struct bpf_dynptr_kern *ptr,
- struct bpf_dynptr_kern *clone__uninit)
+__bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p,
+ struct bpf_dynptr *clone__uninit)
{
+ struct bpf_dynptr_kern *clone = (struct bpf_dynptr_kern *)clone__uninit;
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
if (!ptr->data) {
- bpf_dynptr_set_null(clone__uninit);
+ bpf_dynptr_set_null(clone);
return -EINVAL;
}
- *clone__uninit = *ptr;
+ *clone = *ptr;
return 0;
}
+/**
+ * bpf_dynptr_copy() - Copy data from one dynptr to another.
+ * @dst_ptr: Destination dynptr - where data should be copied to
+ * @dst_off: Offset into the destination dynptr
+ * @src_ptr: Source dynptr - where data should be copied from
+ * @src_off: Offset into the source dynptr
+ * @size: Length of the data to copy from source to destination
+ *
+ * Copies data from source dynptr to destination dynptr.
+ * Returns 0 on success; negative error, otherwise.
+ */
+__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off,
+ struct bpf_dynptr *src_ptr, u32 src_off, u32 size)
+{
+ struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr;
+ struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr;
+ void *src_slice, *dst_slice;
+ char buf[256];
+ u32 off;
+
+ src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size);
+ dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size);
+
+ if (src_slice && dst_slice) {
+ memmove(dst_slice, src_slice, size);
+ return 0;
+ }
+
+ if (src_slice)
+ return __bpf_dynptr_write(dst, dst_off, src_slice, size, 0);
+
+ if (dst_slice)
+ return __bpf_dynptr_read(dst_slice, size, src, src_off, 0);
+
+ if (bpf_dynptr_check_off_len(dst, dst_off, size) ||
+ bpf_dynptr_check_off_len(src, src_off, size))
+ return -E2BIG;
+
+ off = 0;
+ while (off < size) {
+ u32 chunk_sz = min_t(u32, sizeof(buf), size - off);
+ int err;
+
+ err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0);
+ if (err)
+ return err;
+ err = __bpf_dynptr_write(dst, dst_off + off, buf, chunk_sz, 0);
+ if (err)
+ return err;
+
+ off += chunk_sz;
+ }
+ return 0;
+}
+
__bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
{
return obj;
@@ -2545,6 +2969,315 @@ __bpf_kfunc void bpf_throw(u64 cookie)
WARN(1, "A call to BPF exception callback should never return\n");
}
+__bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags)
+{
+ struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
+ struct bpf_map *map = p__map;
+
+ BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_wq));
+ BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_wq));
+
+ if (flags)
+ return -EINVAL;
+
+ return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ);
+}
+
+__bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags)
+{
+ struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
+ struct bpf_work *w;
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+ if (flags)
+ return -EINVAL;
+ w = READ_ONCE(async->work);
+ if (!w || !READ_ONCE(w->cb.prog))
+ return -EINVAL;
+
+ schedule_work(&w->work);
+ return 0;
+}
+
+__bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq,
+ int (callback_fn)(void *map, int *key, void *value),
+ unsigned int flags,
+ void *aux__prog)
+{
+ struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__prog;
+ struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
+
+ if (flags)
+ return -EINVAL;
+
+ return __bpf_async_set_callback(async, callback_fn, aux, flags, BPF_ASYNC_TYPE_WQ);
+}
+
+__bpf_kfunc void bpf_preempt_disable(void)
+{
+ preempt_disable();
+}
+
+__bpf_kfunc void bpf_preempt_enable(void)
+{
+ preempt_enable();
+}
+
+struct bpf_iter_bits {
+ __u64 __opaque[2];
+} __aligned(8);
+
+#define BITS_ITER_NR_WORDS_MAX 511
+
+struct bpf_iter_bits_kern {
+ union {
+ __u64 *bits;
+ __u64 bits_copy;
+ };
+ int nr_bits;
+ int bit;
+} __aligned(8);
+
+/* On 64-bit hosts, unsigned long and u64 have the same size, so passing
+ * a u64 pointer and an unsigned long pointer to find_next_bit() will
+ * return the same result, as both point to the same 8-byte area.
+ *
+ * For 32-bit little-endian hosts, using a u64 pointer or unsigned long
+ * pointer also makes no difference. This is because the first iterated
+ * unsigned long is composed of bits 0-31 of the u64 and the second unsigned
+ * long is composed of bits 32-63 of the u64.
+ *
+ * However, for 32-bit big-endian hosts, this is not the case. The first
+ * iterated unsigned long will be bits 32-63 of the u64, so swap these two
+ * ulong values within the u64.
+ */
+static void swap_ulong_in_u64(u64 *bits, unsigned int nr)
+{
+#if (BITS_PER_LONG == 32) && defined(__BIG_ENDIAN)
+ unsigned int i;
+
+ for (i = 0; i < nr; i++)
+ bits[i] = (bits[i] >> 32) | ((u64)(u32)bits[i] << 32);
+#endif
+}
+
+/**
+ * bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area
+ * @it: The new bpf_iter_bits to be created
+ * @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over
+ * @nr_words: The size of the specified memory area, measured in 8-byte units.
+ * The maximum value of @nr_words is @BITS_ITER_NR_WORDS_MAX. This limit may be
+ * further reduced by the BPF memory allocator implementation.
+ *
+ * This function initializes a new bpf_iter_bits structure for iterating over
+ * a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It
+ * copies the data of the memory area to the newly created bpf_iter_bits @it for
+ * subsequent iteration operations.
+ *
+ * On success, 0 is returned. On failure, ERR is returned.
+ */
+__bpf_kfunc int
+bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_words)
+{
+ struct bpf_iter_bits_kern *kit = (void *)it;
+ u32 nr_bytes = nr_words * sizeof(u64);
+ u32 nr_bits = BYTES_TO_BITS(nr_bytes);
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_bits_kern) != sizeof(struct bpf_iter_bits));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_bits_kern) !=
+ __alignof__(struct bpf_iter_bits));
+
+ kit->nr_bits = 0;
+ kit->bits_copy = 0;
+ kit->bit = -1;
+
+ if (!unsafe_ptr__ign || !nr_words)
+ return -EINVAL;
+ if (nr_words > BITS_ITER_NR_WORDS_MAX)
+ return -E2BIG;
+
+ /* Optimization for u64 mask */
+ if (nr_bits == 64) {
+ err = bpf_probe_read_kernel_common(&kit->bits_copy, nr_bytes, unsafe_ptr__ign);
+ if (err)
+ return -EFAULT;
+
+ swap_ulong_in_u64(&kit->bits_copy, nr_words);
+
+ kit->nr_bits = nr_bits;
+ return 0;
+ }
+
+ if (bpf_mem_alloc_check_size(false, nr_bytes))
+ return -E2BIG;
+
+ /* Fallback to memalloc */
+ kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes);
+ if (!kit->bits)
+ return -ENOMEM;
+
+ err = bpf_probe_read_kernel_common(kit->bits, nr_bytes, unsafe_ptr__ign);
+ if (err) {
+ bpf_mem_free(&bpf_global_ma, kit->bits);
+ return err;
+ }
+
+ swap_ulong_in_u64(kit->bits, nr_words);
+
+ kit->nr_bits = nr_bits;
+ return 0;
+}
+
+/**
+ * bpf_iter_bits_next() - Get the next bit in a bpf_iter_bits
+ * @it: The bpf_iter_bits to be checked
+ *
+ * This function returns a pointer to a number representing the value of the
+ * next bit in the bits.
+ *
+ * If there are no further bits available, it returns NULL.
+ */
+__bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it)
+{
+ struct bpf_iter_bits_kern *kit = (void *)it;
+ int bit = kit->bit, nr_bits = kit->nr_bits;
+ const void *bits;
+
+ if (!nr_bits || bit >= nr_bits)
+ return NULL;
+
+ bits = nr_bits == 64 ? &kit->bits_copy : kit->bits;
+ bit = find_next_bit(bits, nr_bits, bit + 1);
+ if (bit >= nr_bits) {
+ kit->bit = bit;
+ return NULL;
+ }
+
+ kit->bit = bit;
+ return &kit->bit;
+}
+
+/**
+ * bpf_iter_bits_destroy() - Destroy a bpf_iter_bits
+ * @it: The bpf_iter_bits to be destroyed
+ *
+ * Destroy the resource associated with the bpf_iter_bits.
+ */
+__bpf_kfunc void bpf_iter_bits_destroy(struct bpf_iter_bits *it)
+{
+ struct bpf_iter_bits_kern *kit = (void *)it;
+
+ if (kit->nr_bits <= 64)
+ return;
+ bpf_mem_free(&bpf_global_ma, kit->bits);
+}
+
+/**
+ * bpf_copy_from_user_str() - Copy a string from an unsafe user address
+ * @dst: Destination address, in kernel space. This buffer must be
+ * at least @dst__sz bytes long.
+ * @dst__sz: Maximum number of bytes to copy, includes the trailing NUL.
+ * @unsafe_ptr__ign: Source address, in user space.
+ * @flags: The only supported flag is BPF_F_PAD_ZEROS
+ *
+ * Copies a NUL-terminated string from userspace to BPF space. If user string is
+ * too long this will still ensure zero termination in the dst buffer unless
+ * buffer size is 0.
+ *
+ * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst to 0 on success and
+ * memset all of @dst on failure.
+ */
+__bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user *unsafe_ptr__ign, u64 flags)
+{
+ int ret;
+
+ if (unlikely(flags & ~BPF_F_PAD_ZEROS))
+ return -EINVAL;
+
+ if (unlikely(!dst__sz))
+ return 0;
+
+ ret = strncpy_from_user(dst, unsafe_ptr__ign, dst__sz - 1);
+ if (ret < 0) {
+ if (flags & BPF_F_PAD_ZEROS)
+ memset((char *)dst, 0, dst__sz);
+
+ return ret;
+ }
+
+ if (flags & BPF_F_PAD_ZEROS)
+ memset((char *)dst + ret, 0, dst__sz - ret);
+ else
+ ((char *)dst)[ret] = '\0';
+
+ return ret + 1;
+}
+
+/**
+ * bpf_copy_from_user_task_str() - Copy a string from an task's address space
+ * @dst: Destination address, in kernel space. This buffer must be
+ * at least @dst__sz bytes long.
+ * @dst__sz: Maximum number of bytes to copy, includes the trailing NUL.
+ * @unsafe_ptr__ign: Source address in the task's address space.
+ * @tsk: The task whose address space will be used
+ * @flags: The only supported flag is BPF_F_PAD_ZEROS
+ *
+ * Copies a NUL terminated string from a task's address space to @dst__sz
+ * buffer. If user string is too long this will still ensure zero termination
+ * in the @dst__sz buffer unless buffer size is 0.
+ *
+ * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst__sz to 0 on success
+ * and memset all of @dst__sz on failure.
+ *
+ * Return: The number of copied bytes on success including the NUL terminator.
+ * A negative error code on failure.
+ */
+__bpf_kfunc int bpf_copy_from_user_task_str(void *dst, u32 dst__sz,
+ const void __user *unsafe_ptr__ign,
+ struct task_struct *tsk, u64 flags)
+{
+ int ret;
+
+ if (unlikely(flags & ~BPF_F_PAD_ZEROS))
+ return -EINVAL;
+
+ if (unlikely(dst__sz == 0))
+ return 0;
+
+ ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_ptr__ign, dst, dst__sz, 0);
+ if (ret < 0) {
+ if (flags & BPF_F_PAD_ZEROS)
+ memset(dst, 0, dst__sz);
+ return ret;
+ }
+
+ if (flags & BPF_F_PAD_ZEROS)
+ memset(dst + ret, 0, dst__sz - ret);
+
+ return ret + 1;
+}
+
+/* Keep unsinged long in prototype so that kfunc is usable when emitted to
+ * vmlinux.h in BPF programs directly, but note that while in BPF prog, the
+ * unsigned long always points to 8-byte region on stack, the kernel may only
+ * read and write the 4-bytes on 32-bit.
+ */
+__bpf_kfunc void bpf_local_irq_save(unsigned long *flags__irq_flag)
+{
+ local_irq_save(*flags__irq_flag);
+}
+
+__bpf_kfunc void bpf_local_irq_restore(unsigned long *flags__irq_flag)
+{
+ local_irq_restore(*flags__irq_flag);
+}
+
+__bpf_kfunc void __bpf_trap(void)
+{
+}
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(generic_btf_ids)
@@ -2560,11 +3293,16 @@ BTF_ID_FLAGS(func, bpf_list_push_front_impl)
BTF_ID_FLAGS(func, bpf_list_push_back_impl)
BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_rbtree_add_impl)
BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_root, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_left, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_right, KF_RET_NULL)
#ifdef CONFIG_CGROUPS
BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
@@ -2575,7 +3313,11 @@ BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU)
BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
#endif
BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_throw)
+#ifdef CONFIG_BPF_EVENTS
+BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS)
+#endif
BTF_KFUNCS_END(generic_btf_ids)
static const struct btf_kfunc_id_set generic_kfunc_set = {
@@ -2593,8 +3335,8 @@ BTF_ID(func, bpf_cgroup_release_dtor)
#endif
BTF_KFUNCS_START(common_btf_ids)
-BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx)
-BTF_ID_FLAGS(func, bpf_rdonly_cast)
+BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx, KF_FASTCALL)
+BTF_ID_FLAGS(func, bpf_rdonly_cast, KF_FASTCALL)
BTF_ID_FLAGS(func, bpf_rcu_read_lock)
BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
@@ -2621,6 +3363,40 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_null)
BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
BTF_ID_FLAGS(func, bpf_dynptr_size)
BTF_ID_FLAGS(func, bpf_dynptr_clone)
+BTF_ID_FLAGS(func, bpf_dynptr_copy)
+#ifdef CONFIG_NET
+BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
+#endif
+BTF_ID_FLAGS(func, bpf_wq_init)
+BTF_ID_FLAGS(func, bpf_wq_set_callback_impl)
+BTF_ID_FLAGS(func, bpf_wq_start)
+BTF_ID_FLAGS(func, bpf_preempt_disable)
+BTF_ID_FLAGS(func, bpf_preempt_enable)
+BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW)
+BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_copy_from_user_task_str, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_get_kmem_cache)
+BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_local_irq_save)
+BTF_ID_FLAGS(func, bpf_local_irq_restore)
+BTF_ID_FLAGS(func, bpf_probe_read_user_dynptr)
+BTF_ID_FLAGS(func, bpf_probe_read_kernel_dynptr)
+BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr)
+BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr)
+BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+#ifdef CONFIG_DMA_SHARED_BUFFER
+BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
+#endif
+BTF_ID_FLAGS(func, __bpf_trap)
BTF_KFUNCS_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
@@ -2648,6 +3424,8 @@ static int __init kfunc_init(void)
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &generic_kfunc_set);
ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
ARRAY_SIZE(generic_dtors),
THIS_MODULE);
@@ -2661,7 +3439,9 @@ late_initcall(kfunc_init);
*/
const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len)
{
- return bpf_dynptr_slice(ptr, 0, NULL, len);
+ const struct bpf_dynptr *p = (struct bpf_dynptr *)ptr;
+
+ return bpf_dynptr_slice(p, 0, NULL, len);
}
/* Get a pointer to dynptr data up to len bytes for read write access. If
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index af5d2ffadd70..5c2e96b19392 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -150,14 +150,14 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
}
-static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR);
if (IS_ERR(inode))
- return PTR_ERR(inode);
+ return ERR_CAST(inode);
inode->i_op = &bpf_dir_iops;
inode->i_fop = &simple_dir_operations;
@@ -166,7 +166,7 @@ static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
inc_nlink(dir);
bpf_dentry_finalize(dentry, inode, dir);
- return 0;
+ return NULL;
}
struct map_iter {
@@ -421,7 +421,7 @@ static int bpf_iter_link_pin_kernel(struct dentry *parent,
int ret;
inode_lock(parent->d_inode);
- dentry = lookup_one_len(name, parent, strlen(name));
+ dentry = lookup_noperm(&QSTR(name), parent);
if (IS_ERR(dentry)) {
inode_unlock(parent->d_inode);
return PTR_ERR(dentry);
@@ -709,10 +709,10 @@ static void seq_print_delegate_opts(struct seq_file *m,
msk = 1ULL << e->val;
if (delegate_msk & msk) {
/* emit lower-case name without prefix */
- seq_printf(m, "%c", first ? '=' : ':');
+ seq_putc(m, first ? '=' : ':');
name += pfx_len;
while (*name) {
- seq_printf(m, "%c", tolower(*name));
+ seq_putc(m, tolower(*name));
name++;
}
@@ -880,7 +880,7 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
const struct btf_type *enum_t;
const char *enum_pfx;
u64 *delegate_msk, msk = 0;
- char *p;
+ char *p, *str;
int val;
/* ignore errors, fallback to hex */
@@ -911,7 +911,8 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
return -EINVAL;
}
- while ((p = strsep(&param->string, ":"))) {
+ str = param->string;
+ while ((p = strsep(&str, ":"))) {
if (strcmp(p, "any") == 0) {
msk |= ~0ULL;
} else if (find_btf_enum_const(info.btf, enum_t, enum_pfx, p, &val)) {
diff --git a/kernel/bpf/kmem_cache_iter.c b/kernel/bpf/kmem_cache_iter.c
new file mode 100644
index 000000000000..3ae2158d767f
--- /dev/null
+++ b/kernel/bpf/kmem_cache_iter.c
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Google */
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+
+#include "../../mm/slab.h" /* kmem_cache, slab_caches and slab_mutex */
+
+/* open-coded version */
+struct bpf_iter_kmem_cache {
+ __u64 __opaque[1];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_kmem_cache_kern {
+ struct kmem_cache *pos;
+} __attribute__((aligned(8)));
+
+#define KMEM_CACHE_POS_START ((void *)1L)
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_kmem_cache_new(struct bpf_iter_kmem_cache *it)
+{
+ struct bpf_iter_kmem_cache_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(*kit) > sizeof(*it));
+ BUILD_BUG_ON(__alignof__(*kit) != __alignof__(*it));
+
+ kit->pos = KMEM_CACHE_POS_START;
+ return 0;
+}
+
+__bpf_kfunc struct kmem_cache *bpf_iter_kmem_cache_next(struct bpf_iter_kmem_cache *it)
+{
+ struct bpf_iter_kmem_cache_kern *kit = (void *)it;
+ struct kmem_cache *prev = kit->pos;
+ struct kmem_cache *next;
+ bool destroy = false;
+
+ if (!prev)
+ return NULL;
+
+ mutex_lock(&slab_mutex);
+
+ if (list_empty(&slab_caches)) {
+ mutex_unlock(&slab_mutex);
+ return NULL;
+ }
+
+ if (prev == KMEM_CACHE_POS_START)
+ next = list_first_entry(&slab_caches, struct kmem_cache, list);
+ else if (list_last_entry(&slab_caches, struct kmem_cache, list) == prev)
+ next = NULL;
+ else
+ next = list_next_entry(prev, list);
+
+ /* boot_caches have negative refcount, don't touch them */
+ if (next && next->refcount > 0)
+ next->refcount++;
+
+ /* Skip kmem_cache_destroy() for active entries */
+ if (prev && prev != KMEM_CACHE_POS_START) {
+ if (prev->refcount > 1)
+ prev->refcount--;
+ else if (prev->refcount == 1)
+ destroy = true;
+ }
+
+ mutex_unlock(&slab_mutex);
+
+ if (destroy)
+ kmem_cache_destroy(prev);
+
+ kit->pos = next;
+ return next;
+}
+
+__bpf_kfunc void bpf_iter_kmem_cache_destroy(struct bpf_iter_kmem_cache *it)
+{
+ struct bpf_iter_kmem_cache_kern *kit = (void *)it;
+ struct kmem_cache *s = kit->pos;
+ bool destroy = false;
+
+ if (s == NULL || s == KMEM_CACHE_POS_START)
+ return;
+
+ mutex_lock(&slab_mutex);
+
+ /* Skip kmem_cache_destroy() for active entries */
+ if (s->refcount > 1)
+ s->refcount--;
+ else if (s->refcount == 1)
+ destroy = true;
+
+ mutex_unlock(&slab_mutex);
+
+ if (destroy)
+ kmem_cache_destroy(s);
+}
+
+__bpf_kfunc_end_defs();
+
+struct bpf_iter__kmem_cache {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct kmem_cache *, s);
+};
+
+union kmem_cache_iter_priv {
+ struct bpf_iter_kmem_cache it;
+ struct bpf_iter_kmem_cache_kern kit;
+};
+
+static void *kmem_cache_iter_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ loff_t cnt = 0;
+ bool found = false;
+ struct kmem_cache *s;
+ union kmem_cache_iter_priv *p = seq->private;
+
+ mutex_lock(&slab_mutex);
+
+ /* Find an entry at the given position in the slab_caches list instead
+ * of keeping a reference (of the last visited entry, if any) out of
+ * slab_mutex. It might miss something if one is deleted in the middle
+ * while it releases the lock. But it should be rare and there's not
+ * much we can do about it.
+ */
+ list_for_each_entry(s, &slab_caches, list) {
+ if (cnt == *pos) {
+ /* Make sure this entry remains in the list by getting
+ * a new reference count. Note that boot_cache entries
+ * have a negative refcount, so don't touch them.
+ */
+ if (s->refcount > 0)
+ s->refcount++;
+ found = true;
+ break;
+ }
+ cnt++;
+ }
+ mutex_unlock(&slab_mutex);
+
+ if (!found)
+ s = NULL;
+
+ p->kit.pos = s;
+ return s;
+}
+
+static void kmem_cache_iter_seq_stop(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_meta meta;
+ struct bpf_iter__kmem_cache ctx = {
+ .meta = &meta,
+ .s = v,
+ };
+ union kmem_cache_iter_priv *p = seq->private;
+ struct bpf_prog *prog;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, true);
+ if (prog && !ctx.s)
+ bpf_iter_run_prog(prog, &ctx);
+
+ bpf_iter_kmem_cache_destroy(&p->it);
+}
+
+static void *kmem_cache_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ union kmem_cache_iter_priv *p = seq->private;
+
+ ++*pos;
+
+ return bpf_iter_kmem_cache_next(&p->it);
+}
+
+static int kmem_cache_iter_seq_show(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_meta meta;
+ struct bpf_iter__kmem_cache ctx = {
+ .meta = &meta,
+ .s = v,
+ };
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, false);
+ if (prog)
+ ret = bpf_iter_run_prog(prog, &ctx);
+
+ return ret;
+}
+
+static const struct seq_operations kmem_cache_iter_seq_ops = {
+ .start = kmem_cache_iter_seq_start,
+ .next = kmem_cache_iter_seq_next,
+ .stop = kmem_cache_iter_seq_stop,
+ .show = kmem_cache_iter_seq_show,
+};
+
+BTF_ID_LIST_GLOBAL_SINGLE(bpf_kmem_cache_btf_id, struct, kmem_cache)
+
+static const struct bpf_iter_seq_info kmem_cache_iter_seq_info = {
+ .seq_ops = &kmem_cache_iter_seq_ops,
+ .seq_priv_size = sizeof(union kmem_cache_iter_priv),
+};
+
+static void bpf_iter_kmem_cache_show_fdinfo(const struct bpf_iter_aux_info *aux,
+ struct seq_file *seq)
+{
+ seq_puts(seq, "kmem_cache iter\n");
+}
+
+DEFINE_BPF_ITER_FUNC(kmem_cache, struct bpf_iter_meta *meta,
+ struct kmem_cache *s)
+
+static struct bpf_iter_reg bpf_kmem_cache_reg_info = {
+ .target = "kmem_cache",
+ .feature = BPF_ITER_RESCHED,
+ .show_fdinfo = bpf_iter_kmem_cache_show_fdinfo,
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__kmem_cache, s),
+ PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
+ },
+ .seq_info = &kmem_cache_iter_seq_info,
+};
+
+static int __init bpf_kmem_cache_iter_init(void)
+{
+ bpf_kmem_cache_reg_info.ctx_arg_info[0].btf_id = bpf_kmem_cache_btf_id[0];
+ return bpf_iter_reg_target(&bpf_kmem_cache_reg_info);
+}
+
+late_initcall(bpf_kmem_cache_iter_init);
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index a04f505aefe9..3969eb0382af 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -431,7 +431,7 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key,
seq_puts(m, ": ");
btf_type_seq_show(map->btf, map->btf_value_type_id,
&READ_ONCE(storage->buf)->data[0], m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
} else {
seq_puts(m, ": {\n");
for_each_possible_cpu(cpu) {
@@ -439,7 +439,7 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key,
btf_type_seq_show(map->btf, map->btf_value_type_id,
per_cpu_ptr(storage->percpu_buf, cpu),
m);
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
}
seq_puts(m, "}\n");
}
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 2a243cf37c60..38050f4ee400 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -91,7 +91,7 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
goto fail;
} else {
u64 new_end, new_start;
- u32 buf_start, buf_end, new_n;
+ u32 buf_start, buf_end;
new_end = log->end_pos + n;
if (new_end - log->start_pos >= log->len_total)
@@ -467,9 +467,9 @@ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type)
if (type & PTR_MAYBE_NULL) {
if (base_type(type) == PTR_TO_BTF_ID)
- strncpy(postfix, "or_null_", 16);
+ strscpy(postfix, "or_null_");
else
- strncpy(postfix, "_or_null", 16);
+ strscpy(postfix, "_or_null");
}
snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s",
@@ -537,6 +537,7 @@ static char slot_type_char[] = {
[STACK_ZERO] = '0',
[STACK_DYNPTR] = 'd',
[STACK_ITER] = 'i',
+ [STACK_IRQ_FLAG] = 'f'
};
static void print_liveness(struct bpf_verifier_env *env,
@@ -688,8 +689,7 @@ static void print_reg_state(struct bpf_verifier_env *env,
if (t == SCALAR_VALUE && reg->precise)
verbose(env, "P");
if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) {
- /* reg->off should be 0 for SCALAR_VALUE */
- verbose_snum(env, reg->var_off.value + reg->off);
+ verbose_snum(env, reg->var_off.value);
return;
}
@@ -708,7 +708,9 @@ static void print_reg_state(struct bpf_verifier_env *env,
verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id));
verbose(env, "(");
if (reg->id)
- verbose_a("id=%d", reg->id);
+ verbose_a("id=%d", reg->id & ~BPF_ADD_CONST);
+ if (reg->id & BPF_ADD_CONST)
+ verbose(env, "%+d", reg->off);
if (reg->ref_obj_id)
verbose_a("ref_obj_id=%d", reg->ref_obj_id);
if (type_is_non_owning_ref(reg->type))
@@ -752,9 +754,10 @@ static void print_reg_state(struct bpf_verifier_env *env,
verbose(env, ")");
}
-void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_state *state,
- bool print_all)
+void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate,
+ u32 frameno, bool print_all)
{
+ const struct bpf_func_state *state = vstate->frame[frameno];
const struct bpf_reg_state *reg;
int i;
@@ -842,11 +845,11 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_st
break;
}
}
- if (state->acquired_refs && state->refs[0].id) {
- verbose(env, " refs=%d", state->refs[0].id);
- for (i = 1; i < state->acquired_refs; i++)
- if (state->refs[i].id)
- verbose(env, ",%d", state->refs[i].id);
+ if (vstate->acquired_refs && vstate->refs[0].id) {
+ verbose(env, " refs=%d", vstate->refs[0].id);
+ for (i = 1; i < vstate->acquired_refs; i++)
+ if (vstate->refs[i].id)
+ verbose(env, ",%d", vstate->refs[i].id);
}
if (state->in_callback_fn)
verbose(env, " cb");
@@ -863,7 +866,8 @@ static inline u32 vlog_alignment(u32 pos)
BPF_LOG_MIN_ALIGNMENT) - pos - 1;
}
-void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state)
+void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate,
+ u32 frameno)
{
if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) {
/* remove new line character */
@@ -872,5 +876,5 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state
} else {
verbose(env, "%d:", env->insn_idx);
}
- print_verifier_state(env, state, false);
+ print_verifier_state(env, vstate, frameno, false);
}
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 050fe1ebf0f7..be66d7e520e0 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -15,6 +15,8 @@
#include <net/ipv6.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
+#include <asm/rqspinlock.h>
+#include <linux/bpf_mem_alloc.h>
/* Intermediate node */
#define LPM_TREE_NODE_FLAG_IM BIT(0)
@@ -22,7 +24,6 @@
struct lpm_trie_node;
struct lpm_trie_node {
- struct rcu_head rcu;
struct lpm_trie_node __rcu *child[2];
u32 prefixlen;
u32 flags;
@@ -32,10 +33,11 @@ struct lpm_trie_node {
struct lpm_trie {
struct bpf_map map;
struct lpm_trie_node __rcu *root;
+ struct bpf_mem_alloc ma;
size_t n_entries;
size_t max_prefixlen;
size_t data_size;
- spinlock_t lock;
+ rqspinlock_t lock;
};
/* This trie implements a longest prefix match algorithm that can be used to
@@ -155,16 +157,17 @@ static inline int extract_bit(const u8 *data, size_t index)
}
/**
- * longest_prefix_match() - determine the longest prefix
+ * __longest_prefix_match() - determine the longest prefix
* @trie: The trie to get internal sizes from
* @node: The node to operate on
* @key: The key to compare to @node
*
* Determine the longest prefix of @node that matches the bits in @key.
*/
-static size_t longest_prefix_match(const struct lpm_trie *trie,
- const struct lpm_trie_node *node,
- const struct bpf_lpm_trie_key_u8 *key)
+static __always_inline
+size_t __longest_prefix_match(const struct lpm_trie *trie,
+ const struct lpm_trie_node *node,
+ const struct bpf_lpm_trie_key_u8 *key)
{
u32 limit = min(node->prefixlen, key->prefixlen);
u32 prefixlen = 0, i = 0;
@@ -224,6 +227,13 @@ static size_t longest_prefix_match(const struct lpm_trie *trie,
return prefixlen;
}
+static size_t longest_prefix_match(const struct lpm_trie *trie,
+ const struct lpm_trie_node *node,
+ const struct bpf_lpm_trie_key_u8 *key)
+{
+ return __longest_prefix_match(trie, node, key);
+}
+
/* Called from syscall or from eBPF program */
static void *trie_lookup_elem(struct bpf_map *map, void *_key)
{
@@ -245,7 +255,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
* If it's the maximum possible prefix for this trie, we have
* an exact match and can return it directly.
*/
- matchlen = longest_prefix_match(trie, node, key);
+ matchlen = __longest_prefix_match(trie, node, key);
if (matchlen == trie->max_prefixlen) {
found = node;
break;
@@ -279,17 +289,13 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
return found->data + trie->data_size;
}
-static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
+static struct lpm_trie_node *lpm_trie_node_alloc(struct lpm_trie *trie,
const void *value)
{
struct lpm_trie_node *node;
- size_t size = sizeof(struct lpm_trie_node) + trie->data_size;
- if (value)
- size += trie->map.value_size;
+ node = bpf_mem_cache_alloc(&trie->ma);
- node = bpf_map_kmalloc_node(&trie->map, size, GFP_NOWAIT | __GFP_NOWARN,
- trie->map.numa_node);
if (!node)
return NULL;
@@ -302,12 +308,23 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
return node;
}
+static int trie_check_add_elem(struct lpm_trie *trie, u64 flags)
+{
+ if (flags == BPF_EXIST)
+ return -ENOENT;
+ if (trie->n_entries == trie->map.max_entries)
+ return -ENOSPC;
+ trie->n_entries++;
+ return 0;
+}
+
/* Called from syscall or from eBPF program */
static long trie_update_elem(struct bpf_map *map,
void *_key, void *value, u64 flags)
{
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
- struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
+ struct lpm_trie_node *node, *im_node, *new_node;
+ struct lpm_trie_node *free_node = NULL;
struct lpm_trie_node __rcu **slot;
struct bpf_lpm_trie_key_u8 *key = _key;
unsigned long irq_flags;
@@ -321,22 +338,14 @@ static long trie_update_elem(struct bpf_map *map,
if (key->prefixlen > trie->max_prefixlen)
return -EINVAL;
- spin_lock_irqsave(&trie->lock, irq_flags);
-
/* Allocate and fill a new node */
-
- if (trie->n_entries == trie->map.max_entries) {
- ret = -ENOSPC;
- goto out;
- }
-
new_node = lpm_trie_node_alloc(trie, value);
- if (!new_node) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!new_node)
+ return -ENOMEM;
- trie->n_entries++;
+ ret = raw_res_spin_lock_irqsave(&trie->lock, irq_flags);
+ if (ret)
+ goto out_free;
new_node->prefixlen = key->prefixlen;
RCU_INIT_POINTER(new_node->child[0], NULL);
@@ -350,13 +359,11 @@ static long trie_update_elem(struct bpf_map *map,
*/
slot = &trie->root;
- while ((node = rcu_dereference_protected(*slot,
- lockdep_is_held(&trie->lock)))) {
+ while ((node = rcu_dereference(*slot))) {
matchlen = longest_prefix_match(trie, node, key);
if (node->prefixlen != matchlen ||
- node->prefixlen == key->prefixlen ||
- node->prefixlen == trie->max_prefixlen)
+ node->prefixlen == key->prefixlen)
break;
next_bit = extract_bit(key->data, node->prefixlen);
@@ -367,6 +374,10 @@ static long trie_update_elem(struct bpf_map *map,
* simply assign the @new_node to that slot and be done.
*/
if (!node) {
+ ret = trie_check_add_elem(trie, flags);
+ if (ret)
+ goto out;
+
rcu_assign_pointer(*slot, new_node);
goto out;
}
@@ -375,18 +386,30 @@ static long trie_update_elem(struct bpf_map *map,
* which already has the correct data array set.
*/
if (node->prefixlen == matchlen) {
+ if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) {
+ if (flags == BPF_NOEXIST) {
+ ret = -EEXIST;
+ goto out;
+ }
+ } else {
+ ret = trie_check_add_elem(trie, flags);
+ if (ret)
+ goto out;
+ }
+
new_node->child[0] = node->child[0];
new_node->child[1] = node->child[1];
- if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
- trie->n_entries--;
-
rcu_assign_pointer(*slot, new_node);
- kfree_rcu(node, rcu);
+ free_node = node;
goto out;
}
+ ret = trie_check_add_elem(trie, flags);
+ if (ret)
+ goto out;
+
/* If the new node matches the prefix completely, it must be inserted
* as an ancestor. Simply insert it between @node and *@slot.
*/
@@ -399,6 +422,7 @@ static long trie_update_elem(struct bpf_map *map,
im_node = lpm_trie_node_alloc(trie, NULL);
if (!im_node) {
+ trie->n_entries--;
ret = -ENOMEM;
goto out;
}
@@ -420,15 +444,11 @@ static long trie_update_elem(struct bpf_map *map,
rcu_assign_pointer(*slot, im_node);
out:
- if (ret) {
- if (new_node)
- trie->n_entries--;
-
- kfree(new_node);
- kfree(im_node);
- }
-
- spin_unlock_irqrestore(&trie->lock, irq_flags);
+ raw_res_spin_unlock_irqrestore(&trie->lock, irq_flags);
+out_free:
+ if (ret)
+ bpf_mem_cache_free(&trie->ma, new_node);
+ bpf_mem_cache_free_rcu(&trie->ma, free_node);
return ret;
}
@@ -437,6 +457,7 @@ out:
static long trie_delete_elem(struct bpf_map *map, void *_key)
{
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ struct lpm_trie_node *free_node = NULL, *free_parent = NULL;
struct bpf_lpm_trie_key_u8 *key = _key;
struct lpm_trie_node __rcu **trim, **trim2;
struct lpm_trie_node *node, *parent;
@@ -448,7 +469,9 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
if (key->prefixlen > trie->max_prefixlen)
return -EINVAL;
- spin_lock_irqsave(&trie->lock, irq_flags);
+ ret = raw_res_spin_lock_irqsave(&trie->lock, irq_flags);
+ if (ret)
+ return ret;
/* Walk the tree looking for an exact key/length match and keeping
* track of the path we traverse. We will need to know the node
@@ -459,8 +482,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
trim = &trie->root;
trim2 = trim;
parent = NULL;
- while ((node = rcu_dereference_protected(
- *trim, lockdep_is_held(&trie->lock)))) {
+ while ((node = rcu_dereference(*trim))) {
matchlen = longest_prefix_match(trie, node, key);
if (node->prefixlen != matchlen ||
@@ -506,8 +528,8 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
else
rcu_assign_pointer(
*trim2, rcu_access_pointer(parent->child[0]));
- kfree_rcu(parent, rcu);
- kfree_rcu(node, rcu);
+ free_parent = parent;
+ free_node = node;
goto out;
}
@@ -521,10 +543,13 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1]));
else
RCU_INIT_POINTER(*trim, NULL);
- kfree_rcu(node, rcu);
+ free_node = node;
out:
- spin_unlock_irqrestore(&trie->lock, irq_flags);
+ raw_res_spin_unlock_irqrestore(&trie->lock, irq_flags);
+
+ bpf_mem_cache_free_rcu(&trie->ma, free_parent);
+ bpf_mem_cache_free_rcu(&trie->ma, free_node);
return ret;
}
@@ -546,6 +571,8 @@ out:
static struct bpf_map *trie_alloc(union bpf_attr *attr)
{
struct lpm_trie *trie;
+ size_t leaf_size;
+ int err;
/* check sanity of attributes */
if (attr->max_entries == 0 ||
@@ -568,9 +595,19 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
offsetof(struct bpf_lpm_trie_key_u8, data);
trie->max_prefixlen = trie->data_size * 8;
- spin_lock_init(&trie->lock);
+ raw_res_spin_lock_init(&trie->lock);
+ /* Allocate intermediate and leaf nodes from the same allocator */
+ leaf_size = sizeof(struct lpm_trie_node) + trie->data_size +
+ trie->map.value_size;
+ err = bpf_mem_alloc_init(&trie->ma, leaf_size, false);
+ if (err)
+ goto free_out;
return &trie->map;
+
+free_out:
+ bpf_map_area_free(trie);
+ return ERR_PTR(err);
}
static void trie_free(struct bpf_map *map)
@@ -602,13 +639,17 @@ static void trie_free(struct bpf_map *map)
continue;
}
- kfree(node);
+ /* No bpf program may access the map, so freeing the
+ * node without waiting for the extra RCU GP.
+ */
+ bpf_mem_cache_raw_free(node);
RCU_INIT_POINTER(*slot, NULL);
break;
}
}
out:
+ bpf_mem_alloc_destroy(&trie->ma);
bpf_map_area_free(trie);
}
@@ -620,7 +661,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
struct lpm_trie_node **node_stack = NULL;
int err = 0, stack_ptr = -1;
unsigned int next_bit;
- size_t matchlen;
+ size_t matchlen = 0;
/* The get_next_key follows postorder. For the 4 node example in
* the top of this file, the trie_get_next_key() returns the following
@@ -642,7 +683,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
if (!key || key->prefixlen > trie->max_prefixlen)
goto find_leftmost;
- node_stack = kmalloc_array(trie->max_prefixlen,
+ node_stack = kmalloc_array(trie->max_prefixlen + 1,
sizeof(struct lpm_trie_node *),
GFP_ATOMIC | __GFP_NOWARN);
if (!node_stack)
@@ -659,7 +700,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
next_bit = extract_bit(key->data, node->prefixlen);
node = rcu_dereference(node->child[next_bit]);
}
- if (!node || node->prefixlen != key->prefixlen ||
+ if (!node || node->prefixlen != matchlen ||
(node->flags & LPM_TREE_NODE_FLAG_IM))
goto find_leftmost;
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 8ef269e66ba5..645bd30bc9a9 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -11,35 +11,27 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
{
struct bpf_map *inner_map, *inner_map_meta;
u32 inner_map_meta_size;
- struct fd f;
- int ret;
+ CLASS(fd, f)(inner_map_ufd);
- f = fdget(inner_map_ufd);
inner_map = __bpf_map_get(f);
if (IS_ERR(inner_map))
return inner_map;
/* Does not support >1 level map-in-map */
- if (inner_map->inner_map_meta) {
- ret = -EINVAL;
- goto put;
- }
+ if (inner_map->inner_map_meta)
+ return ERR_PTR(-EINVAL);
- if (!inner_map->ops->map_meta_equal) {
- ret = -ENOTSUPP;
- goto put;
- }
+ if (!inner_map->ops->map_meta_equal)
+ return ERR_PTR(-ENOTSUPP);
inner_map_meta_size = sizeof(*inner_map_meta);
/* In some cases verifier needs to access beyond just base map. */
- if (inner_map->ops == &array_map_ops)
+ if (inner_map->ops == &array_map_ops || inner_map->ops == &percpu_array_map_ops)
inner_map_meta_size = sizeof(struct bpf_array);
inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER);
- if (!inner_map_meta) {
- ret = -ENOMEM;
- goto put;
- }
+ if (!inner_map_meta)
+ return ERR_PTR(-ENOMEM);
inner_map_meta->map_type = inner_map->map_type;
inner_map_meta->key_size = inner_map->key_size;
@@ -53,8 +45,9 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
* invalid/empty/valid, but ERR_PTR in case of errors. During
* equality NULL or IS_ERR is equivalent.
*/
- ret = PTR_ERR(inner_map_meta->record);
- goto free;
+ struct bpf_map *ret = ERR_CAST(inner_map_meta->record);
+ kfree(inner_map_meta);
+ return ret;
}
/* Note: We must use the same BTF, as we also used btf_record_dup above
* which relies on BTF being same for both maps, as some members like
@@ -68,7 +61,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
/* Misc members not needed in bpf_map_meta_equal() check. */
inner_map_meta->ops = inner_map->ops;
- if (inner_map->ops == &array_map_ops) {
+ if (inner_map->ops == &array_map_ops || inner_map->ops == &percpu_array_map_ops) {
struct bpf_array *inner_array_meta =
container_of(inner_map_meta, struct bpf_array, map);
struct bpf_array *inner_array = container_of(inner_map, struct bpf_array, map);
@@ -77,14 +70,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
inner_array_meta->elem_size = inner_array->elem_size;
inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1;
}
-
- fdput(f);
return inner_map_meta;
-free:
- kfree(inner_map_meta);
-put:
- fdput(f);
- return ERR_PTR(ret);
}
void bpf_map_meta_free(struct bpf_map *map_meta)
@@ -110,9 +96,8 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map,
int ufd)
{
struct bpf_map *inner_map, *inner_map_meta;
- struct fd f;
+ CLASS(fd, f)(ufd);
- f = fdget(ufd);
inner_map = __bpf_map_get(f);
if (IS_ERR(inner_map))
return inner_map;
@@ -123,7 +108,6 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map,
else
inner_map = ERR_PTR(-EINVAL);
- fdput(f);
return inner_map;
}
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 550f02e2cb13..889374722d0a 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -35,6 +35,8 @@
*/
#define LLIST_NODE_SZ sizeof(struct llist_node)
+#define BPF_MEM_ALLOC_SIZE_MAX 4096
+
/* similar to kmalloc, but sizeof == 8 bucket is gone */
static u8 size_index[24] __ro_after_init = {
3, /* 8 */
@@ -65,7 +67,7 @@ static u8 size_index[24] __ro_after_init = {
static int bpf_mem_cache_idx(size_t size)
{
- if (!size || size > 4096)
+ if (!size || size > BPF_MEM_ALLOC_SIZE_MAX)
return -1;
if (size <= 192)
@@ -138,8 +140,8 @@ static struct llist_node notrace *__llist_del_first(struct llist_head *head)
static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
{
if (c->percpu_size) {
- void **obj = kmalloc_node(c->percpu_size, flags, node);
- void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);
+ void __percpu **obj = kmalloc_node(c->percpu_size, flags, node);
+ void __percpu *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);
if (!obj || !pptr) {
free_percpu(pptr);
@@ -155,12 +157,9 @@ static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
{
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
if (c->objcg)
return get_mem_cgroup_from_objcg(c->objcg);
-#endif
-
-#ifdef CONFIG_MEMCG
return root_mem_cgroup;
#else
return NULL;
@@ -255,11 +254,8 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
static void free_one(void *obj, bool percpu)
{
- if (percpu) {
- free_percpu(((void **)obj)[1]);
- kfree(obj);
- return;
- }
+ if (percpu)
+ free_percpu(((void __percpu **)obj)[1]);
kfree(obj);
}
@@ -512,8 +508,8 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
*/
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
{
- struct bpf_mem_caches *cc, __percpu *pcc;
- struct bpf_mem_cache *c, __percpu *pc;
+ struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc;
+ struct bpf_mem_cache *c; struct bpf_mem_cache __percpu *pc;
struct obj_cgroup *objcg = NULL;
int cpu, i, unit_size, percpu_size = 0;
@@ -534,7 +530,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
size += LLIST_NODE_SZ; /* room for llist_node */
unit_size = size;
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
if (memcg_bpf_enabled())
objcg = get_obj_cgroup_from_current();
#endif
@@ -556,7 +552,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
if (!pcc)
return -ENOMEM;
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
objcg = get_obj_cgroup_from_current();
#endif
ma->objcg = objcg;
@@ -594,7 +590,7 @@ int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg
int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size)
{
- struct bpf_mem_caches *cc, __percpu *pcc;
+ struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc;
int cpu, i, unit_size, percpu_size;
struct obj_cgroup *objcg;
struct bpf_mem_cache *c;
@@ -759,8 +755,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
}
- if (ma->objcg)
- obj_cgroup_put(ma->objcg);
+ obj_cgroup_put(ma->objcg);
destroy_mem_alloc(ma, rcu_in_progress);
}
if (ma->caches) {
@@ -776,8 +771,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
}
}
- if (ma->objcg)
- obj_cgroup_put(ma->objcg);
+ obj_cgroup_put(ma->objcg);
destroy_mem_alloc(ma, rcu_in_progress);
}
}
@@ -1010,3 +1004,13 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
return !ret ? NULL : ret + LLIST_NODE_SZ;
}
+
+int bpf_mem_alloc_check_size(bool percpu, size_t size)
+{
+ /* The size of percpu allocation doesn't have LLIST_NODE_SZ overhead */
+ if ((percpu && size > BPF_MEM_ALLOC_SIZE_MAX) ||
+ (!percpu && size > BPF_MEM_ALLOC_SIZE_MAX - LLIST_NODE_SZ))
+ return -E2BIG;
+
+ return 0;
+}
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 1a4fec330eaa..42ae8d595c2c 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -25,6 +25,7 @@
#include <linux/rhashtable.h>
#include <linux/rtnetlink.h>
#include <linux/rwsem.h>
+#include <net/netdev_lock.h>
#include <net/xdp.h>
/* Protects offdevs, members of bpf_offload_netdev and offload members
@@ -528,13 +529,14 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&offmap->map, attr);
-
rtnl_lock();
- down_write(&bpf_devs_lock);
offmap->netdev = __dev_get_by_index(net, attr->map_ifindex);
err = bpf_dev_offload_check(offmap->netdev);
if (err)
- goto err_unlock;
+ goto err_unlock_rtnl;
+
+ netdev_lock_ops(offmap->netdev);
+ down_write(&bpf_devs_lock);
ondev = bpf_offload_find_netdev(offmap->netdev);
if (!ondev) {
@@ -548,12 +550,15 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
list_add_tail(&offmap->offloads, &ondev->maps);
up_write(&bpf_devs_lock);
+ netdev_unlock_ops(offmap->netdev);
rtnl_unlock();
return &offmap->map;
err_unlock:
up_write(&bpf_devs_lock);
+ netdev_unlock_ops(offmap->netdev);
+err_unlock_rtnl:
rtnl_unlock();
bpf_map_area_free(offmap);
return ERR_PTR(err);
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 034cf87b54e9..632762b57299 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -14,11 +14,9 @@ int pcpu_freelist_init(struct pcpu_freelist *s)
for_each_possible_cpu(cpu) {
struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu);
- raw_spin_lock_init(&head->lock);
+ raw_res_spin_lock_init(&head->lock);
head->first = NULL;
}
- raw_spin_lock_init(&s->extralist.lock);
- s->extralist.first = NULL;
return 0;
}
@@ -34,58 +32,39 @@ static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head,
WRITE_ONCE(head->first, node);
}
-static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
+static inline bool ___pcpu_freelist_push(struct pcpu_freelist_head *head,
struct pcpu_freelist_node *node)
{
- raw_spin_lock(&head->lock);
- pcpu_freelist_push_node(head, node);
- raw_spin_unlock(&head->lock);
-}
-
-static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s,
- struct pcpu_freelist_node *node)
-{
- if (!raw_spin_trylock(&s->extralist.lock))
+ if (raw_res_spin_lock(&head->lock))
return false;
-
- pcpu_freelist_push_node(&s->extralist, node);
- raw_spin_unlock(&s->extralist.lock);
+ pcpu_freelist_push_node(head, node);
+ raw_res_spin_unlock(&head->lock);
return true;
}
-static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s,
- struct pcpu_freelist_node *node)
+void __pcpu_freelist_push(struct pcpu_freelist *s,
+ struct pcpu_freelist_node *node)
{
- int cpu, orig_cpu;
+ struct pcpu_freelist_head *head;
+ int cpu;
- orig_cpu = raw_smp_processor_id();
- while (1) {
- for_each_cpu_wrap(cpu, cpu_possible_mask, orig_cpu) {
- struct pcpu_freelist_head *head;
+ if (___pcpu_freelist_push(this_cpu_ptr(s->freelist), node))
+ return;
+ while (true) {
+ for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) {
+ if (cpu == raw_smp_processor_id())
+ continue;
head = per_cpu_ptr(s->freelist, cpu);
- if (raw_spin_trylock(&head->lock)) {
- pcpu_freelist_push_node(head, node);
- raw_spin_unlock(&head->lock);
- return;
- }
- }
-
- /* cannot lock any per cpu lock, try extralist */
- if (pcpu_freelist_try_push_extra(s, node))
+ if (raw_res_spin_lock(&head->lock))
+ continue;
+ pcpu_freelist_push_node(head, node);
+ raw_res_spin_unlock(&head->lock);
return;
+ }
}
}
-void __pcpu_freelist_push(struct pcpu_freelist *s,
- struct pcpu_freelist_node *node)
-{
- if (in_nmi())
- ___pcpu_freelist_push_nmi(s, node);
- else
- ___pcpu_freelist_push(this_cpu_ptr(s->freelist), node);
-}
-
void pcpu_freelist_push(struct pcpu_freelist *s,
struct pcpu_freelist_node *node)
{
@@ -120,71 +99,29 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
{
+ struct pcpu_freelist_node *node = NULL;
struct pcpu_freelist_head *head;
- struct pcpu_freelist_node *node;
int cpu;
for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) {
head = per_cpu_ptr(s->freelist, cpu);
if (!READ_ONCE(head->first))
continue;
- raw_spin_lock(&head->lock);
+ if (raw_res_spin_lock(&head->lock))
+ continue;
node = head->first;
if (node) {
WRITE_ONCE(head->first, node->next);
- raw_spin_unlock(&head->lock);
+ raw_res_spin_unlock(&head->lock);
return node;
}
- raw_spin_unlock(&head->lock);
+ raw_res_spin_unlock(&head->lock);
}
-
- /* per cpu lists are all empty, try extralist */
- if (!READ_ONCE(s->extralist.first))
- return NULL;
- raw_spin_lock(&s->extralist.lock);
- node = s->extralist.first;
- if (node)
- WRITE_ONCE(s->extralist.first, node->next);
- raw_spin_unlock(&s->extralist.lock);
- return node;
-}
-
-static struct pcpu_freelist_node *
-___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
-{
- struct pcpu_freelist_head *head;
- struct pcpu_freelist_node *node;
- int cpu;
-
- for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) {
- head = per_cpu_ptr(s->freelist, cpu);
- if (!READ_ONCE(head->first))
- continue;
- if (raw_spin_trylock(&head->lock)) {
- node = head->first;
- if (node) {
- WRITE_ONCE(head->first, node->next);
- raw_spin_unlock(&head->lock);
- return node;
- }
- raw_spin_unlock(&head->lock);
- }
- }
-
- /* cannot pop from per cpu lists, try extralist */
- if (!READ_ONCE(s->extralist.first) || !raw_spin_trylock(&s->extralist.lock))
- return NULL;
- node = s->extralist.first;
- if (node)
- WRITE_ONCE(s->extralist.first, node->next);
- raw_spin_unlock(&s->extralist.lock);
return node;
}
struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
{
- if (in_nmi())
- return ___pcpu_freelist_pop_nmi(s);
return ___pcpu_freelist_pop(s);
}
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
index 3c76553cfe57..914798b74967 100644
--- a/kernel/bpf/percpu_freelist.h
+++ b/kernel/bpf/percpu_freelist.h
@@ -5,15 +5,15 @@
#define __PERCPU_FREELIST_H__
#include <linux/spinlock.h>
#include <linux/percpu.h>
+#include <asm/rqspinlock.h>
struct pcpu_freelist_head {
struct pcpu_freelist_node *first;
- raw_spinlock_t lock;
+ rqspinlock_t lock;
};
struct pcpu_freelist {
struct pcpu_freelist_head __percpu *freelist;
- struct pcpu_freelist_head extralist;
};
struct pcpu_freelist_node {
diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c
index 0c63bc2cd895..774e5a538811 100644
--- a/kernel/bpf/preload/bpf_preload_kern.c
+++ b/kernel/bpf/preload/bpf_preload_kern.c
@@ -89,4 +89,6 @@ static void __exit fini(void)
}
late_initcall(load);
module_exit(fini);
+MODULE_IMPORT_NS("BPF_INTERNAL");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Embedded BPF programs for introspection in bpffs");
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index d869f51ea93a..9a5f94371e50 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -9,13 +9,14 @@
#include <linux/slab.h>
#include <linux/btf_ids.h>
#include "percpu_freelist.h"
+#include <asm/rqspinlock.h>
#define QUEUE_STACK_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)
struct bpf_queue_stack {
struct bpf_map map;
- raw_spinlock_t lock;
+ rqspinlock_t lock;
u32 head, tail;
u32 size; /* max_entries + 1 */
@@ -78,7 +79,7 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
qs->size = size;
- raw_spin_lock_init(&qs->lock);
+ raw_res_spin_lock_init(&qs->lock);
return &qs->map;
}
@@ -98,12 +99,8 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
int err = 0;
void *ptr;
- if (in_nmi()) {
- if (!raw_spin_trylock_irqsave(&qs->lock, flags))
- return -EBUSY;
- } else {
- raw_spin_lock_irqsave(&qs->lock, flags);
- }
+ if (raw_res_spin_lock_irqsave(&qs->lock, flags))
+ return -EBUSY;
if (queue_stack_map_is_empty(qs)) {
memset(value, 0, qs->map.value_size);
@@ -120,7 +117,7 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
}
out:
- raw_spin_unlock_irqrestore(&qs->lock, flags);
+ raw_res_spin_unlock_irqrestore(&qs->lock, flags);
return err;
}
@@ -133,12 +130,8 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
void *ptr;
u32 index;
- if (in_nmi()) {
- if (!raw_spin_trylock_irqsave(&qs->lock, flags))
- return -EBUSY;
- } else {
- raw_spin_lock_irqsave(&qs->lock, flags);
- }
+ if (raw_res_spin_lock_irqsave(&qs->lock, flags))
+ return -EBUSY;
if (queue_stack_map_is_empty(qs)) {
memset(value, 0, qs->map.value_size);
@@ -157,7 +150,7 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
qs->head = index;
out:
- raw_spin_unlock_irqrestore(&qs->lock, flags);
+ raw_res_spin_unlock_irqrestore(&qs->lock, flags);
return err;
}
@@ -203,12 +196,8 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
if (flags & BPF_NOEXIST || flags > BPF_EXIST)
return -EINVAL;
- if (in_nmi()) {
- if (!raw_spin_trylock_irqsave(&qs->lock, irq_flags))
- return -EBUSY;
- } else {
- raw_spin_lock_irqsave(&qs->lock, irq_flags);
- }
+ if (raw_res_spin_lock_irqsave(&qs->lock, irq_flags))
+ return -EBUSY;
if (queue_stack_map_is_full(qs)) {
if (!replace) {
@@ -227,7 +216,7 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
qs->head = 0;
out:
- raw_spin_unlock_irqrestore(&qs->lock, irq_flags);
+ raw_res_spin_unlock_irqrestore(&qs->lock, irq_flags);
return err;
}
diff --git a/kernel/bpf/range_tree.c b/kernel/bpf/range_tree.c
new file mode 100644
index 000000000000..37b80a23ae1a
--- /dev/null
+++ b/kernel/bpf/range_tree.c
@@ -0,0 +1,270 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#include <linux/interval_tree_generic.h>
+#include <linux/slab.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/bpf.h>
+#include "range_tree.h"
+
+/*
+ * struct range_tree is a data structure used to allocate contiguous memory
+ * ranges in bpf arena. It's a large bitmap. The contiguous sequence of bits is
+ * represented by struct range_node or 'rn' for short.
+ * rn->rn_rbnode links it into an interval tree while
+ * rn->rb_range_size links it into a second rbtree sorted by size of the range.
+ * __find_range() performs binary search and best fit algorithm to find the
+ * range less or equal requested size.
+ * range_tree_clear/set() clears or sets a range of bits in this bitmap. The
+ * adjacent ranges are merged or split at the same time.
+ *
+ * The split/merge logic is based/borrowed from XFS's xbitmap32 added
+ * in commit 6772fcc8890a ("xfs: convert xbitmap to interval tree").
+ *
+ * The implementation relies on external lock to protect rbtree-s.
+ * The alloc/free of range_node-s is done via bpf_mem_alloc.
+ *
+ * bpf arena is using range_tree to represent unallocated slots.
+ * At init time:
+ * range_tree_set(rt, 0, max);
+ * Then:
+ * start = range_tree_find(rt, len);
+ * if (start >= 0)
+ * range_tree_clear(rt, start, len);
+ * to find free range and mark slots as allocated and later:
+ * range_tree_set(rt, start, len);
+ * to mark as unallocated after use.
+ */
+struct range_node {
+ struct rb_node rn_rbnode;
+ struct rb_node rb_range_size;
+ u32 rn_start;
+ u32 rn_last; /* inclusive */
+ u32 __rn_subtree_last;
+};
+
+static struct range_node *rb_to_range_node(struct rb_node *rb)
+{
+ return rb_entry(rb, struct range_node, rb_range_size);
+}
+
+static u32 rn_size(struct range_node *rn)
+{
+ return rn->rn_last - rn->rn_start + 1;
+}
+
+/* Find range that fits best to requested size */
+static inline struct range_node *__find_range(struct range_tree *rt, u32 len)
+{
+ struct rb_node *rb = rt->range_size_root.rb_root.rb_node;
+ struct range_node *best = NULL;
+
+ while (rb) {
+ struct range_node *rn = rb_to_range_node(rb);
+
+ if (len <= rn_size(rn)) {
+ best = rn;
+ rb = rb->rb_right;
+ } else {
+ rb = rb->rb_left;
+ }
+ }
+
+ return best;
+}
+
+s64 range_tree_find(struct range_tree *rt, u32 len)
+{
+ struct range_node *rn;
+
+ rn = __find_range(rt, len);
+ if (!rn)
+ return -ENOENT;
+ return rn->rn_start;
+}
+
+/* Insert the range into rbtree sorted by the range size */
+static inline void __range_size_insert(struct range_node *rn,
+ struct rb_root_cached *root)
+{
+ struct rb_node **link = &root->rb_root.rb_node, *rb = NULL;
+ u64 size = rn_size(rn);
+ bool leftmost = true;
+
+ while (*link) {
+ rb = *link;
+ if (size > rn_size(rb_to_range_node(rb))) {
+ link = &rb->rb_left;
+ } else {
+ link = &rb->rb_right;
+ leftmost = false;
+ }
+ }
+
+ rb_link_node(&rn->rb_range_size, rb, link);
+ rb_insert_color_cached(&rn->rb_range_size, root, leftmost);
+}
+
+#define START(node) ((node)->rn_start)
+#define LAST(node) ((node)->rn_last)
+
+INTERVAL_TREE_DEFINE(struct range_node, rn_rbnode, u32,
+ __rn_subtree_last, START, LAST,
+ static inline __maybe_unused,
+ __range_it)
+
+static inline __maybe_unused void
+range_it_insert(struct range_node *rn, struct range_tree *rt)
+{
+ __range_size_insert(rn, &rt->range_size_root);
+ __range_it_insert(rn, &rt->it_root);
+}
+
+static inline __maybe_unused void
+range_it_remove(struct range_node *rn, struct range_tree *rt)
+{
+ rb_erase_cached(&rn->rb_range_size, &rt->range_size_root);
+ RB_CLEAR_NODE(&rn->rb_range_size);
+ __range_it_remove(rn, &rt->it_root);
+}
+
+static inline __maybe_unused struct range_node *
+range_it_iter_first(struct range_tree *rt, u32 start, u32 last)
+{
+ return __range_it_iter_first(&rt->it_root, start, last);
+}
+
+/* Clear the range in this range tree */
+int range_tree_clear(struct range_tree *rt, u32 start, u32 len)
+{
+ u32 last = start + len - 1;
+ struct range_node *new_rn;
+ struct range_node *rn;
+
+ while ((rn = range_it_iter_first(rt, start, last))) {
+ if (rn->rn_start < start && rn->rn_last > last) {
+ u32 old_last = rn->rn_last;
+
+ /* Overlaps with the entire clearing range */
+ range_it_remove(rn, rt);
+ rn->rn_last = start - 1;
+ range_it_insert(rn, rt);
+
+ /* Add a range */
+ migrate_disable();
+ new_rn = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node));
+ migrate_enable();
+ if (!new_rn)
+ return -ENOMEM;
+ new_rn->rn_start = last + 1;
+ new_rn->rn_last = old_last;
+ range_it_insert(new_rn, rt);
+ } else if (rn->rn_start < start) {
+ /* Overlaps with the left side of the clearing range */
+ range_it_remove(rn, rt);
+ rn->rn_last = start - 1;
+ range_it_insert(rn, rt);
+ } else if (rn->rn_last > last) {
+ /* Overlaps with the right side of the clearing range */
+ range_it_remove(rn, rt);
+ rn->rn_start = last + 1;
+ range_it_insert(rn, rt);
+ break;
+ } else {
+ /* in the middle of the clearing range */
+ range_it_remove(rn, rt);
+ migrate_disable();
+ bpf_mem_free(&bpf_global_ma, rn);
+ migrate_enable();
+ }
+ }
+ return 0;
+}
+
+/* Is the whole range set ? */
+int is_range_tree_set(struct range_tree *rt, u32 start, u32 len)
+{
+ u32 last = start + len - 1;
+ struct range_node *left;
+
+ /* Is this whole range set ? */
+ left = range_it_iter_first(rt, start, last);
+ if (left && left->rn_start <= start && left->rn_last >= last)
+ return 0;
+ return -ESRCH;
+}
+
+/* Set the range in this range tree */
+int range_tree_set(struct range_tree *rt, u32 start, u32 len)
+{
+ u32 last = start + len - 1;
+ struct range_node *right;
+ struct range_node *left;
+ int err;
+
+ /* Is this whole range already set ? */
+ left = range_it_iter_first(rt, start, last);
+ if (left && left->rn_start <= start && left->rn_last >= last)
+ return 0;
+
+ /* Clear out everything in the range we want to set. */
+ err = range_tree_clear(rt, start, len);
+ if (err)
+ return err;
+
+ /* Do we have a left-adjacent range ? */
+ left = range_it_iter_first(rt, start - 1, start - 1);
+ if (left && left->rn_last + 1 != start)
+ return -EFAULT;
+
+ /* Do we have a right-adjacent range ? */
+ right = range_it_iter_first(rt, last + 1, last + 1);
+ if (right && right->rn_start != last + 1)
+ return -EFAULT;
+
+ if (left && right) {
+ /* Combine left and right adjacent ranges */
+ range_it_remove(left, rt);
+ range_it_remove(right, rt);
+ left->rn_last = right->rn_last;
+ range_it_insert(left, rt);
+ migrate_disable();
+ bpf_mem_free(&bpf_global_ma, right);
+ migrate_enable();
+ } else if (left) {
+ /* Combine with the left range */
+ range_it_remove(left, rt);
+ left->rn_last = last;
+ range_it_insert(left, rt);
+ } else if (right) {
+ /* Combine with the right range */
+ range_it_remove(right, rt);
+ right->rn_start = start;
+ range_it_insert(right, rt);
+ } else {
+ migrate_disable();
+ left = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node));
+ migrate_enable();
+ if (!left)
+ return -ENOMEM;
+ left->rn_start = start;
+ left->rn_last = last;
+ range_it_insert(left, rt);
+ }
+ return 0;
+}
+
+void range_tree_destroy(struct range_tree *rt)
+{
+ struct range_node *rn;
+
+ while ((rn = range_it_iter_first(rt, 0, -1U))) {
+ range_it_remove(rn, rt);
+ bpf_mem_free(&bpf_global_ma, rn);
+ }
+}
+
+void range_tree_init(struct range_tree *rt)
+{
+ rt->it_root = RB_ROOT_CACHED;
+ rt->range_size_root = RB_ROOT_CACHED;
+}
diff --git a/kernel/bpf/range_tree.h b/kernel/bpf/range_tree.h
new file mode 100644
index 000000000000..ff0b9110eb71
--- /dev/null
+++ b/kernel/bpf/range_tree.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#ifndef _RANGE_TREE_H
+#define _RANGE_TREE_H 1
+
+struct range_tree {
+ /* root of interval tree */
+ struct rb_root_cached it_root;
+ /* root of rbtree of interval sizes */
+ struct rb_root_cached range_size_root;
+};
+
+void range_tree_init(struct range_tree *rt);
+void range_tree_destroy(struct range_tree *rt);
+
+int range_tree_clear(struct range_tree *rt, u32 start, u32 len);
+int range_tree_set(struct range_tree *rt, u32 start, u32 len);
+int is_range_tree_set(struct range_tree *rt, u32 start, u32 len);
+s64 range_tree_find(struct range_tree *rt, u32 len);
+
+#endif
diff --git a/kernel/bpf/relo_core.c b/kernel/bpf/relo_core.c
new file mode 100644
index 000000000000..aa822c9fcfde
--- /dev/null
+++ b/kernel/bpf/relo_core.c
@@ -0,0 +1,2 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+#include "../../tools/lib/bpf/relo_core.c"
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 4b4f9670f1a9..49b8e5a0c6b4 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -308,7 +308,7 @@ put_file_unlock:
spin_unlock_bh(&reuseport_lock);
put_file:
- fput(socket->file);
+ sockfd_put(socket);
return err;
}
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 0ee653a936ea..719d73299397 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -11,6 +11,7 @@
#include <linux/kmemleak.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
+#include <asm/rqspinlock.h>
#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)
@@ -29,7 +30,7 @@ struct bpf_ringbuf {
u64 mask;
struct page **pages;
int nr_pages;
- spinlock_t spinlock ____cacheline_aligned_in_smp;
+ rqspinlock_t spinlock ____cacheline_aligned_in_smp;
/* For user-space producer ring buffers, an atomic_t busy bit is used
* to synchronize access to the ring buffers in the kernel, rather than
* the spinlock that is used for kernel-producer ring buffers. This is
@@ -51,7 +52,8 @@ struct bpf_ringbuf {
* This prevents a user-space application from modifying the
* position and ruining in-kernel tracking. The permissions of the
* pages depend on who is producing samples: user-space or the
- * kernel.
+ * kernel. Note that the pending counter is placed in the same
+ * page as the producer, so that it shares the same cache line.
*
* Kernel-producer
* ---------------
@@ -70,6 +72,7 @@ struct bpf_ringbuf {
*/
unsigned long consumer_pos __aligned(PAGE_SIZE);
unsigned long producer_pos __aligned(PAGE_SIZE);
+ unsigned long pending_pos;
char data[] __aligned(PAGE_SIZE);
};
@@ -171,7 +174,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
if (!rb)
return NULL;
- spin_lock_init(&rb->spinlock);
+ raw_res_spin_lock_init(&rb->spinlock);
atomic_set(&rb->busy, 0);
init_waitqueue_head(&rb->waitq);
init_irq_work(&rb->work, bpf_ringbuf_notify);
@@ -179,6 +182,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
rb->mask = data_sz - 1;
rb->consumer_pos = 0;
rb->producer_pos = 0;
+ rb->pending_pos = 0;
return rb;
}
@@ -265,8 +269,6 @@ static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma
/* allow writable mapping for the consumer_pos only */
if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE)
return -EPERM;
- } else {
- vm_flags_clear(vma, VM_MAYWRITE);
}
/* remap_vmalloc_range() checks size and offset constraints */
return remap_vmalloc_range(vma, rb_map->rb,
@@ -286,8 +288,6 @@ static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma
* position, and the ring buffer data itself.
*/
return -EPERM;
- } else {
- vm_flags_clear(vma, VM_MAYWRITE);
}
/* remap_vmalloc_range() checks size and offset constraints */
return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF);
@@ -404,9 +404,9 @@ bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr)
static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
{
- unsigned long cons_pos, prod_pos, new_prod_pos, flags;
- u32 len, pg_off;
+ unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, flags;
struct bpf_ringbuf_hdr *hdr;
+ u32 len, pg_off, tmp_size, hdr_len;
if (unlikely(size > RINGBUF_MAX_RECORD_SZ))
return NULL;
@@ -417,21 +417,33 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
cons_pos = smp_load_acquire(&rb->consumer_pos);
- if (in_nmi()) {
- if (!spin_trylock_irqsave(&rb->spinlock, flags))
- return NULL;
- } else {
- spin_lock_irqsave(&rb->spinlock, flags);
- }
+ if (raw_res_spin_lock_irqsave(&rb->spinlock, flags))
+ return NULL;
+ pend_pos = rb->pending_pos;
prod_pos = rb->producer_pos;
new_prod_pos = prod_pos + len;
- /* check for out of ringbuf space by ensuring producer position
- * doesn't advance more than (ringbuf_size - 1) ahead
+ while (pend_pos < prod_pos) {
+ hdr = (void *)rb->data + (pend_pos & rb->mask);
+ hdr_len = READ_ONCE(hdr->len);
+ if (hdr_len & BPF_RINGBUF_BUSY_BIT)
+ break;
+ tmp_size = hdr_len & ~BPF_RINGBUF_DISCARD_BIT;
+ tmp_size = round_up(tmp_size + BPF_RINGBUF_HDR_SZ, 8);
+ pend_pos += tmp_size;
+ }
+ rb->pending_pos = pend_pos;
+
+ /* check for out of ringbuf space:
+ * - by ensuring producer position doesn't advance more than
+ * (ringbuf_size - 1) ahead
+ * - by ensuring oldest not yet committed record until newest
+ * record does not span more than (ringbuf_size - 1)
*/
- if (new_prod_pos - cons_pos > rb->mask) {
- spin_unlock_irqrestore(&rb->spinlock, flags);
+ if (new_prod_pos - cons_pos > rb->mask ||
+ new_prod_pos - pend_pos > rb->mask) {
+ raw_res_spin_unlock_irqrestore(&rb->spinlock, flags);
return NULL;
}
@@ -443,7 +455,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
/* pairs with consumer's smp_load_acquire() */
smp_store_release(&rb->producer_pos, new_prod_pos);
- spin_unlock_irqrestore(&rb->spinlock, flags);
+ raw_res_spin_unlock_irqrestore(&rb->spinlock, flags);
return (void *)hdr + BPF_RINGBUF_HDR_SZ;
}
@@ -613,7 +625,7 @@ const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = {
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT,
+ .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT | MEM_WRITE,
};
BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
new file mode 100644
index 000000000000..338305c8852c
--- /dev/null
+++ b/kernel/bpf/rqspinlock.c
@@ -0,0 +1,737 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Resilient Queued Spin Lock
+ *
+ * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
+ * (C) Copyright 2013-2014,2018 Red Hat, Inc.
+ * (C) Copyright 2015 Intel Corp.
+ * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
+ * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
+ *
+ * Authors: Waiman Long <longman@redhat.com>
+ * Peter Zijlstra <peterz@infradead.org>
+ * Kumar Kartikeya Dwivedi <memxor@gmail.com>
+ */
+
+#include <linux/smp.h>
+#include <linux/bug.h>
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/mutex.h>
+#include <linux/prefetch.h>
+#include <asm/byteorder.h>
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm/qspinlock.h>
+#endif
+#include <trace/events/lock.h>
+#include <asm/rqspinlock.h>
+#include <linux/timekeeping.h>
+
+/*
+ * Include queued spinlock definitions and statistics code
+ */
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include "../locking/qspinlock.h"
+#include "../locking/lock_events.h"
+#include "rqspinlock.h"
+#include "../locking/mcs_spinlock.h"
+#endif
+
+/*
+ * The basic principle of a queue-based spinlock can best be understood
+ * by studying a classic queue-based spinlock implementation called the
+ * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable
+ * Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and
+ * Scott") is available at
+ *
+ * https://bugzilla.kernel.org/show_bug.cgi?id=206115
+ *
+ * This queued spinlock implementation is based on the MCS lock, however to
+ * make it fit the 4 bytes we assume spinlock_t to be, and preserve its
+ * existing API, we must modify it somehow.
+ *
+ * In particular; where the traditional MCS lock consists of a tail pointer
+ * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
+ * unlock the next pending (next->locked), we compress both these: {tail,
+ * next->locked} into a single u32 value.
+ *
+ * Since a spinlock disables recursion of its own context and there is a limit
+ * to the contexts that can nest; namely: task, softirq, hardirq, nmi. As there
+ * are at most 4 nesting levels, it can be encoded by a 2-bit number. Now
+ * we can encode the tail by combining the 2-bit nesting level with the cpu
+ * number. With one byte for the lock value and 3 bytes for the tail, only a
+ * 32-bit word is now needed. Even though we only need 1 bit for the lock,
+ * we extend it to a full byte to achieve better performance for architectures
+ * that support atomic byte write.
+ *
+ * We also change the first spinner to spin on the lock bit instead of its
+ * node; whereby avoiding the need to carry a node from lock to unlock, and
+ * preserving existing lock API. This also makes the unlock code simpler and
+ * faster.
+ *
+ * N.B. The current implementation only supports architectures that allow
+ * atomic operations on smaller 8-bit and 16-bit data types.
+ *
+ */
+
+struct rqspinlock_timeout {
+ u64 timeout_end;
+ u64 duration;
+ u64 cur;
+ u16 spin;
+};
+
+#define RES_TIMEOUT_VAL 2
+
+DEFINE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);
+EXPORT_SYMBOL_GPL(rqspinlock_held_locks);
+
+static bool is_lock_released(rqspinlock_t *lock, u32 mask, struct rqspinlock_timeout *ts)
+{
+ if (!(atomic_read_acquire(&lock->val) & (mask)))
+ return true;
+ return false;
+}
+
+static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask,
+ struct rqspinlock_timeout *ts)
+{
+ struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
+ int cnt = min(RES_NR_HELD, rqh->cnt);
+
+ /*
+ * Return an error if we hold the lock we are attempting to acquire.
+ * We'll iterate over max 32 locks; no need to do is_lock_released.
+ */
+ for (int i = 0; i < cnt - 1; i++) {
+ if (rqh->locks[i] == lock)
+ return -EDEADLK;
+ }
+ return 0;
+}
+
+/*
+ * This focuses on the most common case of ABBA deadlocks (or ABBA involving
+ * more locks, which reduce to ABBA). This is not exhaustive, and we rely on
+ * timeouts as the final line of defense.
+ */
+static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask,
+ struct rqspinlock_timeout *ts)
+{
+ struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
+ int rqh_cnt = min(RES_NR_HELD, rqh->cnt);
+ void *remote_lock;
+ int cpu;
+
+ /*
+ * Find the CPU holding the lock that we want to acquire. If there is a
+ * deadlock scenario, we will read a stable set on the remote CPU and
+ * find the target. This would be a constant time operation instead of
+ * O(NR_CPUS) if we could determine the owning CPU from a lock value, but
+ * that requires increasing the size of the lock word.
+ */
+ for_each_possible_cpu(cpu) {
+ struct rqspinlock_held *rqh_cpu = per_cpu_ptr(&rqspinlock_held_locks, cpu);
+ int real_cnt = READ_ONCE(rqh_cpu->cnt);
+ int cnt = min(RES_NR_HELD, real_cnt);
+
+ /*
+ * Let's ensure to break out of this loop if the lock is available for
+ * us to potentially acquire.
+ */
+ if (is_lock_released(lock, mask, ts))
+ return 0;
+
+ /*
+ * Skip ourselves, and CPUs whose count is less than 2, as they need at
+ * least one held lock and one acquisition attempt (reflected as top
+ * most entry) to participate in an ABBA deadlock.
+ *
+ * If cnt is more than RES_NR_HELD, it means the current lock being
+ * acquired won't appear in the table, and other locks in the table are
+ * already held, so we can't determine ABBA.
+ */
+ if (cpu == smp_processor_id() || real_cnt < 2 || real_cnt > RES_NR_HELD)
+ continue;
+
+ /*
+ * Obtain the entry at the top, this corresponds to the lock the
+ * remote CPU is attempting to acquire in a deadlock situation,
+ * and would be one of the locks we hold on the current CPU.
+ */
+ remote_lock = READ_ONCE(rqh_cpu->locks[cnt - 1]);
+ /*
+ * If it is NULL, we've raced and cannot determine a deadlock
+ * conclusively, skip this CPU.
+ */
+ if (!remote_lock)
+ continue;
+ /*
+ * Find if the lock we're attempting to acquire is held by this CPU.
+ * Don't consider the topmost entry, as that must be the latest lock
+ * being held or acquired. For a deadlock, the target CPU must also
+ * attempt to acquire a lock we hold, so for this search only 'cnt - 1'
+ * entries are important.
+ */
+ for (int i = 0; i < cnt - 1; i++) {
+ if (READ_ONCE(rqh_cpu->locks[i]) != lock)
+ continue;
+ /*
+ * We found our lock as held on the remote CPU. Is the
+ * acquisition attempt on the remote CPU for a lock held
+ * by us? If so, we have a deadlock situation, and need
+ * to recover.
+ */
+ for (int i = 0; i < rqh_cnt - 1; i++) {
+ if (rqh->locks[i] == remote_lock)
+ return -EDEADLK;
+ }
+ /*
+ * Inconclusive; retry again later.
+ */
+ return 0;
+ }
+ }
+ return 0;
+}
+
+static noinline int check_deadlock(rqspinlock_t *lock, u32 mask,
+ struct rqspinlock_timeout *ts)
+{
+ int ret;
+
+ ret = check_deadlock_AA(lock, mask, ts);
+ if (ret)
+ return ret;
+ ret = check_deadlock_ABBA(lock, mask, ts);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
+ struct rqspinlock_timeout *ts)
+{
+ u64 time = ktime_get_mono_fast_ns();
+ u64 prev = ts->cur;
+
+ if (!ts->timeout_end) {
+ ts->cur = time;
+ ts->timeout_end = time + ts->duration;
+ return 0;
+ }
+
+ if (time > ts->timeout_end)
+ return -ETIMEDOUT;
+
+ /*
+ * A millisecond interval passed from last time? Trigger deadlock
+ * checks.
+ */
+ if (prev + NSEC_PER_MSEC < time) {
+ ts->cur = time;
+ return check_deadlock(lock, mask, ts);
+ }
+
+ return 0;
+}
+
+/*
+ * Do not amortize with spins when res_smp_cond_load_acquire is defined,
+ * as the macro does internal amortization for us.
+ */
+#ifndef res_smp_cond_load_acquire
+#define RES_CHECK_TIMEOUT(ts, ret, mask) \
+ ({ \
+ if (!(ts).spin++) \
+ (ret) = check_timeout((lock), (mask), &(ts)); \
+ (ret); \
+ })
+#else
+#define RES_CHECK_TIMEOUT(ts, ret, mask) \
+ ({ (ret) = check_timeout((lock), (mask), &(ts)); })
+#endif
+
+/*
+ * Initialize the 'spin' member.
+ * Set spin member to 0 to trigger AA/ABBA checks immediately.
+ */
+#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; })
+
+/*
+ * We only need to reset 'timeout_end', 'spin' will just wrap around as necessary.
+ * Duration is defined for each spin attempt, so set it here.
+ */
+#define RES_RESET_TIMEOUT(ts, _duration) ({ (ts).timeout_end = 0; (ts).duration = _duration; })
+
+/*
+ * Provide a test-and-set fallback for cases when queued spin lock support is
+ * absent from the architecture.
+ */
+int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock)
+{
+ struct rqspinlock_timeout ts;
+ int val, ret = 0;
+
+ RES_INIT_TIMEOUT(ts);
+ grab_held_lock_entry(lock);
+
+ /*
+ * Since the waiting loop's time is dependent on the amount of
+ * contention, a short timeout unlike rqspinlock waiting loops
+ * isn't enough. Choose a second as the timeout value.
+ */
+ RES_RESET_TIMEOUT(ts, NSEC_PER_SEC);
+retry:
+ val = atomic_read(&lock->val);
+
+ if (val || !atomic_try_cmpxchg(&lock->val, &val, 1)) {
+ if (RES_CHECK_TIMEOUT(ts, ret, ~0u))
+ goto out;
+ cpu_relax();
+ goto retry;
+ }
+
+ return 0;
+out:
+ release_held_lock_entry();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(resilient_tas_spin_lock);
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+
+/*
+ * Per-CPU queue node structures; we can never have more than 4 nested
+ * contexts: task, softirq, hardirq, nmi.
+ *
+ * Exactly fits one 64-byte cacheline on a 64-bit architecture.
+ */
+static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]);
+
+#ifndef res_smp_cond_load_acquire
+#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c)
+#endif
+
+#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c))
+
+/**
+ * resilient_queued_spin_lock_slowpath - acquire the queued spinlock
+ * @lock: Pointer to queued spinlock structure
+ * @val: Current value of the queued spinlock 32-bit word
+ *
+ * Return:
+ * * 0 - Lock was acquired successfully.
+ * * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock.
+ * * -ETIMEDOUT - Lock acquisition failed because of timeout.
+ *
+ * (queue tail, pending bit, lock value)
+ *
+ * fast : slow : unlock
+ * : :
+ * uncontended (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
+ * : | ^--------.------. / :
+ * : v \ \ | :
+ * pending : (0,1,1) +--> (0,1,0) \ | :
+ * : | ^--' | | :
+ * : v | | :
+ * uncontended : (n,x,y) +--> (n,0,0) --' | :
+ * queue : | ^--' | :
+ * : v | :
+ * contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' :
+ * queue : ^--' :
+ */
+int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
+{
+ struct mcs_spinlock *prev, *next, *node;
+ struct rqspinlock_timeout ts;
+ int idx, ret = 0;
+ u32 old, tail;
+
+ BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
+
+ if (resilient_virt_spin_lock_enabled())
+ return resilient_virt_spin_lock(lock);
+
+ RES_INIT_TIMEOUT(ts);
+
+ /*
+ * Wait for in-progress pending->locked hand-overs with a bounded
+ * number of spins so that we guarantee forward progress.
+ *
+ * 0,1,0 -> 0,0,1
+ */
+ if (val == _Q_PENDING_VAL) {
+ int cnt = _Q_PENDING_LOOPS;
+ val = atomic_cond_read_relaxed(&lock->val,
+ (VAL != _Q_PENDING_VAL) || !cnt--);
+ }
+
+ /*
+ * If we observe any contention; queue.
+ */
+ if (val & ~_Q_LOCKED_MASK)
+ goto queue;
+
+ /*
+ * trylock || pending
+ *
+ * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
+ */
+ val = queued_fetch_set_pending_acquire(lock);
+
+ /*
+ * If we observe contention, there is a concurrent locker.
+ *
+ * Undo and queue; our setting of PENDING might have made the
+ * n,0,0 -> 0,0,0 transition fail and it will now be waiting
+ * on @next to become !NULL.
+ */
+ if (unlikely(val & ~_Q_LOCKED_MASK)) {
+
+ /* Undo PENDING if we set it. */
+ if (!(val & _Q_PENDING_MASK))
+ clear_pending(lock);
+
+ goto queue;
+ }
+
+ /*
+ * Grab an entry in the held locks array, to enable deadlock detection.
+ */
+ grab_held_lock_entry(lock);
+
+ /*
+ * We're pending, wait for the owner to go away.
+ *
+ * 0,1,1 -> *,1,0
+ *
+ * this wait loop must be a load-acquire such that we match the
+ * store-release that clears the locked bit and create lock
+ * sequentiality; this is because not all
+ * clear_pending_set_locked() implementations imply full
+ * barriers.
+ */
+ if (val & _Q_LOCKED_MASK) {
+ RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
+ res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK));
+ }
+
+ if (ret) {
+ /*
+ * We waited for the locked bit to go back to 0, as the pending
+ * waiter, but timed out. We need to clear the pending bit since
+ * we own it. Once a stuck owner has been recovered, the lock
+ * must be restored to a valid state, hence removing the pending
+ * bit is necessary.
+ *
+ * *,1,* -> *,0,*
+ */
+ clear_pending(lock);
+ lockevent_inc(rqspinlock_lock_timeout);
+ goto err_release_entry;
+ }
+
+ /*
+ * take ownership and clear the pending bit.
+ *
+ * 0,1,0 -> 0,0,1
+ */
+ clear_pending_set_locked(lock);
+ lockevent_inc(lock_pending);
+ return 0;
+
+ /*
+ * End of pending bit optimistic spinning and beginning of MCS
+ * queuing.
+ */
+queue:
+ lockevent_inc(lock_slowpath);
+ /*
+ * Grab deadlock detection entry for the queue path.
+ */
+ grab_held_lock_entry(lock);
+
+ node = this_cpu_ptr(&rqnodes[0].mcs);
+ idx = node->count++;
+ tail = encode_tail(smp_processor_id(), idx);
+
+ trace_contention_begin(lock, LCB_F_SPIN);
+
+ /*
+ * 4 nodes are allocated based on the assumption that there will
+ * not be nested NMIs taking spinlocks. That may not be true in
+ * some architectures even though the chance of needing more than
+ * 4 nodes will still be extremely unlikely. When that happens,
+ * we fall back to spinning on the lock directly without using
+ * any MCS node. This is not the most elegant solution, but is
+ * simple enough.
+ */
+ if (unlikely(idx >= _Q_MAX_NODES)) {
+ lockevent_inc(lock_no_node);
+ RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
+ while (!queued_spin_trylock(lock)) {
+ if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) {
+ lockevent_inc(rqspinlock_lock_timeout);
+ goto err_release_node;
+ }
+ cpu_relax();
+ }
+ goto release;
+ }
+
+ node = grab_mcs_node(node, idx);
+
+ /*
+ * Keep counts of non-zero index values:
+ */
+ lockevent_cond_inc(lock_use_node2 + idx - 1, idx);
+
+ /*
+ * Ensure that we increment the head node->count before initialising
+ * the actual node. If the compiler is kind enough to reorder these
+ * stores, then an IRQ could overwrite our assignments.
+ */
+ barrier();
+
+ node->locked = 0;
+ node->next = NULL;
+
+ /*
+ * We touched a (possibly) cold cacheline in the per-cpu queue node;
+ * attempt the trylock once more in the hope someone let go while we
+ * weren't watching.
+ */
+ if (queued_spin_trylock(lock))
+ goto release;
+
+ /*
+ * Ensure that the initialisation of @node is complete before we
+ * publish the updated tail via xchg_tail() and potentially link
+ * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
+ */
+ smp_wmb();
+
+ /*
+ * Publish the updated tail.
+ * We have already touched the queueing cacheline; don't bother with
+ * pending stuff.
+ *
+ * p,*,* -> n,*,*
+ */
+ old = xchg_tail(lock, tail);
+ next = NULL;
+
+ /*
+ * if there was a previous node; link it and wait until reaching the
+ * head of the waitqueue.
+ */
+ if (old & _Q_TAIL_MASK) {
+ int val;
+
+ prev = decode_tail(old, rqnodes);
+
+ /* Link @node into the waitqueue. */
+ WRITE_ONCE(prev->next, node);
+
+ val = arch_mcs_spin_lock_contended(&node->locked);
+ if (val == RES_TIMEOUT_VAL) {
+ ret = -EDEADLK;
+ goto waitq_timeout;
+ }
+
+ /*
+ * While waiting for the MCS lock, the next pointer may have
+ * been set by another lock waiter. We optimistically load
+ * the next pointer & prefetch the cacheline for writing
+ * to reduce latency in the upcoming MCS unlock operation.
+ */
+ next = READ_ONCE(node->next);
+ if (next)
+ prefetchw(next);
+ }
+
+ /*
+ * we're at the head of the waitqueue, wait for the owner & pending to
+ * go away.
+ *
+ * *,x,y -> *,0,0
+ *
+ * this wait loop must use a load-acquire such that we match the
+ * store-release that clears the locked bit and create lock
+ * sequentiality; this is because the set_locked() function below
+ * does not imply a full barrier.
+ *
+ * We use RES_DEF_TIMEOUT * 2 as the duration, as RES_DEF_TIMEOUT is
+ * meant to span maximum allowed time per critical section, and we may
+ * have both the owner of the lock and the pending bit waiter ahead of
+ * us.
+ */
+ RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT * 2);
+ val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) ||
+ RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK));
+
+waitq_timeout:
+ if (ret) {
+ /*
+ * If the tail is still pointing to us, then we are the final waiter,
+ * and are responsible for resetting the tail back to 0. Otherwise, if
+ * the cmpxchg operation fails, we signal the next waiter to take exit
+ * and try the same. For a waiter with tail node 'n':
+ *
+ * n,*,* -> 0,*,*
+ *
+ * When performing cmpxchg for the whole word (NR_CPUS > 16k), it is
+ * possible locked/pending bits keep changing and we see failures even
+ * when we remain the head of wait queue. However, eventually,
+ * pending bit owner will unset the pending bit, and new waiters
+ * will queue behind us. This will leave the lock owner in
+ * charge, and it will eventually either set locked bit to 0, or
+ * leave it as 1, allowing us to make progress.
+ *
+ * We terminate the whole wait queue for two reasons. Firstly,
+ * we eschew per-waiter timeouts with one applied at the head of
+ * the wait queue. This allows everyone to break out faster
+ * once we've seen the owner / pending waiter not responding for
+ * the timeout duration from the head. Secondly, it avoids
+ * complicated synchronization, because when not leaving in FIFO
+ * order, prev's next pointer needs to be fixed up etc.
+ */
+ if (!try_cmpxchg_tail(lock, tail, 0)) {
+ next = smp_cond_load_relaxed(&node->next, VAL);
+ WRITE_ONCE(next->locked, RES_TIMEOUT_VAL);
+ }
+ lockevent_inc(rqspinlock_lock_timeout);
+ goto err_release_node;
+ }
+
+ /*
+ * claim the lock:
+ *
+ * n,0,0 -> 0,0,1 : lock, uncontended
+ * *,*,0 -> *,*,1 : lock, contended
+ *
+ * If the queue head is the only one in the queue (lock value == tail)
+ * and nobody is pending, clear the tail code and grab the lock.
+ * Otherwise, we only need to grab the lock.
+ */
+
+ /*
+ * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
+ * above wait condition, therefore any concurrent setting of
+ * PENDING will make the uncontended transition fail.
+ */
+ if ((val & _Q_TAIL_MASK) == tail) {
+ if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
+ goto release; /* No contention */
+ }
+
+ /*
+ * Either somebody is queued behind us or _Q_PENDING_VAL got set
+ * which will then detect the remaining tail and queue behind us
+ * ensuring we'll see a @next.
+ */
+ set_locked(lock);
+
+ /*
+ * contended path; wait for next if not observed yet, release.
+ */
+ if (!next)
+ next = smp_cond_load_relaxed(&node->next, (VAL));
+
+ arch_mcs_spin_unlock_contended(&next->locked);
+
+release:
+ trace_contention_end(lock, 0);
+
+ /*
+ * release the node
+ */
+ __this_cpu_dec(rqnodes[0].mcs.count);
+ return ret;
+err_release_node:
+ trace_contention_end(lock, ret);
+ __this_cpu_dec(rqnodes[0].mcs.count);
+err_release_entry:
+ release_held_lock_entry();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(resilient_queued_spin_lock_slowpath);
+
+#endif /* CONFIG_QUEUED_SPINLOCKS */
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock)
+{
+ int ret;
+
+ BUILD_BUG_ON(sizeof(rqspinlock_t) != sizeof(struct bpf_res_spin_lock));
+ BUILD_BUG_ON(__alignof__(rqspinlock_t) != __alignof__(struct bpf_res_spin_lock));
+
+ preempt_disable();
+ ret = res_spin_lock((rqspinlock_t *)lock);
+ if (unlikely(ret)) {
+ preempt_enable();
+ return ret;
+ }
+ return 0;
+}
+
+__bpf_kfunc void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock)
+{
+ res_spin_unlock((rqspinlock_t *)lock);
+ preempt_enable();
+}
+
+__bpf_kfunc int bpf_res_spin_lock_irqsave(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag)
+{
+ u64 *ptr = (u64 *)flags__irq_flag;
+ unsigned long flags;
+ int ret;
+
+ preempt_disable();
+ local_irq_save(flags);
+ ret = res_spin_lock((rqspinlock_t *)lock);
+ if (unlikely(ret)) {
+ local_irq_restore(flags);
+ preempt_enable();
+ return ret;
+ }
+ *ptr = flags;
+ return 0;
+}
+
+__bpf_kfunc void bpf_res_spin_unlock_irqrestore(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag)
+{
+ u64 *ptr = (u64 *)flags__irq_flag;
+ unsigned long flags = *ptr;
+
+ res_spin_unlock((rqspinlock_t *)lock);
+ local_irq_restore(flags);
+ preempt_enable();
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(rqspinlock_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_res_spin_lock, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_res_spin_unlock)
+BTF_ID_FLAGS(func, bpf_res_spin_lock_irqsave, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_res_spin_unlock_irqrestore)
+BTF_KFUNCS_END(rqspinlock_kfunc_ids)
+
+static const struct btf_kfunc_id_set rqspinlock_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &rqspinlock_kfunc_ids,
+};
+
+static __init int rqspinlock_register_kfuncs(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &rqspinlock_kfunc_set);
+}
+late_initcall(rqspinlock_register_kfuncs);
diff --git a/kernel/bpf/rqspinlock.h b/kernel/bpf/rqspinlock.h
new file mode 100644
index 000000000000..5d8cb1b1aab4
--- /dev/null
+++ b/kernel/bpf/rqspinlock.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Resilient Queued Spin Lock defines
+ *
+ * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
+ *
+ * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com>
+ */
+#ifndef __LINUX_RQSPINLOCK_H
+#define __LINUX_RQSPINLOCK_H
+
+#include "../locking/qspinlock.h"
+
+/*
+ * try_cmpxchg_tail - Return result of cmpxchg of tail word with a new value
+ * @lock: Pointer to queued spinlock structure
+ * @tail: The tail to compare against
+ * @new_tail: The new queue tail code word
+ * Return: Bool to indicate whether the cmpxchg operation succeeded
+ *
+ * This is used by the head of the wait queue to clean up the queue.
+ * Provides relaxed ordering, since observers only rely on initialized
+ * state of the node which was made visible through the xchg_tail operation,
+ * i.e. through the smp_wmb preceding xchg_tail.
+ *
+ * We avoid using 16-bit cmpxchg, which is not available on all architectures.
+ */
+static __always_inline bool try_cmpxchg_tail(struct qspinlock *lock, u32 tail, u32 new_tail)
+{
+ u32 old, new;
+
+ old = atomic_read(&lock->val);
+ do {
+ /*
+ * Is the tail part we compare to already stale? Fail.
+ */
+ if ((old & _Q_TAIL_MASK) != tail)
+ return false;
+ /*
+ * Encode latest locked/pending state for new tail.
+ */
+ new = (old & _Q_LOCKED_PENDING_MASK) | new_tail;
+ } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));
+
+ return true;
+}
+
+#endif /* __LINUX_RQSPINLOCK_H */
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index c99f8e5234ac..3615c06b7dfa 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -124,8 +124,24 @@ free_smap:
return ERR_PTR(err);
}
+static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, bool may_fault)
+{
+ return may_fault ? build_id_parse(vma, build_id, NULL)
+ : build_id_parse_nofault(vma, build_id, NULL);
+}
+
+/*
+ * Expects all id_offs[i].ip values to be set to correct initial IPs.
+ * They will be subsequently:
+ * - either adjusted in place to a file offset, if build ID fetching
+ * succeeds; in this case id_offs[i].build_id is set to correct build ID,
+ * and id_offs[i].status is set to BPF_STACK_BUILD_ID_VALID;
+ * - or IP will be kept intact, if build ID fetching failed; in this case
+ * id_offs[i].build_id is zeroed out and id_offs[i].status is set to
+ * BPF_STACK_BUILD_ID_IP.
+ */
static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
- u64 *ips, u32 trace_nr, bool user)
+ u32 trace_nr, bool user, bool may_fault)
{
int i;
struct mmap_unlock_irq_work *work = NULL;
@@ -142,30 +158,28 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
/* cannot access current->mm, fall back to ips */
for (i = 0; i < trace_nr; i++) {
id_offs[i].status = BPF_STACK_BUILD_ID_IP;
- id_offs[i].ip = ips[i];
memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
}
return;
}
for (i = 0; i < trace_nr; i++) {
- if (range_in_vma(prev_vma, ips[i], ips[i])) {
+ u64 ip = READ_ONCE(id_offs[i].ip);
+
+ if (range_in_vma(prev_vma, ip, ip)) {
vma = prev_vma;
- memcpy(id_offs[i].build_id, prev_build_id,
- BUILD_ID_SIZE_MAX);
+ memcpy(id_offs[i].build_id, prev_build_id, BUILD_ID_SIZE_MAX);
goto build_id_valid;
}
- vma = find_vma(current->mm, ips[i]);
- if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) {
+ vma = find_vma(current->mm, ip);
+ if (!vma || fetch_build_id(vma, id_offs[i].build_id, may_fault)) {
/* per entry fall back to ips */
id_offs[i].status = BPF_STACK_BUILD_ID_IP;
- id_offs[i].ip = ips[i];
memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
continue;
}
build_id_valid:
- id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
- - vma->vm_start;
+ id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ip - vma->vm_start;
id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
prev_vma = vma;
prev_build_id = id_offs[i].build_id;
@@ -216,7 +230,7 @@ static long __bpf_get_stackid(struct bpf_map *map,
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
- u32 hash, id, trace_nr, trace_len;
+ u32 hash, id, trace_nr, trace_len, i;
bool user = flags & BPF_F_USER_STACK;
u64 *ips;
bool hash_matches;
@@ -238,15 +252,18 @@ static long __bpf_get_stackid(struct bpf_map *map,
return id;
if (stack_map_use_build_id(map)) {
+ struct bpf_stack_build_id *id_offs;
+
/* for build_id+offset, pop a bucket before slow cmp */
new_bucket = (struct stack_map_bucket *)
pcpu_freelist_pop(&smap->freelist);
if (unlikely(!new_bucket))
return -ENOMEM;
new_bucket->nr = trace_nr;
- stack_map_get_build_id_offset(
- (struct bpf_stack_build_id *)new_bucket->data,
- ips, trace_nr, user);
+ id_offs = (struct bpf_stack_build_id *)new_bucket->data;
+ for (i = 0; i < trace_nr; i++)
+ id_offs[i].ip = ips[i];
+ stack_map_get_build_id_offset(id_offs, trace_nr, user, false /* !may_fault */);
trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
if (hash_matches && bucket->nr == trace_nr &&
memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
@@ -387,7 +404,7 @@ const struct bpf_func_proto bpf_get_stackid_proto_pe = {
static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
struct perf_callchain_entry *trace_in,
- void *buf, u32 size, u64 flags)
+ void *buf, u32 size, u64 flags, bool may_fault)
{
u32 trace_nr, copy_len, elem_size, num_elem, max_depth;
bool user_build_id = flags & BPF_F_USER_BUILD_ID;
@@ -405,8 +422,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
if (kernel && user_build_id)
goto clear;
- elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
- : sizeof(u64);
+ elem_size = user_build_id ? sizeof(struct bpf_stack_build_id) : sizeof(u64);
if (unlikely(size % elem_size))
goto clear;
@@ -427,6 +443,9 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
if (sysctl_perf_event_max_stack < max_depth)
max_depth = sysctl_perf_event_max_stack;
+ if (may_fault)
+ rcu_read_lock(); /* need RCU for perf's callchain below */
+
if (trace_in)
trace = trace_in;
else if (kernel && task)
@@ -434,21 +453,34 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
else
trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
crosstask, false);
- if (unlikely(!trace))
- goto err_fault;
- if (trace->nr < skip)
+ if (unlikely(!trace) || trace->nr < skip) {
+ if (may_fault)
+ rcu_read_unlock();
goto err_fault;
+ }
trace_nr = trace->nr - skip;
trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
copy_len = trace_nr * elem_size;
ips = trace->ip + skip;
- if (user && user_build_id)
- stack_map_get_build_id_offset(buf, ips, trace_nr, user);
- else
+ if (user_build_id) {
+ struct bpf_stack_build_id *id_offs = buf;
+ u32 i;
+
+ for (i = 0; i < trace_nr; i++)
+ id_offs[i].ip = ips[i];
+ } else {
memcpy(buf, ips, copy_len);
+ }
+
+ /* trace/ips should not be dereferenced after this point */
+ if (may_fault)
+ rcu_read_unlock();
+
+ if (user_build_id)
+ stack_map_get_build_id_offset(buf, trace_nr, user, may_fault);
if (size > copy_len)
memset(buf + copy_len, 0, size - copy_len);
@@ -464,7 +496,7 @@ clear:
BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
u64, flags)
{
- return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
+ return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */);
}
const struct bpf_func_proto bpf_get_stack_proto = {
@@ -477,8 +509,24 @@ const struct bpf_func_proto bpf_get_stack_proto = {
.arg4_type = ARG_ANYTHING,
};
-BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
- u32, size, u64, flags)
+BPF_CALL_4(bpf_get_stack_sleepable, struct pt_regs *, regs, void *, buf, u32, size,
+ u64, flags)
+{
+ return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, true /* may_fault */);
+}
+
+const struct bpf_func_proto bpf_get_stack_sleepable_proto = {
+ .func = bpf_get_stack_sleepable,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static long __bpf_get_task_stack(struct task_struct *task, void *buf, u32 size,
+ u64 flags, bool may_fault)
{
struct pt_regs *regs;
long res = -EINVAL;
@@ -488,12 +536,18 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
regs = task_pt_regs(task);
if (regs)
- res = __bpf_get_stack(regs, task, NULL, buf, size, flags);
+ res = __bpf_get_stack(regs, task, NULL, buf, size, flags, may_fault);
put_task_stack(task);
return res;
}
+BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
+ u32, size, u64, flags)
+{
+ return __bpf_get_task_stack(task, buf, size, flags, false /* !may_fault */);
+}
+
const struct bpf_func_proto bpf_get_task_stack_proto = {
.func = bpf_get_task_stack,
.gpl_only = false,
@@ -505,6 +559,23 @@ const struct bpf_func_proto bpf_get_task_stack_proto = {
.arg4_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_get_task_stack_sleepable, struct task_struct *, task, void *, buf,
+ u32, size, u64, flags)
+{
+ return __bpf_get_task_stack(task, buf, size, flags, true /* !may_fault */);
+}
+
+const struct bpf_func_proto bpf_get_task_stack_sleepable_proto = {
+ .func = bpf_get_task_stack_sleepable,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
void *, buf, u32, size, u64, flags)
{
@@ -516,7 +587,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
__u64 nr_kernel;
if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
- return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
+ return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */);
if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
BPF_F_USER_BUILD_ID)))
@@ -536,7 +607,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
__u64 nr = trace->nr;
trace->nr = nr_kernel;
- err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
+ err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */);
/* restore nr */
trace->nr = nr;
@@ -548,7 +619,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
goto clear;
flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
- err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
+ err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */);
}
return err;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c287925471f6..dd5304c6ac3c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -35,6 +35,8 @@
#include <linux/rcupdate_trace.h>
#include <linux/memcontrol.h>
#include <linux/trace_events.h>
+#include <linux/tracepoint.h>
+#include <linux/overflow.h>
#include <net/netfilter/nf_bpf_link.h>
#include <net/netkit.h>
@@ -155,6 +157,89 @@ static void maybe_wait_bpf_programs(struct bpf_map *map)
synchronize_rcu();
}
+static void unpin_uptr_kaddr(void *kaddr)
+{
+ if (kaddr)
+ unpin_user_page(virt_to_page(kaddr));
+}
+
+static void __bpf_obj_unpin_uptrs(struct btf_record *rec, u32 cnt, void *obj)
+{
+ const struct btf_field *field;
+ void **uptr_addr;
+ int i;
+
+ for (i = 0, field = rec->fields; i < cnt; i++, field++) {
+ if (field->type != BPF_UPTR)
+ continue;
+
+ uptr_addr = obj + field->offset;
+ unpin_uptr_kaddr(*uptr_addr);
+ }
+}
+
+static void bpf_obj_unpin_uptrs(struct btf_record *rec, void *obj)
+{
+ if (!btf_record_has_field(rec, BPF_UPTR))
+ return;
+
+ __bpf_obj_unpin_uptrs(rec, rec->cnt, obj);
+}
+
+static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj)
+{
+ const struct btf_field *field;
+ const struct btf_type *t;
+ unsigned long start, end;
+ struct page *page;
+ void **uptr_addr;
+ int i, err;
+
+ if (!btf_record_has_field(rec, BPF_UPTR))
+ return 0;
+
+ for (i = 0, field = rec->fields; i < rec->cnt; i++, field++) {
+ if (field->type != BPF_UPTR)
+ continue;
+
+ uptr_addr = obj + field->offset;
+ start = *(unsigned long *)uptr_addr;
+ if (!start)
+ continue;
+
+ t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id);
+ /* t->size was checked for zero before */
+ if (check_add_overflow(start, t->size - 1, &end)) {
+ err = -EFAULT;
+ goto unpin_all;
+ }
+
+ /* The uptr's struct cannot span across two pages */
+ if ((start & PAGE_MASK) != (end & PAGE_MASK)) {
+ err = -EOPNOTSUPP;
+ goto unpin_all;
+ }
+
+ err = pin_user_pages_fast(start, 1, FOLL_LONGTERM | FOLL_WRITE, &page);
+ if (err != 1)
+ goto unpin_all;
+
+ if (PageHighMem(page)) {
+ err = -EOPNOTSUPP;
+ unpin_user_page(page);
+ goto unpin_all;
+ }
+
+ *uptr_addr = page_address(page) + offset_in_page(start);
+ }
+
+ return 0;
+
+unpin_all:
+ __bpf_obj_unpin_uptrs(rec, i, obj);
+ return err;
+}
+
static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
void *key, void *value, __u64 flags)
{
@@ -199,9 +284,14 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
err = map->ops->map_push_elem(map, value, flags);
} else {
- rcu_read_lock();
- err = map->ops->map_update_elem(map, key, value, flags);
- rcu_read_unlock();
+ err = bpf_obj_pin_uptrs(map->record, value);
+ if (!err) {
+ rcu_read_lock();
+ err = map->ops->map_update_elem(map, key, value, flags);
+ rcu_read_unlock();
+ if (err)
+ bpf_obj_unpin_uptrs(map->record, value);
+ }
}
bpf_enable_instrumentation();
@@ -385,7 +475,7 @@ void bpf_map_free_id(struct bpf_map *map)
spin_unlock_irqrestore(&map_idr_lock, flags);
}
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
static void bpf_map_save_memcg(struct bpf_map *map)
{
/* Currently if a map is created by a process belonging to the root
@@ -480,32 +570,49 @@ static void bpf_map_release_memcg(struct bpf_map *map)
}
#endif
-int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
+static bool can_alloc_pages(void)
+{
+ return preempt_count() == 0 && !irqs_disabled() &&
+ !IS_ENABLED(CONFIG_PREEMPT_RT);
+}
+
+static struct page *__bpf_alloc_page(int nid)
+{
+ if (!can_alloc_pages())
+ return alloc_pages_nolock(nid, 0);
+
+ return alloc_pages_node(nid,
+ GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT
+ | __GFP_NOWARN,
+ 0);
+}
+
+int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
unsigned long nr_pages, struct page **pages)
{
unsigned long i, j;
struct page *pg;
int ret = 0;
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
struct mem_cgroup *memcg, *old_memcg;
memcg = bpf_map_get_memcg(map);
old_memcg = set_active_memcg(memcg);
#endif
for (i = 0; i < nr_pages; i++) {
- pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0);
+ pg = __bpf_alloc_page(nid);
if (pg) {
pages[i] = pg;
continue;
}
for (j = 0; j < i; j++)
- __free_page(pages[j]);
+ free_pages_nolock(pages[j], 0);
ret = -ENOMEM;
break;
}
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
set_active_memcg(old_memcg);
mem_cgroup_put(memcg);
#endif
@@ -548,17 +655,21 @@ void btf_record_free(struct btf_record *rec)
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
if (rec->fields[i].kptr.module)
module_put(rec->fields[i].kptr.module);
- btf_put(rec->fields[i].kptr.btf);
+ if (btf_is_kernel(rec->fields[i].kptr.btf))
+ btf_put(rec->fields[i].kptr.btf);
break;
case BPF_LIST_HEAD:
case BPF_LIST_NODE:
case BPF_RB_ROOT:
case BPF_RB_NODE:
case BPF_SPIN_LOCK:
+ case BPF_RES_SPIN_LOCK:
case BPF_TIMER:
case BPF_REFCOUNT:
+ case BPF_WORKQUEUE:
/* Nothing to release */
break;
default:
@@ -583,7 +694,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
if (IS_ERR_OR_NULL(rec))
return NULL;
- size = offsetof(struct btf_record, fields[rec->cnt]);
+ size = struct_size(rec, fields, rec->cnt);
new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN);
if (!new_rec)
return ERR_PTR(-ENOMEM);
@@ -595,7 +706,9 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
- btf_get(fields[i].kptr.btf);
+ case BPF_UPTR:
+ if (btf_is_kernel(fields[i].kptr.btf))
+ btf_get(fields[i].kptr.btf);
if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
ret = -ENXIO;
goto free;
@@ -606,8 +719,10 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
case BPF_RB_ROOT:
case BPF_RB_NODE:
case BPF_SPIN_LOCK:
+ case BPF_RES_SPIN_LOCK:
case BPF_TIMER:
case BPF_REFCOUNT:
+ case BPF_WORKQUEUE:
/* Nothing to acquire */
break;
default:
@@ -634,7 +749,7 @@ bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *r
return false;
if (rec_a->cnt != rec_b->cnt)
return false;
- size = offsetof(struct btf_record, fields[rec_a->cnt]);
+ size = struct_size(rec_a, fields, rec_a->cnt);
/* btf_parse_fields uses kzalloc to allocate a btf_record, so unused
* members are zeroed out. So memcmp is safe to do without worrying
* about padding/unused fields.
@@ -659,6 +774,13 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
bpf_timer_cancel_and_free(obj + rec->timer_off);
}
+void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj)
+{
+ if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_WORKQUEUE)))
+ return;
+ bpf_wq_cancel_and_free(obj + rec->wq_off);
+}
+
void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
{
const struct btf_field *fields;
@@ -675,10 +797,14 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
switch (fields[i].type) {
case BPF_SPIN_LOCK:
+ case BPF_RES_SPIN_LOCK:
break;
case BPF_TIMER:
bpf_timer_cancel_and_free(field_ptr);
break;
+ case BPF_WORKQUEUE:
+ bpf_wq_cancel_and_free(field_ptr);
+ break;
case BPF_KPTR_UNREF:
WRITE_ONCE(*(u64 *)field_ptr, 0);
break;
@@ -691,15 +817,17 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
if (!btf_is_kernel(field->kptr.btf)) {
pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
field->kptr.btf_id);
- migrate_disable();
__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
pointee_struct_meta->record : NULL,
fields[i].type == BPF_KPTR_PERCPU);
- migrate_enable();
} else {
field->kptr.dtor(xchgd_field);
}
break;
+ case BPF_UPTR:
+ /* The caller ensured that no one is using the uptr */
+ unpin_uptr_kaddr(*(void **)field_ptr);
+ break;
case BPF_LIST_HEAD:
if (WARN_ON_ONCE(rec->spin_lock_off < 0))
continue;
@@ -721,17 +849,19 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
}
}
-/* called from workqueue */
-static void bpf_map_free_deferred(struct work_struct *work)
+static void bpf_map_free(struct bpf_map *map)
{
- struct bpf_map *map = container_of(work, struct bpf_map, work);
struct btf_record *rec = map->record;
struct btf *btf = map->btf;
- security_bpf_map_free(map);
- bpf_map_release_memcg(map);
- /* implementation dependent freeing */
+ /* implementation dependent freeing. Disabling migration to simplify
+ * the free of values or special fields allocated from bpf memory
+ * allocator.
+ */
+ migrate_disable();
map->ops->map_free(map);
+ migrate_enable();
+
/* Delay freeing of btf_record for maps, as map_free
* callback usually needs access to them. It is better to do it here
* than require each callback to do the free itself manually.
@@ -748,6 +878,16 @@ static void bpf_map_free_deferred(struct work_struct *work)
btf_put(btf);
}
+/* called from workqueue */
+static void bpf_map_free_deferred(struct work_struct *work)
+{
+ struct bpf_map *map = container_of(work, struct bpf_map, work);
+
+ security_bpf_map_free(map);
+ bpf_map_release_memcg(map);
+ bpf_map_free(map);
+}
+
static void bpf_map_put_uref(struct bpf_map *map)
{
if (atomic64_dec_and_test(&map->usercnt)) {
@@ -817,7 +957,7 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
{
- fmode_t mode = f.file->f_mode;
+ fmode_t mode = fd_file(f)->f_mode;
/* Our file permissions may have been overridden by global
* map permissions facing syscall side.
@@ -916,7 +1056,7 @@ static const struct vm_operations_struct bpf_map_default_vmops = {
static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct bpf_map *map = filp->private_data;
- int err;
+ int err = 0;
if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record))
return -ENOTSUPP;
@@ -940,24 +1080,33 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
err = -EACCES;
goto out;
}
+ bpf_map_write_active_inc(map);
}
+out:
+ mutex_unlock(&map->freeze_mutex);
+ if (err)
+ return err;
/* set default open/close callbacks */
vma->vm_ops = &bpf_map_default_vmops;
vma->vm_private_data = map;
vm_flags_clear(vma, VM_MAYEXEC);
+ /* If mapping is read-only, then disallow potentially re-mapping with
+ * PROT_WRITE by dropping VM_MAYWRITE flag. This VM_MAYWRITE clearing
+ * means that as far as BPF map's memory-mapped VMAs are concerned,
+ * VM_WRITE and VM_MAYWRITE and equivalent, if one of them is set,
+ * both should be set, so we can forget about VM_MAYWRITE and always
+ * check just VM_WRITE
+ */
if (!(vma->vm_flags & VM_WRITE))
- /* disallow re-mapping with PROT_WRITE */
vm_flags_clear(vma, VM_MAYWRITE);
err = map->ops->map_mmap(map, vma);
- if (err)
- goto out;
+ if (err) {
+ if (vma->vm_flags & VM_WRITE)
+ bpf_map_write_active_dec(map);
+ }
- if (vma->vm_flags & VM_MAYWRITE)
- bpf_map_write_active_inc(map);
-out:
- mutex_unlock(&map->freeze_mutex);
return err;
}
@@ -980,7 +1129,7 @@ static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr
if (map->ops->map_get_unmapped_area)
return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags);
#ifdef CONFIG_MMU
- return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+ return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
#else
return addr;
#endif
@@ -1084,8 +1233,8 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
return -EINVAL;
map->record = btf_parse_fields(btf, value_type,
- BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
- BPF_RB_ROOT | BPF_REFCOUNT,
+ BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
+ BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR,
map->value_size);
if (!IS_ERR_OR_NULL(map->record)) {
int i;
@@ -1103,6 +1252,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
case 0:
continue;
case BPF_SPIN_LOCK:
+ case BPF_RES_SPIN_LOCK:
if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY &&
map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
@@ -1115,6 +1265,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
}
break;
case BPF_TIMER:
+ case BPF_WORKQUEUE:
if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY) {
@@ -1140,6 +1291,12 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
goto free_map_tab;
}
break;
+ case BPF_UPTR:
+ if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
case BPF_LIST_HEAD:
case BPF_RB_ROOT:
if (map->map_type != BPF_MAP_TYPE_HASH &&
@@ -1180,7 +1337,7 @@ static bool bpf_net_capable(void)
#define BPF_MAP_CREATE_LAST_FIELD map_token_fd
/* called via syscall */
-static int map_create(union bpf_attr *attr)
+static int map_create(union bpf_attr *attr, bool kernel)
{
const struct bpf_map_ops *ops;
struct bpf_token *token = NULL;
@@ -1370,7 +1527,7 @@ static int map_create(union bpf_attr *attr)
attr->btf_vmlinux_value_type_id;
}
- err = security_bpf_map_create(map, attr, token);
+ err = security_bpf_map_create(map, attr, token, kernel);
if (err)
goto free_map_sec;
@@ -1398,28 +1555,12 @@ static int map_create(union bpf_attr *attr)
free_map_sec:
security_bpf_map_free(map);
free_map:
- btf_put(map->btf);
- map->ops->map_free(map);
+ bpf_map_free(map);
put_token:
bpf_token_put(token);
return err;
}
-/* if error is returned, fd is released.
- * On success caller should complete fd access with matching fdput()
- */
-struct bpf_map *__bpf_map_get(struct fd f)
-{
- if (!f.file)
- return ERR_PTR(-EBADF);
- if (f.file->f_op != &bpf_map_fops) {
- fdput(f);
- return ERR_PTR(-EINVAL);
- }
-
- return f.file->private_data;
-}
-
void bpf_map_inc(struct bpf_map *map)
{
atomic64_inc(&map->refcnt);
@@ -1435,31 +1576,23 @@ EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref);
struct bpf_map *bpf_map_get(u32 ufd)
{
- struct fd f = fdget(ufd);
- struct bpf_map *map;
-
- map = __bpf_map_get(f);
- if (IS_ERR(map))
- return map;
+ CLASS(fd, f)(ufd);
+ struct bpf_map *map = __bpf_map_get(f);
- bpf_map_inc(map);
- fdput(f);
+ if (!IS_ERR(map))
+ bpf_map_inc(map);
return map;
}
-EXPORT_SYMBOL(bpf_map_get);
+EXPORT_SYMBOL_NS(bpf_map_get, "BPF_INTERNAL");
struct bpf_map *bpf_map_get_with_uref(u32 ufd)
{
- struct fd f = fdget(ufd);
- struct bpf_map *map;
-
- map = __bpf_map_get(f);
- if (IS_ERR(map))
- return map;
+ CLASS(fd, f)(ufd);
+ struct bpf_map *map = __bpf_map_get(f);
- bpf_map_inc_with_uref(map);
- fdput(f);
+ if (!IS_ERR(map))
+ bpf_map_inc_with_uref(map);
return map;
}
@@ -1482,11 +1615,8 @@ struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map)
{
- spin_lock_bh(&map_idr_lock);
- map = __bpf_map_inc_not_zero(map, false);
- spin_unlock_bh(&map_idr_lock);
-
- return map;
+ lockdep_assert(rcu_read_lock_held());
+ return __bpf_map_inc_not_zero(map, false);
}
EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);
@@ -1524,11 +1654,9 @@ static int map_lookup_elem(union bpf_attr *attr)
{
void __user *ukey = u64_to_user_ptr(attr->key);
void __user *uvalue = u64_to_user_ptr(attr->value);
- int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value;
u32 value_size;
- struct fd f;
int err;
if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
@@ -1537,26 +1665,20 @@ static int map_lookup_elem(union bpf_attr *attr)
if (attr->flags & ~BPF_F_LOCK)
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
- if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
- err = -EPERM;
- goto err_put;
- }
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ))
+ return -EPERM;
if ((attr->flags & BPF_F_LOCK) &&
- !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
- err = -EINVAL;
- goto err_put;
- }
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK))
+ return -EINVAL;
key = __bpf_copy_key(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto err_put;
- }
+ if (IS_ERR(key))
+ return PTR_ERR(key);
value_size = bpf_map_value_size(map);
@@ -1587,8 +1709,6 @@ free_value:
kvfree(value);
free_key:
kvfree(key);
-err_put:
- fdput(f);
return err;
}
@@ -1599,17 +1719,15 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
{
bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel);
- int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value;
u32 value_size;
- struct fd f;
int err;
if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -1638,7 +1756,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
goto free_key;
}
- err = bpf_map_update_value(map, f.file, key, value, attr->flags);
+ err = bpf_map_update_value(map, fd_file(f), key, value, attr->flags);
if (!err)
maybe_wait_bpf_programs(map);
@@ -1647,7 +1765,6 @@ free_key:
kvfree(key);
err_put:
bpf_map_write_active_dec(map);
- fdput(f);
return err;
}
@@ -1656,16 +1773,14 @@ err_put:
static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
{
bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
- int ufd = attr->map_fd;
struct bpf_map *map;
- struct fd f;
void *key;
int err;
if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -1702,7 +1817,6 @@ out:
kvfree(key);
err_put:
bpf_map_write_active_dec(map);
- fdput(f);
return err;
}
@@ -1713,30 +1827,24 @@ static int map_get_next_key(union bpf_attr *attr)
{
void __user *ukey = u64_to_user_ptr(attr->key);
void __user *unext_key = u64_to_user_ptr(attr->next_key);
- int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *next_key;
- struct fd f;
int err;
if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
- if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
- err = -EPERM;
- goto err_put;
- }
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ))
+ return -EPERM;
if (ukey) {
key = __bpf_copy_key(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto err_put;
- }
+ if (IS_ERR(key))
+ return PTR_ERR(key);
} else {
key = NULL;
}
@@ -1768,8 +1876,6 @@ free_next_key:
kvfree(next_key);
free_key:
kvfree(key);
-err_put:
- fdput(f);
return err;
}
@@ -1890,8 +1996,6 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
return err;
}
-#define MAP_LOOKUP_RETRIES 3
-
int generic_map_lookup_batch(struct bpf_map *map,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -1901,8 +2005,8 @@ int generic_map_lookup_batch(struct bpf_map *map,
void __user *values = u64_to_user_ptr(attr->batch.values);
void __user *keys = u64_to_user_ptr(attr->batch.keys);
void *buf, *buf_prevkey, *prev_key, *key, *value;
- int err, retry = MAP_LOOKUP_RETRIES;
u32 value_size, cp, max_count;
+ int err;
if (attr->batch.elem_flags & ~BPF_F_LOCK)
return -EINVAL;
@@ -1948,14 +2052,8 @@ int generic_map_lookup_batch(struct bpf_map *map,
err = bpf_map_copy_value(map, key, value,
attr->batch.elem_flags);
- if (err == -ENOENT) {
- if (retry) {
- retry--;
- continue;
- }
- err = -EINTR;
- break;
- }
+ if (err == -ENOENT)
+ goto next_key;
if (err)
goto free_buf;
@@ -1970,12 +2068,12 @@ int generic_map_lookup_batch(struct bpf_map *map,
goto free_buf;
}
+ cp++;
+next_key:
if (!prev_key)
prev_key = buf_prevkey;
swap(prev_key, key);
- retry = MAP_LOOKUP_RETRIES;
- cp++;
cond_resched();
}
@@ -1998,11 +2096,9 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
{
void __user *ukey = u64_to_user_ptr(attr->key);
void __user *uvalue = u64_to_user_ptr(attr->value);
- int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value;
u32 value_size;
- struct fd f;
int err;
if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
@@ -2011,7 +2107,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
if (attr->flags & ~BPF_F_LOCK)
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -2081,7 +2177,6 @@ free_key:
kvfree(key);
err_put:
bpf_map_write_active_dec(map);
- fdput(f);
return err;
}
@@ -2089,27 +2184,22 @@ err_put:
static int map_freeze(const union bpf_attr *attr)
{
- int err = 0, ufd = attr->map_fd;
+ int err = 0;
struct bpf_map *map;
- struct fd f;
if (CHECK_ATTR(BPF_MAP_FREEZE))
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
- if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) {
- fdput(f);
+ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record))
return -ENOTSUPP;
- }
- if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
- fdput(f);
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE))
return -EPERM;
- }
mutex_lock(&map->freeze_mutex);
if (bpf_map_write_active(map)) {
@@ -2124,7 +2214,6 @@ static int map_freeze(const union bpf_attr *attr)
WRITE_ONCE(map->frozen, true);
err_put:
mutex_unlock(&map->freeze_mutex);
- fdput(f);
return err;
}
@@ -2244,6 +2333,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
kvfree(prog->aux->jited_linfo);
kvfree(prog->aux->linfo);
kfree(prog->aux->kfunc_tab);
+ kfree(prog->aux->ctx_arg_info);
if (prog->aux->attach_btf)
btf_put(prog->aux->attach_btf);
@@ -2394,18 +2484,6 @@ int bpf_prog_new_fd(struct bpf_prog *prog)
O_RDWR | O_CLOEXEC);
}
-static struct bpf_prog *____bpf_prog_get(struct fd f)
-{
- if (!f.file)
- return ERR_PTR(-EBADF);
- if (f.file->f_op != &bpf_prog_fops) {
- fdput(f);
- return ERR_PTR(-EINVAL);
- }
-
- return f.file->private_data;
-}
-
void bpf_prog_add(struct bpf_prog *prog, int i)
{
atomic64_add(i, &prog->aux->refcnt);
@@ -2461,20 +2539,19 @@ bool bpf_prog_get_ok(struct bpf_prog *prog,
static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
bool attach_drv)
{
- struct fd f = fdget(ufd);
+ CLASS(fd, f)(ufd);
struct bpf_prog *prog;
- prog = ____bpf_prog_get(f);
- if (IS_ERR(prog))
- return prog;
- if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) {
- prog = ERR_PTR(-EINVAL);
- goto out;
- }
+ if (fd_empty(f))
+ return ERR_PTR(-EBADF);
+ if (fd_file(f)->f_op != &bpf_prog_fops)
+ return ERR_PTR(-EINVAL);
+
+ prog = fd_file(f)->private_data;
+ if (!bpf_prog_get_ok(prog, attach_type, attach_drv))
+ return ERR_PTR(-EINVAL);
bpf_prog_inc(prog);
-out:
- fdput(f);
return prog;
}
@@ -2678,7 +2755,7 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
}
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD prog_token_fd
+#define BPF_PROG_LOAD_LAST_FIELD fd_array_cnt
static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
@@ -2887,7 +2964,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
if (err < 0)
goto free_prog;
- err = security_bpf_prog_load(prog, attr, token);
+ err = security_bpf_prog_load(prog, attr, token, uattr.is_kernel);
if (err)
goto free_prog_sec;
@@ -2982,16 +3059,33 @@ static int bpf_obj_get(const union bpf_attr *attr)
attr->file_flags);
}
-void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
- const struct bpf_link_ops *ops, struct bpf_prog *prog)
+/* bpf_link_init_sleepable() allows to specify whether BPF link itself has
+ * "sleepable" semantics, which normally would mean that BPF link's attach
+ * hook can dereference link or link's underlying program for some time after
+ * detachment due to RCU Tasks Trace-based lifetime protection scheme.
+ * BPF program itself can be non-sleepable, yet, because it's transitively
+ * reachable through BPF link, its freeing has to be delayed until after RCU
+ * Tasks Trace GP.
+ */
+void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type,
+ const struct bpf_link_ops *ops, struct bpf_prog *prog,
+ bool sleepable)
{
+ WARN_ON(ops->dealloc && ops->dealloc_deferred);
atomic64_set(&link->refcnt, 1);
link->type = type;
+ link->sleepable = sleepable;
link->id = 0;
link->ops = ops;
link->prog = prog;
}
+void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
+ const struct bpf_link_ops *ops, struct bpf_prog *prog)
+{
+ bpf_link_init_sleepable(link, type, ops, prog, false);
+}
+
static void bpf_link_free_id(int id)
{
if (!id)
@@ -3024,12 +3118,24 @@ void bpf_link_inc(struct bpf_link *link)
atomic64_inc(&link->refcnt);
}
+static void bpf_link_dealloc(struct bpf_link *link)
+{
+ /* now that we know that bpf_link itself can't be reached, put underlying BPF program */
+ if (link->prog)
+ bpf_prog_put(link->prog);
+
+ /* free bpf_link and its containing memory */
+ if (link->ops->dealloc_deferred)
+ link->ops->dealloc_deferred(link);
+ else
+ link->ops->dealloc(link);
+}
+
static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu)
{
struct bpf_link *link = container_of(rcu, struct bpf_link, rcu);
- /* free bpf_link and its containing memory */
- link->ops->dealloc_deferred(link);
+ bpf_link_dealloc(link);
}
static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
@@ -3043,27 +3149,28 @@ static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
/* bpf_link_free is guaranteed to be called from process context */
static void bpf_link_free(struct bpf_link *link)
{
- bool sleepable = false;
+ const struct bpf_link_ops *ops = link->ops;
bpf_link_free_id(link->id);
- if (link->prog) {
- sleepable = link->prog->sleepable;
- /* detach BPF program, clean up used resources */
- link->ops->release(link);
- bpf_prog_put(link->prog);
- }
- if (link->ops->dealloc_deferred) {
- /* schedule BPF link deallocation; if underlying BPF program
- * is sleepable, we need to first wait for RCU tasks trace
- * sync, then go through "classic" RCU grace period
+ /* detach BPF program, clean up used resources */
+ if (link->prog)
+ ops->release(link);
+ if (ops->dealloc_deferred) {
+ /* Schedule BPF link deallocation, which will only then
+ * trigger putting BPF program refcount.
+ * If underlying BPF program is sleepable or BPF link's target
+ * attach hookpoint is sleepable or otherwise requires RCU GPs
+ * to ensure link and its underlying BPF program is not
+ * reachable anymore, we need to first wait for RCU tasks
+ * trace sync, and then go through "classic" RCU grace period
*/
- if (sleepable)
+ if (link->sleepable || (link->prog && link->prog->sleepable))
call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
else
call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
+ } else if (ops->dealloc) {
+ bpf_link_dealloc(link);
}
- if (link->ops->dealloc)
- link->ops->dealloc(link);
}
static void bpf_link_put_deferred(struct work_struct *work)
@@ -3117,13 +3224,17 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_link *link = filp->private_data;
const struct bpf_prog *prog = link->prog;
+ enum bpf_link_type type = link->type;
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
- seq_printf(m,
- "link_type:\t%s\n"
- "link_id:\t%u\n",
- bpf_link_type_strs[link->type],
- link->id);
+ if (type < ARRAY_SIZE(bpf_link_type_strs) && bpf_link_type_strs[type]) {
+ seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]);
+ } else {
+ WARN_ONCE(1, "missing BPF_LINK_TYPE(...) for link type %u\n", type);
+ seq_printf(m, "link_type:\t<%u>\n", type);
+ }
+ seq_printf(m, "link_id:\t%u\n", link->id);
+
if (prog) {
bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
seq_printf(m,
@@ -3137,6 +3248,13 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
}
#endif
+static __poll_t bpf_link_poll(struct file *file, struct poll_table_struct *pts)
+{
+ struct bpf_link *link = file->private_data;
+
+ return link->ops->poll(file, pts);
+}
+
static const struct file_operations bpf_link_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_link_show_fdinfo,
@@ -3146,6 +3264,16 @@ static const struct file_operations bpf_link_fops = {
.write = bpf_dummy_write,
};
+static const struct file_operations bpf_link_fops_poll = {
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = bpf_link_show_fdinfo,
+#endif
+ .release = bpf_link_release,
+ .read = bpf_dummy_read,
+ .write = bpf_dummy_write,
+ .poll = bpf_link_poll,
+};
+
static int bpf_link_alloc_id(struct bpf_link *link)
{
int id;
@@ -3188,7 +3316,9 @@ int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer)
return id;
}
- file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC);
+ file = anon_inode_getfile("bpf_link",
+ link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops,
+ link, O_CLOEXEC);
if (IS_ERR(file)) {
bpf_link_free_id(id);
put_unused_fd(fd);
@@ -3216,28 +3346,26 @@ int bpf_link_settle(struct bpf_link_primer *primer)
int bpf_link_new_fd(struct bpf_link *link)
{
- return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
+ return anon_inode_getfd("bpf-link",
+ link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops,
+ link, O_CLOEXEC);
}
struct bpf_link *bpf_link_get_from_fd(u32 ufd)
{
- struct fd f = fdget(ufd);
+ CLASS(fd, f)(ufd);
struct bpf_link *link;
- if (!f.file)
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
- if (f.file->f_op != &bpf_link_fops) {
- fdput(f);
+ if (fd_file(f)->f_op != &bpf_link_fops && fd_file(f)->f_op != &bpf_link_fops_poll)
return ERR_PTR(-EINVAL);
- }
- link = f.file->private_data;
+ link = fd_file(f)->private_data;
bpf_link_inc(link);
- fdput(f);
-
return link;
}
-EXPORT_SYMBOL(bpf_link_get_from_fd);
+EXPORT_SYMBOL_NS(bpf_link_get_from_fd, "BPF_INTERNAL");
static void bpf_tracing_link_release(struct bpf_link *link)
{
@@ -3245,7 +3373,8 @@ static void bpf_tracing_link_release(struct bpf_link *link)
container_of(link, struct bpf_tracing_link, link.link);
WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link,
- tr_link->trampoline));
+ tr_link->trampoline,
+ tr_link->tgt_prog));
bpf_trampoline_put(tr_link->trampoline);
@@ -3385,7 +3514,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
* in prog->aux
*
* - if prog->aux->dst_trampoline is NULL, the program has already been
- * attached to a target and its initial target was cleared (below)
+ * attached to a target and its initial target was cleared (below)
*
* - if tgt_prog != NULL, the caller specified tgt_prog_fd +
* target_btf_id using the link_create API.
@@ -3460,7 +3589,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
if (err)
goto out_unlock;
- err = bpf_trampoline_link_prog(&link->link, tr);
+ err = bpf_trampoline_link_prog(&link->link, tr, tgt_prog);
if (err) {
bpf_link_cleanup(&link_primer);
link = NULL;
@@ -3498,17 +3627,12 @@ out_put_prog:
return err;
}
-struct bpf_raw_tp_link {
- struct bpf_link link;
- struct bpf_raw_event_map *btp;
-};
-
static void bpf_raw_tp_link_release(struct bpf_link *link)
{
struct bpf_raw_tp_link *raw_tp =
container_of(link, struct bpf_raw_tp_link, link);
- bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog);
+ bpf_probe_unregister(raw_tp->btp, raw_tp);
bpf_put_raw_tracepoint(raw_tp->btp);
}
@@ -3601,15 +3725,16 @@ static void bpf_perf_link_dealloc(struct bpf_link *link)
}
static int bpf_perf_link_fill_common(const struct perf_event *event,
- char __user *uname, u32 ulen,
+ char __user *uname, u32 *ulenp,
u64 *probe_offset, u64 *probe_addr,
u32 *fd_type, unsigned long *missed)
{
const char *buf;
- u32 prog_id;
+ u32 prog_id, ulen;
size_t len;
int err;
+ ulen = *ulenp;
if (!ulen ^ !uname)
return -EINVAL;
@@ -3617,10 +3742,17 @@ static int bpf_perf_link_fill_common(const struct perf_event *event,
probe_offset, probe_addr, missed);
if (err)
return err;
+
+ if (buf) {
+ len = strlen(buf);
+ *ulenp = len + 1;
+ } else {
+ *ulenp = 1;
+ }
if (!uname)
return 0;
+
if (buf) {
- len = strlen(buf);
err = bpf_copy_to_user(uname, buf, ulen, len);
if (err)
return err;
@@ -3645,7 +3777,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
uname = u64_to_user_ptr(info->perf_event.kprobe.func_name);
ulen = info->perf_event.kprobe.name_len;
- err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
+ err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr,
&type, &missed);
if (err)
return err;
@@ -3653,7 +3785,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
info->perf_event.type = BPF_PERF_EVENT_KRETPROBE;
else
info->perf_event.type = BPF_PERF_EVENT_KPROBE;
-
+ info->perf_event.kprobe.name_len = ulen;
info->perf_event.kprobe.offset = offset;
info->perf_event.kprobe.missed = missed;
if (!kallsyms_show_value(current_cred()))
@@ -3668,14 +3800,14 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
struct bpf_link_info *info)
{
+ u64 ref_ctr_offset, offset;
char __user *uname;
- u64 addr, offset;
u32 ulen, type;
int err;
uname = u64_to_user_ptr(info->perf_event.uprobe.file_name);
ulen = info->perf_event.uprobe.name_len;
- err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
+ err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &ref_ctr_offset,
&type, NULL);
if (err)
return err;
@@ -3684,8 +3816,10 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
info->perf_event.type = BPF_PERF_EVENT_URETPROBE;
else
info->perf_event.type = BPF_PERF_EVENT_UPROBE;
+ info->perf_event.uprobe.name_len = ulen;
info->perf_event.uprobe.offset = offset;
info->perf_event.uprobe.cookie = event->bpf_cookie;
+ info->perf_event.uprobe.ref_ctr_offset = ref_ctr_offset;
return 0;
}
#endif
@@ -3709,12 +3843,18 @@ static int bpf_perf_link_fill_tracepoint(const struct perf_event *event,
{
char __user *uname;
u32 ulen;
+ int err;
uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name);
ulen = info->perf_event.tracepoint.name_len;
+ err = bpf_perf_link_fill_common(event, uname, &ulen, NULL, NULL, NULL, NULL);
+ if (err)
+ return err;
+
info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT;
+ info->perf_event.tracepoint.name_len = ulen;
info->perf_event.tracepoint.cookie = event->bpf_cookie;
- return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL, NULL);
+ return 0;
}
static int bpf_perf_link_fill_perf_event(const struct perf_event *event,
@@ -3808,7 +3948,7 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro
#endif /* CONFIG_PERF_EVENTS */
static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
- const char __user *user_tp_name)
+ const char __user *user_tp_name, u64 cookie)
{
struct bpf_link_primer link_primer;
struct bpf_raw_tp_link *link;
@@ -3852,9 +3992,11 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
err = -ENOMEM;
goto out_put_btp;
}
- bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
- &bpf_raw_tp_link_lops, prog);
+ bpf_link_init_sleepable(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
+ &bpf_raw_tp_link_lops, prog,
+ tracepoint_is_faultable(btp->tp));
link->btp = btp;
+ link->cookie = cookie;
err = bpf_link_prime(&link->link, &link_primer);
if (err) {
@@ -3862,7 +4004,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
goto out_put_btp;
}
- err = bpf_probe_register(link->btp, prog);
+ err = bpf_probe_register(link->btp, link);
if (err) {
bpf_link_cleanup(&link_primer);
goto out_put_btp;
@@ -3875,11 +4017,13 @@ out_put_btp:
return err;
}
-#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
+#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.cookie
static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
{
struct bpf_prog *prog;
+ void __user *tp_name;
+ __u64 cookie;
int fd;
if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
@@ -3889,7 +4033,9 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
if (IS_ERR(prog))
return PTR_ERR(prog);
- fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name));
+ tp_name = u64_to_user_ptr(attr->raw_tracepoint.name);
+ cookie = attr->raw_tracepoint.cookie;
+ fd = bpf_raw_tp_link_attach(prog, tp_name, cookie);
if (fd < 0)
bpf_prog_put(prog);
return fd;
@@ -3985,6 +4131,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
* check permissions at attach time.
*/
return -EPERM;
+
+ ptype = attach_type_to_prog_type(attach_type);
+ if (prog->type != ptype)
+ return -EINVAL;
+
return prog->enforce_expected_attach_type &&
prog->expected_attach_type != attach_type ?
-EINVAL : 0;
@@ -4003,12 +4154,20 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI &&
attach_type != BPF_TRACE_KPROBE_MULTI)
return -EINVAL;
+ if (prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION &&
+ attach_type != BPF_TRACE_KPROBE_SESSION)
+ return -EINVAL;
if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI &&
attach_type != BPF_TRACE_UPROBE_MULTI)
return -EINVAL;
+ if (prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION &&
+ attach_type != BPF_TRACE_UPROBE_SESSION)
+ return -EINVAL;
if (attach_type != BPF_PERF_EVENT &&
attach_type != BPF_TRACE_KPROBE_MULTI &&
- attach_type != BPF_TRACE_UPROBE_MULTI)
+ attach_type != BPF_TRACE_KPROBE_SESSION &&
+ attach_type != BPF_TRACE_UPROBE_MULTI &&
+ attach_type != BPF_TRACE_UPROBE_SESSION)
return -EINVAL;
return 0;
case BPF_PROG_TYPE_SCHED_CLS:
@@ -4031,7 +4190,8 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
#define BPF_F_ATTACH_MASK_BASE \
(BPF_F_ALLOW_OVERRIDE | \
BPF_F_ALLOW_MULTI | \
- BPF_F_REPLACE)
+ BPF_F_REPLACE | \
+ BPF_F_PREORDER)
#define BPF_F_ATTACH_MASK_MPROG \
(BPF_F_REPLACE | \
@@ -4595,6 +4755,8 @@ static int bpf_prog_get_info_by_fd(struct file *file,
info.recursion_misses = stats.misses;
info.verified_insns = prog->aux->verified_insns;
+ if (prog->aux->btf)
+ info.btf_id = btf_obj_id(prog->aux->btf);
if (!bpf_capable()) {
info.jited_prog_len = 0;
@@ -4741,8 +4903,6 @@ static int bpf_prog_get_info_by_fd(struct file *file,
}
}
- if (prog->aux->btf)
- info.btf_id = btf_obj_id(prog->aux->btf);
info.attach_btf_id = prog->aux->attach_btf_id;
if (attach_btf)
info.attach_btf_obj_id = btf_obj_id(attach_btf);
@@ -4930,33 +5090,25 @@ static int bpf_link_get_info_by_fd(struct file *file,
static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
- int ufd = attr->info.bpf_fd;
- struct fd f;
- int err;
-
if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
return -EINVAL;
- f = fdget(ufd);
- if (!f.file)
+ CLASS(fd, f)(attr->info.bpf_fd);
+ if (fd_empty(f))
return -EBADFD;
- if (f.file->f_op == &bpf_prog_fops)
- err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr,
+ if (fd_file(f)->f_op == &bpf_prog_fops)
+ return bpf_prog_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr,
uattr);
- else if (f.file->f_op == &bpf_map_fops)
- err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr,
+ else if (fd_file(f)->f_op == &bpf_map_fops)
+ return bpf_map_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr,
uattr);
- else if (f.file->f_op == &btf_fops)
- err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr);
- else if (f.file->f_op == &bpf_link_fops)
- err = bpf_link_get_info_by_fd(f.file, f.file->private_data,
+ else if (fd_file(f)->f_op == &btf_fops)
+ return bpf_btf_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr);
+ else if (fd_file(f)->f_op == &bpf_link_fops || fd_file(f)->f_op == &bpf_link_fops_poll)
+ return bpf_link_get_info_by_fd(fd_file(f), fd_file(f)->private_data,
attr, uattr);
- else
- err = -EINVAL;
-
- fdput(f);
- return err;
+ return -EINVAL;
}
#define BPF_BTF_LOAD_LAST_FIELD btf_token_fd
@@ -4991,15 +5143,34 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_
return btf_new_fd(attr, uattr, uattr_size);
}
-#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
+#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD fd_by_id_token_fd
static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
{
+ struct bpf_token *token = NULL;
+
if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
+ if (attr->open_flags & ~BPF_F_TOKEN_FD)
+ return -EINVAL;
+
+ if (attr->open_flags & BPF_F_TOKEN_FD) {
+ token = bpf_token_get_from_fd(attr->fd_by_id_token_fd);
+ if (IS_ERR(token))
+ return PTR_ERR(token);
+ if (!bpf_token_allow_cmd(token, BPF_BTF_GET_FD_BY_ID)) {
+ bpf_token_put(token);
+ token = NULL;
+ }
+ }
+
+ if (!bpf_token_capable(token, CAP_SYS_ADMIN)) {
+ bpf_token_put(token);
return -EPERM;
+ }
+
+ bpf_token_put(token);
return btf_get_fd_by_id(attr->btf_id);
}
@@ -5084,7 +5255,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
if (!file)
return -EBADF;
- if (file->f_op == &bpf_link_fops) {
+ if (file->f_op == &bpf_link_fops || file->f_op == &bpf_link_fops_poll) {
struct bpf_link *link = file->private_data;
if (link->ops == &bpf_raw_tp_link_lops) {
@@ -5144,14 +5315,13 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
bool has_write = cmd != BPF_MAP_LOOKUP_BATCH;
struct bpf_map *map;
- int err, ufd;
- struct fd f;
+ int err;
if (CHECK_ATTR(BPF_MAP_BATCH))
return -EINVAL;
- ufd = attr->batch.map_fd;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->batch.map_fd);
+
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -5171,7 +5341,7 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr);
else if (cmd == BPF_MAP_UPDATE_BATCH)
- BPF_DO_BATCH(map->ops->map_update_batch, map, f.file, attr, uattr);
+ BPF_DO_BATCH(map->ops->map_update_batch, map, fd_file(f), attr, uattr);
else
BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr);
err_put:
@@ -5179,7 +5349,6 @@ err_put:
maybe_wait_bpf_programs(map);
bpf_map_write_active_dec(map);
}
- fdput(f);
return err;
}
@@ -5227,7 +5396,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
goto out;
}
if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
- ret = bpf_raw_tp_link_attach(prog, NULL);
+ ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie);
else if (prog->expected_attach_type == BPF_TRACE_ITER)
ret = bpf_iter_link_attach(attr, uattr, prog);
else if (prog->expected_attach_type == BPF_LSM_CGROUP)
@@ -5242,6 +5411,10 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
case BPF_PROG_TYPE_SK_LOOKUP:
ret = netns_bpf_link_create(attr, prog);
break;
+ case BPF_PROG_TYPE_SK_MSG:
+ case BPF_PROG_TYPE_SK_SKB:
+ ret = sock_map_link_create(attr, prog);
+ break;
#ifdef CONFIG_NET
case BPF_PROG_TYPE_XDP:
ret = bpf_xdp_link_attach(attr, prog);
@@ -5264,9 +5437,11 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
case BPF_PROG_TYPE_KPROBE:
if (attr->link_create.attach_type == BPF_PERF_EVENT)
ret = bpf_perf_link_attach(attr, prog);
- else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI)
+ else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI ||
+ attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION)
ret = bpf_kprobe_multi_link_attach(attr, prog);
- else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI)
+ else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI ||
+ attr->link_create.attach_type == BPF_TRACE_UPROBE_SESSION)
ret = bpf_uprobe_multi_link_attach(attr, prog);
break;
default:
@@ -5389,10 +5564,11 @@ static int link_detach(union bpf_attr *attr)
return ret;
}
-static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
+struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
{
return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT);
}
+EXPORT_SYMBOL(bpf_link_inc_not_zero);
struct bpf_link *bpf_link_by_id(u32 id)
{
@@ -5618,7 +5794,7 @@ static int token_create(union bpf_attr *attr)
return bpf_token_create(attr);
}
-static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
+static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
{
union bpf_attr attr;
int err;
@@ -5633,13 +5809,13 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
if (copy_from_bpfptr(&attr, uattr, size) != 0)
return -EFAULT;
- err = security_bpf(cmd, &attr, size);
+ err = security_bpf(cmd, &attr, size, uattr.is_kernel);
if (err < 0)
return err;
switch (cmd) {
case BPF_MAP_CREATE:
- err = map_create(&attr);
+ err = map_create(&attr, uattr.is_kernel);
break;
case BPF_MAP_LOOKUP_ELEM:
err = map_lookup_elem(&attr);
@@ -5846,7 +6022,7 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
return ____bpf_sys_bpf(cmd, attr, size);
}
}
-EXPORT_SYMBOL(kern_sys_bpf);
+EXPORT_SYMBOL_NS(kern_sys_bpf, "BPF_INTERNAL");
static const struct bpf_func_proto bpf_sys_bpf_proto = {
.func = bpf_sys_bpf,
@@ -5882,6 +6058,7 @@ static const struct bpf_func_proto bpf_sys_close_proto = {
BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res)
{
+ *res = 0;
if (flags)
return -EINVAL;
@@ -5902,7 +6079,8 @@ static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
.arg1_type = ARG_PTR_TO_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_LONG,
+ .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
+ .arg4_size = sizeof(u64),
};
static const struct bpf_func_proto *
@@ -5933,7 +6111,7 @@ const struct bpf_prog_ops bpf_syscall_prog_ops = {
};
#ifdef CONFIG_SYSCTL
-static int bpf_stats_handler(struct ctl_table *table, int write,
+static int bpf_stats_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct static_key *key = (struct static_key *)table->data;
@@ -5968,7 +6146,7 @@ void __weak unpriv_ebpf_notify(int new_state)
{
}
-static int bpf_unpriv_handler(struct ctl_table *table, int write,
+static int bpf_unpriv_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret, unpriv_enable = *(int *)table->data;
@@ -5992,7 +6170,7 @@ static int bpf_unpriv_handler(struct ctl_table *table, int write,
return ret;
}
-static struct ctl_table bpf_syscall_table[] = {
+static const struct ctl_table bpf_syscall_table[] = {
{
.procname = "unprivileged_bpf_disabled",
.data = &sysctl_unprivileged_bpf_disabled,
@@ -6008,7 +6186,6 @@ static struct ctl_table bpf_syscall_table[] = {
.mode = 0644,
.proc_handler = bpf_stats_handler,
},
- { }
};
static int __init bpf_syscall_sysctl_init(void)
diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
index ef6911aee3bb..941d0d2427e3 100644
--- a/kernel/bpf/sysfs_btf.c
+++ b/kernel/bpf/sysfs_btf.c
@@ -7,32 +7,56 @@
#include <linux/kobject.h>
#include <linux/init.h>
#include <linux/sysfs.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+#include <linux/btf.h>
/* See scripts/link-vmlinux.sh, gen_btf() func for details */
-extern char __weak __start_BTF[];
-extern char __weak __stop_BTF[];
+extern char __start_BTF[];
+extern char __stop_BTF[];
-static ssize_t
-btf_vmlinux_read(struct file *file, struct kobject *kobj,
- struct bin_attribute *bin_attr,
- char *buf, loff_t off, size_t len)
+static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj,
+ const struct bin_attribute *attr,
+ struct vm_area_struct *vma)
{
- memcpy(buf, __start_BTF + off, len);
- return len;
+ unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT;
+ size_t vm_size = vma->vm_end - vma->vm_start;
+ phys_addr_t addr = virt_to_phys(__start_BTF);
+ unsigned long pfn = addr >> PAGE_SHIFT;
+
+ if (attr->private != __start_BTF || !PAGE_ALIGNED(addr))
+ return -EINVAL;
+
+ if (vma->vm_pgoff)
+ return -EINVAL;
+
+ if (vma->vm_flags & (VM_WRITE | VM_EXEC | VM_MAYSHARE))
+ return -EACCES;
+
+ if (pfn + pages < pfn)
+ return -EINVAL;
+
+ if ((vm_size >> PAGE_SHIFT) > pages)
+ return -EINVAL;
+
+ vm_flags_mod(vma, VM_DONTDUMP, VM_MAYEXEC | VM_MAYWRITE);
+ return remap_pfn_range(vma, vma->vm_start, pfn, vm_size, vma->vm_page_prot);
}
static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = {
.attr = { .name = "vmlinux", .mode = 0444, },
- .read = btf_vmlinux_read,
+ .read_new = sysfs_bin_attr_simple_read,
+ .mmap = btf_sysfs_vmlinux_mmap,
};
struct kobject *btf_kobj;
static int __init btf_vmlinux_init(void)
{
+ bin_attr_btf_vmlinux.private = __start_BTF;
bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF;
- if (!__start_BTF || bin_attr_btf_vmlinux.size == 0)
+ if (bin_attr_btf_vmlinux.size == 0)
return 0;
btf_kobj = kobject_create_and_add("btf", kernel_kobj);
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index ec4e97c61eef..98d9b4c0daff 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -5,7 +5,6 @@
#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/fs.h>
-#include <linux/fdtable.h>
#include <linux/filter.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/btf_ids.h>
@@ -99,7 +98,7 @@ static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *co
rcu_read_lock();
pid = find_pid_ns(common->pid, common->ns);
if (pid) {
- task = get_pid_task(pid, PIDTYPE_TGID);
+ task = get_pid_task(pid, PIDTYPE_PID);
*tid = common->pid;
}
rcu_read_unlock();
@@ -261,6 +260,7 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
u32 saved_tid = info->tid;
struct task_struct *curr_task;
unsigned int curr_fd = info->fd;
+ struct file *f;
/* If this function returns a non-NULL file object,
* it held a reference to the task/file.
@@ -285,21 +285,14 @@ again:
curr_fd = 0;
}
- rcu_read_lock();
- for (;; curr_fd++) {
- struct file *f;
- f = task_lookup_next_fdget_rcu(curr_task, &curr_fd);
- if (!f)
- break;
-
+ f = fget_task_next(curr_task, &curr_fd);
+ if (f) {
/* set info->fd */
info->fd = curr_fd;
- rcu_read_unlock();
return f;
}
/* the current task is done, go to the next task */
- rcu_read_unlock();
put_task_struct(curr_task);
if (info->common.type == BPF_TASK_ITER_TID) {
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
index d6ccf8d00eab..26057aa13503 100644
--- a/kernel/bpf/token.c
+++ b/kernel/bpf/token.c
@@ -1,6 +1,5 @@
#include <linux/bpf.h>
#include <linux/vmalloc.h>
-#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/kernel.h>
@@ -116,67 +115,52 @@ int bpf_token_create(union bpf_attr *attr)
struct user_namespace *userns;
struct inode *inode;
struct file *file;
+ CLASS(fd, f)(attr->token_create.bpffs_fd);
struct path path;
- struct fd f;
+ struct super_block *sb;
umode_t mode;
int err, fd;
- f = fdget(attr->token_create.bpffs_fd);
- if (!f.file)
+ if (fd_empty(f))
return -EBADF;
- path = f.file->f_path;
- path_get(&path);
- fdput(f);
+ path = fd_file(f)->f_path;
+ sb = path.dentry->d_sb;
- if (path.dentry != path.mnt->mnt_sb->s_root) {
- err = -EINVAL;
- goto out_path;
- }
- if (path.mnt->mnt_sb->s_op != &bpf_super_ops) {
- err = -EINVAL;
- goto out_path;
- }
+ if (path.dentry != sb->s_root)
+ return -EINVAL;
+ if (sb->s_op != &bpf_super_ops)
+ return -EINVAL;
err = path_permission(&path, MAY_ACCESS);
if (err)
- goto out_path;
+ return err;
- userns = path.dentry->d_sb->s_user_ns;
+ userns = sb->s_user_ns;
/*
* Enforce that creators of BPF tokens are in the same user
* namespace as the BPF FS instance. This makes reasoning about
* permissions a lot easier and we can always relax this later.
*/
- if (current_user_ns() != userns) {
- err = -EPERM;
- goto out_path;
- }
- if (!ns_capable(userns, CAP_BPF)) {
- err = -EPERM;
- goto out_path;
- }
+ if (current_user_ns() != userns)
+ return -EPERM;
+ if (!ns_capable(userns, CAP_BPF))
+ return -EPERM;
/* Creating BPF token in init_user_ns doesn't make much sense. */
- if (current_user_ns() == &init_user_ns) {
- err = -EOPNOTSUPP;
- goto out_path;
- }
+ if (current_user_ns() == &init_user_ns)
+ return -EOPNOTSUPP;
- mnt_opts = path.dentry->d_sb->s_fs_info;
+ mnt_opts = sb->s_fs_info;
if (mnt_opts->delegate_cmds == 0 &&
mnt_opts->delegate_maps == 0 &&
mnt_opts->delegate_progs == 0 &&
- mnt_opts->delegate_attachs == 0) {
- err = -ENOENT; /* no BPF token delegation is set up */
- goto out_path;
- }
+ mnt_opts->delegate_attachs == 0)
+ return -ENOENT; /* no BPF token delegation is set up */
mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
- inode = bpf_get_inode(path.mnt->mnt_sb, NULL, mode);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- goto out_path;
- }
+ inode = bpf_get_inode(sb, NULL, mode);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
inode->i_op = &bpf_token_iops;
inode->i_fop = &bpf_token_fops;
@@ -185,8 +169,7 @@ int bpf_token_create(union bpf_attr *attr)
file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops);
if (IS_ERR(file)) {
iput(inode);
- err = PTR_ERR(file);
- goto out_path;
+ return PTR_ERR(file);
}
token = kzalloc(sizeof(*token), GFP_USER);
@@ -218,33 +201,27 @@ int bpf_token_create(union bpf_attr *attr)
file->private_data = token;
fd_install(fd, file);
- path_put(&path);
return fd;
out_token:
bpf_token_free(token);
out_file:
fput(file);
-out_path:
- path_put(&path);
return err;
}
struct bpf_token *bpf_token_get_from_fd(u32 ufd)
{
- struct fd f = fdget(ufd);
+ CLASS(fd, f)(ufd);
struct bpf_token *token;
- if (!f.file)
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
- if (f.file->f_op != &bpf_token_fops) {
- fdput(f);
+ if (fd_file(f)->f_op != &bpf_token_fops)
return ERR_PTR(-EINVAL);
- }
- token = f.file->private_data;
+ token = fd_file(f)->private_data;
bpf_token_inc(token);
- fdput(f);
return token;
}
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index db7599c59c78..c4b1a98ff726 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -115,10 +115,14 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
(ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
}
-void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym)
+void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym)
{
ksym->start = (unsigned long) data;
ksym->end = ksym->start + size;
+}
+
+void bpf_image_ksym_add(struct bpf_ksym *ksym)
+{
bpf_ksym_add(ksym);
perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
PAGE_SIZE, false, ksym->name);
@@ -333,7 +337,7 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im)
int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP,
NULL, im->ip_epilogue);
WARN_ON(err);
- if (IS_ENABLED(CONFIG_PREEMPTION))
+ if (IS_ENABLED(CONFIG_TASKS_RCU))
call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
else
percpu_ref_kill(&im->pcref);
@@ -377,7 +381,8 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, int size)
ksym = &im->ksym;
INIT_LIST_HEAD_RCU(&ksym->lnode);
snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key);
- bpf_image_ksym_add(image, size, ksym);
+ bpf_image_ksym_init(image, size, ksym);
+ bpf_image_ksym_add(ksym);
return im;
out_free_image:
@@ -456,7 +461,9 @@ again:
if (err < 0)
goto out_free;
- arch_protect_bpf_trampoline(im->image, im->size);
+ err = arch_protect_bpf_trampoline(im->image, im->size);
+ if (err)
+ goto out_free;
WARN_ON(tr->cur_image && total == 0);
if (tr->cur_image)
@@ -521,7 +528,27 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
}
}
-static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+static int bpf_freplace_check_tgt_prog(struct bpf_prog *tgt_prog)
+{
+ struct bpf_prog_aux *aux = tgt_prog->aux;
+
+ guard(mutex)(&aux->ext_mutex);
+ if (aux->prog_array_member_cnt)
+ /* Program extensions can not extend target prog when the target
+ * prog has been updated to any prog_array map as tail callee.
+ * It's to prevent a potential infinite loop like:
+ * tgt prog entry -> tgt prog subprog -> freplace prog entry
+ * --tailcall-> tgt prog entry.
+ */
+ return -EBUSY;
+
+ aux->is_extended = true;
+ return 0;
+}
+
+static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link,
+ struct bpf_trampoline *tr,
+ struct bpf_prog *tgt_prog)
{
enum bpf_tramp_prog_type kind;
struct bpf_tramp_link *link_exiting;
@@ -542,6 +569,9 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_tr
/* Cannot attach extension if fentry/fexit are in use. */
if (cnt)
return -EBUSY;
+ err = bpf_freplace_check_tgt_prog(tgt_prog);
+ if (err)
+ return err;
tr->extension_prog = link->link.prog;
return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
link->link.prog->bpf_func);
@@ -568,17 +598,21 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_tr
return err;
}
-int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+int bpf_trampoline_link_prog(struct bpf_tramp_link *link,
+ struct bpf_trampoline *tr,
+ struct bpf_prog *tgt_prog)
{
int err;
mutex_lock(&tr->mutex);
- err = __bpf_trampoline_link_prog(link, tr);
+ err = __bpf_trampoline_link_prog(link, tr, tgt_prog);
mutex_unlock(&tr->mutex);
return err;
}
-static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
+ struct bpf_trampoline *tr,
+ struct bpf_prog *tgt_prog)
{
enum bpf_tramp_prog_type kind;
int err;
@@ -589,6 +623,8 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_
err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
tr->extension_prog->bpf_func, NULL);
tr->extension_prog = NULL;
+ guard(mutex)(&tgt_prog->aux->ext_mutex);
+ tgt_prog->aux->is_extended = false;
return err;
}
hlist_del_init(&link->tramp_hlist);
@@ -597,12 +633,14 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_
}
/* bpf_trampoline_unlink_prog() should never fail. */
-int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
+ struct bpf_trampoline *tr,
+ struct bpf_prog *tgt_prog)
{
int err;
mutex_lock(&tr->mutex);
- err = __bpf_trampoline_unlink_prog(link, tr);
+ err = __bpf_trampoline_unlink_prog(link, tr, tgt_prog);
mutex_unlock(&tr->mutex);
return err;
}
@@ -617,7 +655,7 @@ static void bpf_shim_tramp_link_release(struct bpf_link *link)
if (!shim_link->trampoline)
return;
- WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline));
+ WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline, NULL));
bpf_trampoline_put(shim_link->trampoline);
}
@@ -731,7 +769,7 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
goto err;
}
- err = __bpf_trampoline_link_prog(&shim_link->link, tr);
+ err = __bpf_trampoline_link_prog(&shim_link->link, tr, NULL);
if (err)
goto err;
@@ -866,6 +904,8 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram
if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
bpf_prog_inc_misses_counter(prog);
+ if (prog->aux->recursion_detected)
+ prog->aux->recursion_detected(prog);
return 0;
}
return bpf_prog_start_time();
@@ -883,12 +923,13 @@ static void notrace update_prog_stats(struct bpf_prog *prog,
* Hence check that 'start' is valid.
*/
start > NO_START_TIME) {
+ u64 duration = sched_clock() - start;
unsigned long flags;
stats = this_cpu_ptr(prog->stats);
flags = u64_stats_update_begin_irqsave(&stats->syncp);
u64_stats_inc(&stats->cnt);
- u64_stats_add(&stats->nsecs, sched_clock() - start);
+ u64_stats_add(&stats->nsecs, duration);
u64_stats_update_end_irqrestore(&stats->syncp, flags);
}
}
@@ -941,6 +982,8 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
bpf_prog_inc_misses_counter(prog);
+ if (prog->aux->recursion_detected)
+ prog->aux->recursion_detected(prog);
return 0;
}
return bpf_prog_start_time();
@@ -1072,17 +1115,10 @@ void __weak arch_free_bpf_trampoline(void *image, unsigned int size)
bpf_jit_free_exec(image);
}
-void __weak arch_protect_bpf_trampoline(void *image, unsigned int size)
-{
- WARN_ON_ONCE(size > PAGE_SIZE);
- set_memory_rox((long)image, 1);
-}
-
-void __weak arch_unprotect_bpf_trampoline(void *image, unsigned int size)
+int __weak arch_protect_bpf_trampoline(void *image, unsigned int size)
{
WARN_ON_ONCE(size > PAGE_SIZE);
- set_memory_nx((long)image, 1);
- set_memory_rw((long)image, 1);
+ return set_memory_rox((long)image, 1);
}
int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 98188379d5c7..a7d6e0c5928b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -28,6 +28,8 @@
#include <linux/cpumask.h>
#include <linux/bpf_mem_alloc.h>
#include <net/xdp.h>
+#include <linux/trace_events.h>
+#include <linux/kallsyms.h>
#include "disasm.h"
@@ -172,7 +174,7 @@ static bool bpf_global_percpu_ma_set;
/* verifier_state + insn_idx are pushed to stack when branch is encountered */
struct bpf_verifier_stack_elem {
- /* verifer state is 'st'
+ /* verifier state is 'st'
* before processing instruction 'insn_idx'
* and after processing instruction 'prev_insn_idx'
*/
@@ -190,14 +192,12 @@ struct bpf_verifier_stack_elem {
#define BPF_MAP_KEY_POISON (1ULL << 63)
#define BPF_MAP_KEY_SEEN (1ULL << 62)
-#define BPF_MAP_PTR_UNPRIV 1UL
-#define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \
- POISON_POINTER_DELTA))
-#define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV))
-
#define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512
-static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
+#define BPF_PRIV_STACK_MIN_SIZE 64
+
+static int acquire_reference(struct bpf_verifier_env *env, int insn_idx);
+static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id);
static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env);
@@ -209,21 +209,22 @@ static bool is_trusted_reg(const struct bpf_reg_state *reg);
static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
{
- return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON;
+ return aux->map_ptr_state.poison;
}
static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux)
{
- return aux->map_ptr_state & BPF_MAP_PTR_UNPRIV;
+ return aux->map_ptr_state.unpriv;
}
static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
- const struct bpf_map *map, bool unpriv)
+ struct bpf_map *map,
+ bool unpriv, bool poison)
{
- BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV);
unpriv |= bpf_map_ptr_unpriv(aux);
- aux->map_ptr_state = (unsigned long)map |
- (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL);
+ aux->map_ptr_state.unpriv = unpriv;
+ aux->map_ptr_state.poison = poison;
+ aux->map_ptr_state.map_ptr = map;
}
static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux)
@@ -286,6 +287,7 @@ struct bpf_call_arg_meta {
u32 ret_btf_id;
u32 subprogno;
struct btf_field *kptr_field;
+ s64 const_map_key;
};
struct bpf_kfunc_call_arg_meta {
@@ -320,6 +322,7 @@ struct bpf_kfunc_call_arg_meta {
struct btf *arg_btf;
u32 arg_btf_id;
bool arg_owning_ref;
+ bool arg_prog;
struct {
struct btf_field *field;
@@ -336,6 +339,10 @@ struct bpf_kfunc_call_arg_meta {
u8 spi;
u8 frameno;
} iter;
+ struct {
+ struct bpf_map *ptr;
+ int uid;
+ } map;
u64 mem_size;
};
@@ -383,11 +390,6 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env,
verbose(env, " should have been in [%d, %d]\n", range.minval, range.maxval);
}
-static bool type_may_be_null(u32 type)
-{
- return type & PTR_MAYBE_NULL;
-}
-
static bool reg_not_null(const struct bpf_reg_state *reg)
{
enum bpf_reg_type type;
@@ -455,7 +457,7 @@ static bool subprog_is_exc_cb(struct bpf_verifier_env *env, int subprog)
static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
{
- return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK);
+ return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK);
}
static bool type_is_rdonly_mem(u32 type)
@@ -501,8 +503,12 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id)
}
static bool is_sync_callback_calling_kfunc(u32 btf_id);
+static bool is_async_callback_calling_kfunc(u32 btf_id);
+static bool is_callback_calling_kfunc(u32 btf_id);
static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
+static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id);
+
static bool is_sync_callback_calling_function(enum bpf_func_id func_id)
{
return func_id == BPF_FUNC_for_each_map_elem ||
@@ -530,7 +536,8 @@ static bool is_sync_callback_calling_insn(struct bpf_insn *insn)
static bool is_async_callback_calling_insn(struct bpf_insn *insn)
{
- return bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm);
+ return (bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm)) ||
+ (bpf_pseudo_kfunc_call(insn) && is_async_callback_calling_kfunc(insn->imm));
}
static bool is_may_goto_insn(struct bpf_insn *insn)
@@ -573,6 +580,13 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn)
insn->imm == BPF_CMPXCHG;
}
+static bool is_atomic_load_insn(const struct bpf_insn *insn)
+{
+ return BPF_CLASS(insn->code) == BPF_STX &&
+ BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_LOAD_ACQ;
+}
+
static int __get_spi(s32 off)
{
return (-off - 1) / BPF_REG_SIZE;
@@ -637,6 +651,11 @@ static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
return stack_slot_obj_get_spi(env, reg, "iter", nr_slots);
}
+static int irq_flag_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ return stack_slot_obj_get_spi(env, reg, "irq_flag", 1);
+}
+
static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
{
switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
@@ -748,7 +767,7 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
if (clone_ref_obj_id)
id = clone_ref_obj_id;
else
- id = acquire_reference_state(env, insn_idx);
+ id = acquire_reference(env, insn_idx);
if (id < 0)
return id;
@@ -1010,7 +1029,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,
if (spi < 0)
return spi;
- id = acquire_reference_state(env, insn_idx);
+ id = acquire_reference(env, insn_idx);
if (id < 0)
return id;
@@ -1132,10 +1151,148 @@ static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_s
return 0;
}
+static int acquire_irq_state(struct bpf_verifier_env *env, int insn_idx);
+static int release_irq_state(struct bpf_verifier_state *state, int id);
+
+static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env,
+ struct bpf_kfunc_call_arg_meta *meta,
+ struct bpf_reg_state *reg, int insn_idx,
+ int kfunc_class)
+{
+ struct bpf_func_state *state = func(env, reg);
+ struct bpf_stack_state *slot;
+ struct bpf_reg_state *st;
+ int spi, i, id;
+
+ spi = irq_flag_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+
+ id = acquire_irq_state(env, insn_idx);
+ if (id < 0)
+ return id;
+
+ slot = &state->stack[spi];
+ st = &slot->spilled_ptr;
+
+ __mark_reg_known_zero(st);
+ st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
+ st->live |= REG_LIVE_WRITTEN;
+ st->ref_obj_id = id;
+ st->irq.kfunc_class = kfunc_class;
+
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ slot->slot_type[i] = STACK_IRQ_FLAG;
+
+ mark_stack_slot_scratched(env, spi);
+ return 0;
+}
+
+static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ int kfunc_class)
+{
+ struct bpf_func_state *state = func(env, reg);
+ struct bpf_stack_state *slot;
+ struct bpf_reg_state *st;
+ int spi, i, err;
+
+ spi = irq_flag_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+
+ slot = &state->stack[spi];
+ st = &slot->spilled_ptr;
+
+ if (st->irq.kfunc_class != kfunc_class) {
+ const char *flag_kfunc = st->irq.kfunc_class == IRQ_NATIVE_KFUNC ? "native" : "lock";
+ const char *used_kfunc = kfunc_class == IRQ_NATIVE_KFUNC ? "native" : "lock";
+
+ verbose(env, "irq flag acquired by %s kfuncs cannot be restored with %s kfuncs\n",
+ flag_kfunc, used_kfunc);
+ return -EINVAL;
+ }
+
+ err = release_irq_state(env->cur_state, st->ref_obj_id);
+ WARN_ON_ONCE(err && err != -EACCES);
+ if (err) {
+ int insn_idx = 0;
+
+ for (int i = 0; i < env->cur_state->acquired_refs; i++) {
+ if (env->cur_state->refs[i].id == env->cur_state->active_irq_id) {
+ insn_idx = env->cur_state->refs[i].insn_idx;
+ break;
+ }
+ }
+
+ verbose(env, "cannot restore irq state out of order, expected id=%d acquired at insn_idx=%d\n",
+ env->cur_state->active_irq_id, insn_idx);
+ return err;
+ }
+
+ __mark_reg_not_init(env, st);
+
+ /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */
+ st->live |= REG_LIVE_WRITTEN;
+
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ slot->slot_type[i] = STACK_INVALID;
+
+ mark_stack_slot_scratched(env, spi);
+ return 0;
+}
+
+static bool is_irq_flag_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ struct bpf_stack_state *slot;
+ int spi, i;
+
+ /* For -ERANGE (i.e. spi not falling into allocated stack slots), we
+ * will do check_mem_access to check and update stack bounds later, so
+ * return true for that case.
+ */
+ spi = irq_flag_get_spi(env, reg);
+ if (spi == -ERANGE)
+ return true;
+ if (spi < 0)
+ return false;
+
+ slot = &state->stack[spi];
+
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ if (slot->slot_type[i] == STACK_IRQ_FLAG)
+ return false;
+ return true;
+}
+
+static int is_irq_flag_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ struct bpf_stack_state *slot;
+ struct bpf_reg_state *st;
+ int spi, i;
+
+ spi = irq_flag_get_spi(env, reg);
+ if (spi < 0)
+ return -EINVAL;
+
+ slot = &state->stack[spi];
+ st = &slot->spilled_ptr;
+
+ if (!st->ref_obj_id)
+ return -EINVAL;
+
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ if (slot->slot_type[i] != STACK_IRQ_FLAG)
+ return -EINVAL;
+ return 0;
+}
+
/* Check if given stack slot is "special":
* - spilled register state (STACK_SPILL);
* - dynptr state (STACK_DYNPTR);
* - iter state (STACK_ITER).
+ * - irq flag state (STACK_IRQ_FLAG)
*/
static bool is_stack_slot_special(const struct bpf_stack_state *stack)
{
@@ -1145,6 +1302,7 @@ static bool is_stack_slot_special(const struct bpf_stack_state *stack)
case STACK_SPILL:
case STACK_DYNPTR:
case STACK_ITER:
+ case STACK_IRQ_FLAG:
return true;
case STACK_INVALID:
case STACK_MISC:
@@ -1179,14 +1337,17 @@ static bool is_spilled_scalar_reg64(const struct bpf_stack_state *stack)
/* Mark stack slot as STACK_MISC, unless it is already STACK_INVALID, in which
* case they are equivalent, or it's STACK_ZERO, in which case we preserve
* more precise STACK_ZERO.
- * Note, in uprivileged mode leaving STACK_INVALID is wrong, so we take
- * env->allow_ptr_leaks into account and force STACK_MISC, if necessary.
+ * Regardless of allow_ptr_leaks setting (i.e., privileged or unprivileged
+ * mode), we won't promote STACK_INVALID to STACK_MISC. In privileged case it is
+ * unnecessary as both are considered equivalent when loading data and pruning,
+ * in case of unprivileged mode it will be incorrect to allow reads of invalid
+ * slots.
*/
static void mark_stack_slot_misc(struct bpf_verifier_env *env, u8 *stype)
{
if (*stype == STACK_ZERO)
return;
- if (env->allow_ptr_leaks && *stype == STACK_INVALID)
+ if (*stype == STACK_INVALID)
return;
*stype = STACK_MISC;
}
@@ -1256,7 +1417,7 @@ out:
return arr ? arr : ZERO_SIZE_PTR;
}
-static int copy_reference_state(struct bpf_func_state *dst, const struct bpf_func_state *src)
+static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf_verifier_state *src)
{
dst->refs = copy_array(dst->refs, src->refs, src->acquired_refs,
sizeof(struct bpf_reference_state), GFP_KERNEL);
@@ -1264,6 +1425,12 @@ static int copy_reference_state(struct bpf_func_state *dst, const struct bpf_fun
return -ENOMEM;
dst->acquired_refs = src->acquired_refs;
+ dst->active_locks = src->active_locks;
+ dst->active_preempt_locks = src->active_preempt_locks;
+ dst->active_rcu_lock = src->active_rcu_lock;
+ dst->active_irq_id = src->active_irq_id;
+ dst->active_lock_id = src->active_lock_id;
+ dst->active_lock_ptr = src->active_lock_ptr;
return 0;
}
@@ -1280,7 +1447,7 @@ static int copy_stack_state(struct bpf_func_state *dst, const struct bpf_func_st
return 0;
}
-static int resize_reference_state(struct bpf_func_state *state, size_t n)
+static int resize_reference_state(struct bpf_verifier_state *state, size_t n)
{
state->refs = realloc_array(state->refs, state->acquired_refs, n,
sizeof(struct bpf_reference_state));
@@ -1323,61 +1490,175 @@ static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state
* On success, returns a valid pointer id to associate with the register
* On failure, returns a negative errno.
*/
-static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
+static struct bpf_reference_state *acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
{
- struct bpf_func_state *state = cur_func(env);
+ struct bpf_verifier_state *state = env->cur_state;
int new_ofs = state->acquired_refs;
- int id, err;
+ int err;
err = resize_reference_state(state, state->acquired_refs + 1);
if (err)
- return err;
- id = ++env->id_gen;
- state->refs[new_ofs].id = id;
+ return NULL;
state->refs[new_ofs].insn_idx = insn_idx;
- state->refs[new_ofs].callback_ref = state->in_callback_fn ? state->frameno : 0;
- return id;
+ return &state->refs[new_ofs];
+}
+
+static int acquire_reference(struct bpf_verifier_env *env, int insn_idx)
+{
+ struct bpf_reference_state *s;
+
+ s = acquire_reference_state(env, insn_idx);
+ if (!s)
+ return -ENOMEM;
+ s->type = REF_TYPE_PTR;
+ s->id = ++env->id_gen;
+ return s->id;
+}
+
+static int acquire_lock_state(struct bpf_verifier_env *env, int insn_idx, enum ref_state_type type,
+ int id, void *ptr)
+{
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_reference_state *s;
+
+ s = acquire_reference_state(env, insn_idx);
+ if (!s)
+ return -ENOMEM;
+ s->type = type;
+ s->id = id;
+ s->ptr = ptr;
+
+ state->active_locks++;
+ state->active_lock_id = id;
+ state->active_lock_ptr = ptr;
+ return 0;
+}
+
+static int acquire_irq_state(struct bpf_verifier_env *env, int insn_idx)
+{
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_reference_state *s;
+
+ s = acquire_reference_state(env, insn_idx);
+ if (!s)
+ return -ENOMEM;
+ s->type = REF_TYPE_IRQ;
+ s->id = ++env->id_gen;
+
+ state->active_irq_id = s->id;
+ return s->id;
}
-/* release function corresponding to acquire_reference_state(). Idempotent. */
-static int release_reference_state(struct bpf_func_state *state, int ptr_id)
+static void release_reference_state(struct bpf_verifier_state *state, int idx)
{
- int i, last_idx;
+ int last_idx;
+ size_t rem;
+ /* IRQ state requires the relative ordering of elements remaining the
+ * same, since it relies on the refs array to behave as a stack, so that
+ * it can detect out-of-order IRQ restore. Hence use memmove to shift
+ * the array instead of swapping the final element into the deleted idx.
+ */
last_idx = state->acquired_refs - 1;
+ rem = state->acquired_refs - idx - 1;
+ if (last_idx && idx != last_idx)
+ memmove(&state->refs[idx], &state->refs[idx + 1], sizeof(*state->refs) * rem);
+ memset(&state->refs[last_idx], 0, sizeof(*state->refs));
+ state->acquired_refs--;
+ return;
+}
+
+static bool find_reference_state(struct bpf_verifier_state *state, int ptr_id)
+{
+ int i;
+
+ for (i = 0; i < state->acquired_refs; i++)
+ if (state->refs[i].id == ptr_id)
+ return true;
+
+ return false;
+}
+
+static int release_lock_state(struct bpf_verifier_state *state, int type, int id, void *ptr)
+{
+ void *prev_ptr = NULL;
+ u32 prev_id = 0;
+ int i;
+
for (i = 0; i < state->acquired_refs; i++) {
- if (state->refs[i].id == ptr_id) {
- /* Cannot release caller references in callbacks */
- if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
- return -EINVAL;
- if (last_idx && i != last_idx)
- memcpy(&state->refs[i], &state->refs[last_idx],
- sizeof(*state->refs));
- memset(&state->refs[last_idx], 0, sizeof(*state->refs));
- state->acquired_refs--;
+ if (state->refs[i].type == type && state->refs[i].id == id &&
+ state->refs[i].ptr == ptr) {
+ release_reference_state(state, i);
+ state->active_locks--;
+ /* Reassign active lock (id, ptr). */
+ state->active_lock_id = prev_id;
+ state->active_lock_ptr = prev_ptr;
+ return 0;
+ }
+ if (state->refs[i].type & REF_TYPE_LOCK_MASK) {
+ prev_id = state->refs[i].id;
+ prev_ptr = state->refs[i].ptr;
+ }
+ }
+ return -EINVAL;
+}
+
+static int release_irq_state(struct bpf_verifier_state *state, int id)
+{
+ u32 prev_id = 0;
+ int i;
+
+ if (id != state->active_irq_id)
+ return -EACCES;
+
+ for (i = 0; i < state->acquired_refs; i++) {
+ if (state->refs[i].type != REF_TYPE_IRQ)
+ continue;
+ if (state->refs[i].id == id) {
+ release_reference_state(state, i);
+ state->active_irq_id = prev_id;
return 0;
+ } else {
+ prev_id = state->refs[i].id;
}
}
return -EINVAL;
}
+static struct bpf_reference_state *find_lock_state(struct bpf_verifier_state *state, enum ref_state_type type,
+ int id, void *ptr)
+{
+ int i;
+
+ for (i = 0; i < state->acquired_refs; i++) {
+ struct bpf_reference_state *s = &state->refs[i];
+
+ if (!(s->type & type))
+ continue;
+
+ if (s->id == id && s->ptr == ptr)
+ return s;
+ }
+ return NULL;
+}
+
+static void update_peak_states(struct bpf_verifier_env *env)
+{
+ u32 cur_states;
+
+ cur_states = env->explored_states_size + env->free_list_size;
+ env->peak_states = max(env->peak_states, cur_states);
+}
+
static void free_func_state(struct bpf_func_state *state)
{
if (!state)
return;
- kfree(state->refs);
kfree(state->stack);
kfree(state);
}
-static void clear_jmp_history(struct bpf_verifier_state *state)
-{
- kfree(state->jmp_history);
- state->jmp_history = NULL;
- state->jmp_history_cnt = 0;
-}
-
static void free_verifier_state(struct bpf_verifier_state *state,
bool free_self)
{
@@ -1387,23 +1668,62 @@ static void free_verifier_state(struct bpf_verifier_state *state,
free_func_state(state->frame[i]);
state->frame[i] = NULL;
}
- clear_jmp_history(state);
+ kfree(state->refs);
if (free_self)
kfree(state);
}
+/* struct bpf_verifier_state->{parent,loop_entry} refer to states
+ * that are in either of env->{expored_states,free_list}.
+ * In both cases the state is contained in struct bpf_verifier_state_list.
+ */
+static struct bpf_verifier_state_list *state_parent_as_list(struct bpf_verifier_state *st)
+{
+ if (st->parent)
+ return container_of(st->parent, struct bpf_verifier_state_list, state);
+ return NULL;
+}
+
+static struct bpf_verifier_state_list *state_loop_entry_as_list(struct bpf_verifier_state *st)
+{
+ if (st->loop_entry)
+ return container_of(st->loop_entry, struct bpf_verifier_state_list, state);
+ return NULL;
+}
+
+/* A state can be freed if it is no longer referenced:
+ * - is in the env->free_list;
+ * - has no children states;
+ * - is not used as loop_entry.
+ *
+ * Freeing a state can make it's loop_entry free-able.
+ */
+static void maybe_free_verifier_state(struct bpf_verifier_env *env,
+ struct bpf_verifier_state_list *sl)
+{
+ struct bpf_verifier_state_list *loop_entry_sl;
+
+ while (sl && sl->in_free_list &&
+ sl->state.branches == 0 &&
+ sl->state.used_as_loop_entry == 0) {
+ loop_entry_sl = state_loop_entry_as_list(&sl->state);
+ if (loop_entry_sl)
+ loop_entry_sl->state.used_as_loop_entry--;
+ list_del(&sl->node);
+ free_verifier_state(&sl->state, false);
+ kfree(sl);
+ env->free_list_size--;
+ sl = loop_entry_sl;
+ }
+}
+
/* copy verifier state from src to dst growing dst stack space
* when necessary to accommodate larger src stack
*/
static int copy_func_state(struct bpf_func_state *dst,
const struct bpf_func_state *src)
{
- int err;
-
- memcpy(dst, src, offsetof(struct bpf_func_state, acquired_refs));
- err = copy_reference_state(dst, src);
- if (err)
- return err;
+ memcpy(dst, src, offsetof(struct bpf_func_state, stack));
return copy_stack_state(dst, src);
}
@@ -1413,13 +1733,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
struct bpf_func_state *dst;
int i, err;
- dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history,
- src->jmp_history_cnt, sizeof(*dst_state->jmp_history),
- GFP_USER);
- if (!dst_state->jmp_history)
- return -ENOMEM;
- dst_state->jmp_history_cnt = src->jmp_history_cnt;
-
/* if dst has more stack frames then src frame, free them, this is also
* necessary in case of exceptional exits using bpf_throw.
*/
@@ -1427,19 +1740,23 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
free_func_state(dst_state->frame[i]);
dst_state->frame[i] = NULL;
}
+ err = copy_reference_state(dst_state, src);
+ if (err)
+ return err;
dst_state->speculative = src->speculative;
- dst_state->active_rcu_lock = src->active_rcu_lock;
+ dst_state->in_sleepable = src->in_sleepable;
dst_state->curframe = src->curframe;
- dst_state->active_lock.ptr = src->active_lock.ptr;
- dst_state->active_lock.id = src->active_lock.id;
dst_state->branches = src->branches;
dst_state->parent = src->parent;
dst_state->first_insn_idx = src->first_insn_idx;
dst_state->last_insn_idx = src->last_insn_idx;
+ dst_state->insn_hist_start = src->insn_hist_start;
+ dst_state->insn_hist_end = src->insn_hist_end;
dst_state->dfs_depth = src->dfs_depth;
dst_state->callback_unroll_depth = src->callback_unroll_depth;
dst_state->used_as_loop_entry = src->used_as_loop_entry;
dst_state->may_goto_depth = src->may_goto_depth;
+ dst_state->loop_entry = src->loop_entry;
for (i = 0; i <= src->curframe; i++) {
dst = dst_state->frame[i];
if (!dst) {
@@ -1460,7 +1777,7 @@ static u32 state_htab_size(struct bpf_verifier_env *env)
return env->prog->len;
}
-static struct bpf_verifier_state_list **explored_state(struct bpf_verifier_env *env, int idx)
+static struct list_head *explored_state(struct bpf_verifier_env *env, int idx)
{
struct bpf_verifier_state *cur = env->cur_state;
struct bpf_func_state *state = cur->frame[cur->curframe];
@@ -1568,16 +1885,13 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta
* # Find outermost loop entry known for n
* def get_loop_entry(n):
* h = entries.get(n, None)
- * while h in entries and entries[h] != h:
+ * while h in entries:
* h = entries[h]
* return h
*
- * # Update n's loop entry if h's outermost entry comes
- * # before n's outermost entry in current DFS path.
+ * # Update n's loop entry if h comes before n in current DFS path.
* def update_loop_entry(n, h):
- * n1 = get_loop_entry(n) or n
- * h1 = get_loop_entry(h) or h
- * if h1 in path and depths[h1] <= depths[n1]:
+ * if h in path and depths[entries.get(n, n)] < depths[n]:
* entries[n] = h1
*
* def dfs(n, depth):
@@ -1589,7 +1903,7 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta
* # Case A: explore succ and update cur's loop entry
* # only if succ's entry is in current DFS path.
* dfs(succ, depth + 1)
- * h = get_loop_entry(succ)
+ * h = entries.get(succ, None)
* update_loop_entry(n, h)
* else:
* # Case B or C depending on `h1 in path` check in update_loop_entry().
@@ -1601,46 +1915,46 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta
* and cur's loop entry has to be updated (case A), handle this in
* update_branch_counts();
* - use st->branch > 0 as a signal that st is in the current DFS path;
- * - handle cases B and C in is_state_visited();
- * - update topmost loop entry for intermediate states in get_loop_entry().
+ * - handle cases B and C in is_state_visited().
*/
-static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_state *st)
+static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st)
{
- struct bpf_verifier_state *topmost = st->loop_entry, *old;
+ struct bpf_verifier_state *topmost = st->loop_entry;
+ u32 steps = 0;
- while (topmost && topmost->loop_entry && topmost != topmost->loop_entry)
+ while (topmost && topmost->loop_entry) {
+ if (verifier_bug_if(steps++ > st->dfs_depth, env, "infinite loop"))
+ return ERR_PTR(-EFAULT);
topmost = topmost->loop_entry;
- /* Update loop entries for intermediate states to avoid this
- * traversal in future get_loop_entry() calls.
- */
- while (st && st->loop_entry != topmost) {
- old = st->loop_entry;
- st->loop_entry = topmost;
- st = old;
}
return topmost;
}
-static void update_loop_entry(struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr)
+static void update_loop_entry(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr)
{
- struct bpf_verifier_state *cur1, *hdr1;
-
- cur1 = get_loop_entry(cur) ?: cur;
- hdr1 = get_loop_entry(hdr) ?: hdr;
- /* The head1->branches check decides between cases B and C in
- * comment for get_loop_entry(). If hdr1->branches == 0 then
+ /* The hdr->branches check decides between cases B and C in
+ * comment for get_loop_entry(). If hdr->branches == 0 then
* head's topmost loop entry is not in current DFS path,
* hence 'cur' and 'hdr' are not in the same loop and there is
* no need to update cur->loop_entry.
*/
- if (hdr1->branches && hdr1->dfs_depth <= cur1->dfs_depth) {
+ if (hdr->branches && hdr->dfs_depth < (cur->loop_entry ?: cur)->dfs_depth) {
+ if (cur->loop_entry) {
+ cur->loop_entry->used_as_loop_entry--;
+ maybe_free_verifier_state(env, state_loop_entry_as_list(cur));
+ }
cur->loop_entry = hdr;
- hdr->used_as_loop_entry = true;
+ hdr->used_as_loop_entry++;
}
}
static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
{
+ struct bpf_verifier_state_list *sl = NULL, *parent_sl;
+ struct bpf_verifier_state *parent;
+
while (st) {
u32 br = --st->branches;
@@ -1650,7 +1964,7 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi
* This is a part of 'case A' in get_loop_entry() comment.
*/
if (br == 0 && st->parent && st->loop_entry)
- update_loop_entry(st->parent, st->loop_entry);
+ update_loop_entry(env, st->parent, st->loop_entry);
/* WARN_ON(br > 1) technically makes sense here,
* but see comment in push_stack(), hence:
@@ -1660,7 +1974,12 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi
br);
if (br)
break;
- st = st->parent;
+ parent = st->parent;
+ parent_sl = state_parent_as_list(st);
+ if (sl)
+ maybe_free_verifier_state(env, sl);
+ st = parent;
+ sl = parent_sl;
}
}
@@ -1842,6 +2161,8 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
*/
if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER))
reg->map_uid = reg->id;
+ if (btf_record_has_field(map->inner_map_meta->record, BPF_WORKQUEUE))
+ reg->map_uid = reg->id;
} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
reg->type = PTR_TO_XDP_SOCK;
} else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
@@ -2135,7 +2456,7 @@ static void __reg64_deduce_bounds(struct bpf_reg_state *reg)
static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg)
{
/* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit
- * values on both sides of 64-bit range in hope to have tigher range.
+ * values on both sides of 64-bit range in hope to have tighter range.
* E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from
* 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff].
* With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound
@@ -2143,7 +2464,7 @@ static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg)
* _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a
* better overall bounds for r1 as [0x1'000000001; 0x3'7fffffff].
* We just need to make sure that derived bounds we are intersecting
- * with are well-formed ranges in respecitve s64 or u64 domain, just
+ * with are well-formed ranges in respective s64 or u64 domain, just
* like we do with similar kinds of 32-to-64 or 64-to-32 adjustments.
*/
__u64 new_umin, new_umax;
@@ -2173,6 +2494,44 @@ static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg)
reg->smin_value = max_t(s64, reg->smin_value, new_smin);
reg->smax_value = min_t(s64, reg->smax_value, new_smax);
}
+
+ /* Here we would like to handle a special case after sign extending load,
+ * when upper bits for a 64-bit range are all 1s or all 0s.
+ *
+ * Upper bits are all 1s when register is in a range:
+ * [0xffff_ffff_0000_0000, 0xffff_ffff_ffff_ffff]
+ * Upper bits are all 0s when register is in a range:
+ * [0x0000_0000_0000_0000, 0x0000_0000_ffff_ffff]
+ * Together this forms are continuous range:
+ * [0xffff_ffff_0000_0000, 0x0000_0000_ffff_ffff]
+ *
+ * Now, suppose that register range is in fact tighter:
+ * [0xffff_ffff_8000_0000, 0x0000_0000_ffff_ffff] (R)
+ * Also suppose that it's 32-bit range is positive,
+ * meaning that lower 32-bits of the full 64-bit register
+ * are in the range:
+ * [0x0000_0000, 0x7fff_ffff] (W)
+ *
+ * If this happens, then any value in a range:
+ * [0xffff_ffff_0000_0000, 0xffff_ffff_7fff_ffff]
+ * is smaller than a lowest bound of the range (R):
+ * 0xffff_ffff_8000_0000
+ * which means that upper bits of the full 64-bit register
+ * can't be all 1s, when lower bits are in range (W).
+ *
+ * Note that:
+ * - 0xffff_ffff_8000_0000 == (s64)S32_MIN
+ * - 0x0000_0000_7fff_ffff == (s64)S32_MAX
+ * These relations are used in the conditions below.
+ */
+ if (reg->s32_min_value >= 0 && reg->smin_value >= S32_MIN && reg->smax_value <= S32_MAX) {
+ reg->smin_value = reg->s32_min_value;
+ reg->smax_value = reg->s32_max_value;
+ reg->umin_value = reg->s32_min_value;
+ reg->umax_value = reg->s32_max_value;
+ reg->var_off = tnum_intersect(reg->var_off,
+ tnum_range(reg->smin_value, reg->smax_value));
+ }
}
static void __reg_deduce_bounds(struct bpf_reg_state *reg)
@@ -2325,6 +2684,25 @@ static void mark_reg_unknown(struct bpf_verifier_env *env,
__mark_reg_unknown(env, regs + regno);
}
+static int __mark_reg_s32_range(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs,
+ u32 regno,
+ s32 s32_min,
+ s32 s32_max)
+{
+ struct bpf_reg_state *reg = regs + regno;
+
+ reg->s32_min_value = max_t(s32, reg->s32_min_value, s32_min);
+ reg->s32_max_value = min_t(s32, reg->s32_max_value, s32_max);
+
+ reg->smin_value = max_t(s64, reg->smin_value, s32_min);
+ reg->smax_value = min_t(s64, reg->smax_value, s32_max);
+
+ reg_bounds_sync(reg);
+
+ return reg_bounds_sanity_check(env, reg, "s32_range");
+}
+
static void __mark_reg_not_init(const struct bpf_verifier_env *env,
struct bpf_reg_state *reg)
{
@@ -2359,6 +2737,8 @@ static void mark_btf_ld_reg(struct bpf_verifier_env *env,
regs[regno].type = PTR_TO_BTF_ID | flag;
regs[regno].btf = btf;
regs[regno].btf_id = btf_id;
+ if (type_may_be_null(flag))
+ regs[regno].id = ++env->id_gen;
}
#define DEF_NOT_SUBREG (0)
@@ -2402,7 +2782,7 @@ static void init_func_state(struct bpf_verifier_env *env,
/* Similar to push_stack(), but for async callbacks */
static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
int insn_idx, int prev_insn_idx,
- int subprog)
+ int subprog, bool is_sleepable)
{
struct bpf_verifier_stack_elem *elem;
struct bpf_func_state *frame;
@@ -2427,8 +2807,14 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
* The caller state doesn't matter.
* This is async callback. It starts in a fresh stack.
* Initialize it similar to do_check_common().
+ * But we do need to make sure to not clobber insn_hist, so we keep
+ * chaining insn_hist_start/insn_hist_end indices as for a normal
+ * child state.
*/
elem->st.branches = 1;
+ elem->st.in_sleepable = is_sleepable;
+ elem->st.insn_hist_start = env->cur_state->insn_hist_end;
+ elem->st.insn_hist_end = elem->st.insn_hist_start;
frame = kzalloc(sizeof(*frame), GFP_KERNEL);
if (!frame)
goto err;
@@ -2459,16 +2845,36 @@ static int cmp_subprogs(const void *a, const void *b)
((struct bpf_subprog_info *)b)->start;
}
+/* Find subprogram that contains instruction at 'off' */
+static struct bpf_subprog_info *find_containing_subprog(struct bpf_verifier_env *env, int off)
+{
+ struct bpf_subprog_info *vals = env->subprog_info;
+ int l, r, m;
+
+ if (off >= env->prog->len || off < 0 || env->subprog_cnt == 0)
+ return NULL;
+
+ l = 0;
+ r = env->subprog_cnt - 1;
+ while (l < r) {
+ m = l + (r - l + 1) / 2;
+ if (vals[m].start <= off)
+ l = m;
+ else
+ r = m - 1;
+ }
+ return &vals[l];
+}
+
+/* Find subprogram that starts exactly at 'off' */
static int find_subprog(struct bpf_verifier_env *env, int off)
{
struct bpf_subprog_info *p;
- p = bsearch(&off, env->subprog_info, env->subprog_cnt,
- sizeof(env->subprog_info[0]), cmp_subprogs);
- if (!p)
+ p = find_containing_subprog(env, off);
+ if (!p || p->start != off)
return -ENOENT;
return p - env->subprog_info;
-
}
static int add_subprog(struct bpf_verifier_env *env, int off)
@@ -2684,10 +3090,16 @@ static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env,
b->module = mod;
b->offset = offset;
+ /* sort() reorders entries by value, so b may no longer point
+ * to the right entry after this
+ */
sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
kfunc_btf_cmp_by_off, NULL);
+ } else {
+ btf = b->btf;
}
- return b->btf;
+
+ return btf;
}
void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab)
@@ -2892,6 +3304,21 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
return res ? &res->func_model : NULL;
}
+static int add_kfunc_in_insns(struct bpf_verifier_env *env,
+ struct bpf_insn *insn, int cnt)
+{
+ int i, ret;
+
+ for (i = 0; i < cnt; i++, insn++) {
+ if (bpf_pseudo_kfunc_call(insn)) {
+ ret = add_kfunc_call(env, insn->imm, insn->off);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return 0;
+}
+
static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
{
struct bpf_subprog_info *subprog = env->subprog_info;
@@ -2955,6 +3382,15 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
return 0;
}
+static int jmp_offset(struct bpf_insn *insn)
+{
+ u8 code = insn->code;
+
+ if (code == (BPF_JMP32 | BPF_JA))
+ return insn->imm;
+ return insn->off;
+}
+
static int check_subprogs(struct bpf_verifier_env *env)
{
int i, subprog_start, subprog_end, off, cur_subprog = 0;
@@ -2970,8 +3406,10 @@ static int check_subprogs(struct bpf_verifier_env *env)
if (code == (BPF_JMP | BPF_CALL) &&
insn[i].src_reg == 0 &&
- insn[i].imm == BPF_FUNC_tail_call)
+ insn[i].imm == BPF_FUNC_tail_call) {
subprog[cur_subprog].has_tail_call = true;
+ subprog[cur_subprog].tail_call_reachable = true;
+ }
if (BPF_CLASS(code) == BPF_LD &&
(BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND))
subprog[cur_subprog].has_ld_abs = true;
@@ -2979,10 +3417,7 @@ static int check_subprogs(struct bpf_verifier_env *env)
goto next;
if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
goto next;
- if (code == (BPF_JMP32 | BPF_JA))
- off = i + insn[i].imm + 1;
- else
- off = i + insn[i].off + 1;
+ off = i + jmp_offset(&insn[i]) + 1;
if (off < subprog_start || off >= subprog_end) {
verbose(env, "jump out of range from insn %d to %d\n", i, off);
return -EINVAL;
@@ -3022,12 +3457,11 @@ static int mark_reg_read(struct bpf_verifier_env *env,
/* if read wasn't screened by an earlier write ... */
if (writes && state->live & REG_LIVE_WRITTEN)
break;
- if (parent->live & REG_LIVE_DONE) {
- verbose(env, "verifier BUG type %s var_off %lld off %d\n",
- reg_type_str(env, parent->type),
- parent->var_off.value, parent->off);
+ if (verifier_bug_if(parent->live & REG_LIVE_DONE, env,
+ "type %s var_off %lld off %d",
+ reg_type_str(env, parent->type),
+ parent->var_off.value, parent->off))
return -EFAULT;
- }
/* The first condition is more likely to be true than the
* second, checked it first.
*/
@@ -3059,10 +3493,27 @@ static int mark_reg_read(struct bpf_verifier_env *env,
return 0;
}
-static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+static int mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ int spi, int nr_slots)
{
struct bpf_func_state *state = func(env, reg);
- int spi, ret;
+ int err, i;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr;
+
+ err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64);
+ if (err)
+ return err;
+
+ mark_stack_slot_scratched(env, spi - i);
+ }
+ return 0;
+}
+
+static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ int spi;
/* For CONST_PTR_TO_DYNPTR, it must have already been done by
* check_reg_arg in check_helper_call and mark_btf_func_reg_size in
@@ -3077,31 +3528,23 @@ static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *
* bounds and spi is the first dynptr slot. Simply mark stack slot as
* read.
*/
- ret = mark_reg_read(env, &state->stack[spi].spilled_ptr,
- state->stack[spi].spilled_ptr.parent, REG_LIVE_READ64);
- if (ret)
- return ret;
- return mark_reg_read(env, &state->stack[spi - 1].spilled_ptr,
- state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64);
+ return mark_stack_slot_obj_read(env, reg, spi, BPF_DYNPTR_NR_SLOTS);
}
static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
int spi, int nr_slots)
{
- struct bpf_func_state *state = func(env, reg);
- int err, i;
-
- for (i = 0; i < nr_slots; i++) {
- struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr;
-
- err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64);
- if (err)
- return err;
+ return mark_stack_slot_obj_read(env, reg, spi, nr_slots);
+}
- mark_stack_slot_scratched(env, spi - i);
- }
+static int mark_irq_flag_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ int spi;
- return 0;
+ spi = irq_flag_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+ return mark_stack_slot_obj_read(env, reg, spi, 1);
}
/* This function is supposed to be used by the following 32-bit optimization
@@ -3158,7 +3601,7 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
if (class == BPF_STX) {
- /* BPF_STX (including atomic variants) has multiple source
+ /* BPF_STX (including atomic variants) has one or more source
* operands, one of which is a ptr. Check whether the caller is
* asking about it.
*/
@@ -3203,15 +3646,16 @@ static int insn_def_regno(const struct bpf_insn *insn)
case BPF_ST:
return -1;
case BPF_STX:
- if (BPF_MODE(insn->code) == BPF_ATOMIC &&
- (insn->imm & BPF_FETCH)) {
+ if (BPF_MODE(insn->code) == BPF_ATOMIC ||
+ BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) {
if (insn->imm == BPF_CMPXCHG)
return BPF_REG_0;
- else
+ else if (insn->imm == BPF_LOAD_ACQ)
+ return insn->dst_reg;
+ else if (insn->imm & BPF_FETCH)
return insn->src_reg;
- } else {
- return -1;
}
+ return -1;
default:
return insn->dst_reg;
}
@@ -3320,12 +3764,89 @@ static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx)
return env->insn_aux_data[insn_idx].jmp_point;
}
+#define LR_FRAMENO_BITS 3
+#define LR_SPI_BITS 6
+#define LR_ENTRY_BITS (LR_SPI_BITS + LR_FRAMENO_BITS + 1)
+#define LR_SIZE_BITS 4
+#define LR_FRAMENO_MASK ((1ull << LR_FRAMENO_BITS) - 1)
+#define LR_SPI_MASK ((1ull << LR_SPI_BITS) - 1)
+#define LR_SIZE_MASK ((1ull << LR_SIZE_BITS) - 1)
+#define LR_SPI_OFF LR_FRAMENO_BITS
+#define LR_IS_REG_OFF (LR_SPI_BITS + LR_FRAMENO_BITS)
+#define LINKED_REGS_MAX 6
+
+struct linked_reg {
+ u8 frameno;
+ union {
+ u8 spi;
+ u8 regno;
+ };
+ bool is_reg;
+};
+
+struct linked_regs {
+ int cnt;
+ struct linked_reg entries[LINKED_REGS_MAX];
+};
+
+static struct linked_reg *linked_regs_push(struct linked_regs *s)
+{
+ if (s->cnt < LINKED_REGS_MAX)
+ return &s->entries[s->cnt++];
+
+ return NULL;
+}
+
+/* Use u64 as a vector of 6 10-bit values, use first 4-bits to track
+ * number of elements currently in stack.
+ * Pack one history entry for linked registers as 10 bits in the following format:
+ * - 3-bits frameno
+ * - 6-bits spi_or_reg
+ * - 1-bit is_reg
+ */
+static u64 linked_regs_pack(struct linked_regs *s)
+{
+ u64 val = 0;
+ int i;
+
+ for (i = 0; i < s->cnt; ++i) {
+ struct linked_reg *e = &s->entries[i];
+ u64 tmp = 0;
+
+ tmp |= e->frameno;
+ tmp |= e->spi << LR_SPI_OFF;
+ tmp |= (e->is_reg ? 1 : 0) << LR_IS_REG_OFF;
+
+ val <<= LR_ENTRY_BITS;
+ val |= tmp;
+ }
+ val <<= LR_SIZE_BITS;
+ val |= s->cnt;
+ return val;
+}
+
+static void linked_regs_unpack(u64 val, struct linked_regs *s)
+{
+ int i;
+
+ s->cnt = val & LR_SIZE_MASK;
+ val >>= LR_SIZE_BITS;
+
+ for (i = 0; i < s->cnt; ++i) {
+ struct linked_reg *e = &s->entries[i];
+
+ e->frameno = val & LR_FRAMENO_MASK;
+ e->spi = (val >> LR_SPI_OFF) & LR_SPI_MASK;
+ e->is_reg = (val >> LR_IS_REG_OFF) & 0x1;
+ val >>= LR_ENTRY_BITS;
+ }
+}
+
/* for any branch, call, exit record the history of jmps in the given state */
-static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
- int insn_flags)
+static int push_insn_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
+ int insn_flags, u64 linked_regs)
{
- u32 cnt = cur->jmp_history_cnt;
- struct bpf_jmp_history_entry *p;
+ struct bpf_insn_hist_entry *p;
size_t alloc_size;
/* combine instruction flags if we already recorded this instruction */
@@ -3333,36 +3854,44 @@ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_st
/* atomic instructions push insn_flags twice, for READ and
* WRITE sides, but they should agree on stack slot
*/
- WARN_ONCE((env->cur_hist_ent->flags & insn_flags) &&
- (env->cur_hist_ent->flags & insn_flags) != insn_flags,
- "verifier insn history bug: insn_idx %d cur flags %x new flags %x\n",
- env->insn_idx, env->cur_hist_ent->flags, insn_flags);
+ verifier_bug_if((env->cur_hist_ent->flags & insn_flags) &&
+ (env->cur_hist_ent->flags & insn_flags) != insn_flags,
+ env, "insn history: insn_idx %d cur flags %x new flags %x",
+ env->insn_idx, env->cur_hist_ent->flags, insn_flags);
env->cur_hist_ent->flags |= insn_flags;
+ verifier_bug_if(env->cur_hist_ent->linked_regs != 0, env,
+ "insn history: insn_idx %d linked_regs: %#llx",
+ env->insn_idx, env->cur_hist_ent->linked_regs);
+ env->cur_hist_ent->linked_regs = linked_regs;
return 0;
}
- cnt++;
- alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
- p = krealloc(cur->jmp_history, alloc_size, GFP_USER);
- if (!p)
- return -ENOMEM;
- cur->jmp_history = p;
+ if (cur->insn_hist_end + 1 > env->insn_hist_cap) {
+ alloc_size = size_mul(cur->insn_hist_end + 1, sizeof(*p));
+ p = kvrealloc(env->insn_hist, alloc_size, GFP_USER);
+ if (!p)
+ return -ENOMEM;
+ env->insn_hist = p;
+ env->insn_hist_cap = alloc_size / sizeof(*p);
+ }
- p = &cur->jmp_history[cnt - 1];
+ p = &env->insn_hist[cur->insn_hist_end];
p->idx = env->insn_idx;
p->prev_idx = env->prev_insn_idx;
p->flags = insn_flags;
- cur->jmp_history_cnt = cnt;
+ p->linked_regs = linked_regs;
+
+ cur->insn_hist_end++;
env->cur_hist_ent = p;
return 0;
}
-static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st,
- u32 hist_end, int insn_idx)
+static struct bpf_insn_hist_entry *get_insn_hist_entry(struct bpf_verifier_env *env,
+ u32 hist_start, u32 hist_end, int insn_idx)
{
- if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx)
- return &st->jmp_history[hist_end - 1];
+ if (hist_end > hist_start && env->insn_hist[hist_end - 1].idx == insn_idx)
+ return &env->insn_hist[hist_end - 1];
return NULL;
}
@@ -3379,25 +3908,26 @@ static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_stat
* history entry recording a jump from last instruction of parent state and
* first instruction of given state.
*/
-static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
- u32 *history)
+static int get_prev_insn_idx(const struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st,
+ int insn_idx, u32 hist_start, u32 *hist_endp)
{
- u32 cnt = *history;
+ u32 hist_end = *hist_endp;
+ u32 cnt = hist_end - hist_start;
- if (i == st->first_insn_idx) {
+ if (insn_idx == st->first_insn_idx) {
if (cnt == 0)
return -ENOENT;
- if (cnt == 1 && st->jmp_history[0].idx == i)
+ if (cnt == 1 && env->insn_hist[hist_start].idx == insn_idx)
return -ENOENT;
}
- if (cnt && st->jmp_history[cnt - 1].idx == i) {
- i = st->jmp_history[cnt - 1].prev_idx;
- (*history)--;
+ if (cnt && env->insn_hist[hist_end - 1].idx == insn_idx) {
+ (*hist_endp)--;
+ return env->insn_hist[hist_end - 1].prev_idx;
} else {
- i--;
+ return insn_idx - 1;
}
- return i;
}
static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
@@ -3416,6 +3946,17 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
return btf_name_by_offset(desc_btf, func->name_off);
}
+static void verbose_insn(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+ const struct bpf_insn_cbs cbs = {
+ .cb_call = disasm_kfunc_name,
+ .cb_print = verbose,
+ .private_data = env,
+ };
+
+ print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
+}
+
static inline void bt_init(struct backtrack_state *bt, u32 frame)
{
bt->frame = frame;
@@ -3443,8 +3984,7 @@ static inline u32 bt_empty(struct backtrack_state *bt)
static inline int bt_subprog_enter(struct backtrack_state *bt)
{
if (bt->frame == MAX_CALL_FRAMES - 1) {
- verbose(bt->env, "BUG subprog enter from frame %d\n", bt->frame);
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(bt->env, "subprog enter from frame %d", bt->frame);
return -EFAULT;
}
bt->frame++;
@@ -3454,8 +3994,7 @@ static inline int bt_subprog_enter(struct backtrack_state *bt)
static inline int bt_subprog_exit(struct backtrack_state *bt)
{
if (bt->frame == 0) {
- verbose(bt->env, "BUG subprog exit from frame 0\n");
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(bt->env, "subprog exit from frame 0");
return -EFAULT;
}
bt->frame--;
@@ -3517,6 +4056,11 @@ static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg)
return bt->reg_masks[bt->frame] & (1 << reg);
}
+static inline bool bt_is_frame_reg_set(struct backtrack_state *bt, u32 frame, u32 reg)
+{
+ return bt->reg_masks[frame] & (1 << reg);
+}
+
static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u32 slot)
{
return bt->stack_masks[frame] & (1ull << slot);
@@ -3561,6 +4105,42 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
}
}
+/* If any register R in hist->linked_regs is marked as precise in bt,
+ * do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs.
+ */
+static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_insn_hist_entry *hist)
+{
+ struct linked_regs linked_regs;
+ bool some_precise = false;
+ int i;
+
+ if (!hist || hist->linked_regs == 0)
+ return;
+
+ linked_regs_unpack(hist->linked_regs, &linked_regs);
+ for (i = 0; i < linked_regs.cnt; ++i) {
+ struct linked_reg *e = &linked_regs.entries[i];
+
+ if ((e->is_reg && bt_is_frame_reg_set(bt, e->frameno, e->regno)) ||
+ (!e->is_reg && bt_is_frame_slot_set(bt, e->frameno, e->spi))) {
+ some_precise = true;
+ break;
+ }
+ }
+
+ if (!some_precise)
+ return;
+
+ for (i = 0; i < linked_regs.cnt; ++i) {
+ struct linked_reg *e = &linked_regs.entries[i];
+
+ if (e->is_reg)
+ bt_set_frame_reg(bt, e->frameno, e->regno);
+ else
+ bt_set_frame_slot(bt, e->frameno, e->spi);
+ }
+}
+
static bool calls_callback(struct bpf_verifier_env *env, int insn_idx);
/* For given verifier state backtrack_insn() is called from the last insn to
@@ -3573,13 +4153,8 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx);
* - *was* processed previously during backtracking.
*/
static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
- struct bpf_jmp_history_entry *hist, struct backtrack_state *bt)
+ struct bpf_insn_hist_entry *hist, struct backtrack_state *bt)
{
- const struct bpf_insn_cbs cbs = {
- .cb_call = disasm_kfunc_name,
- .cb_print = verbose,
- .private_data = env,
- };
struct bpf_insn *insn = env->prog->insnsi + idx;
u8 class = BPF_CLASS(insn->code);
u8 opcode = BPF_OP(insn->code);
@@ -3597,9 +4172,15 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt));
verbose(env, "stack=%s before ", env->tmp_str_buf);
verbose(env, "%d: ", idx);
- print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
+ verbose_insn(env, insn);
}
+ /* If there is a history record that some registers gained range at this insn,
+ * propagate precision marks to those registers, so that bt_is_reg_set()
+ * accounts for these registers.
+ */
+ bt_sync_linked_regs(bt, hist);
+
if (class == BPF_ALU || class == BPF_ALU64) {
if (!bt_is_reg_set(bt, dreg))
return 0;
@@ -3615,7 +4196,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* sreg needs precision before this insn
*/
bt_clear_reg(bt, dreg);
- bt_set_reg(bt, sreg);
+ if (sreg != BPF_REG_FP)
+ bt_set_reg(bt, sreg);
} else {
/* dreg = K
* dreg needs precision after this insn.
@@ -3631,12 +4213,13 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* both dreg and sreg need precision
* before this insn
*/
- bt_set_reg(bt, sreg);
+ if (sreg != BPF_REG_FP)
+ bt_set_reg(bt, sreg);
} /* else dreg += K
* dreg still needs precision before this insn
*/
}
- } else if (class == BPF_LDX) {
+ } else if (class == BPF_LDX || is_atomic_load_insn(insn)) {
if (!bt_is_reg_set(bt, dreg))
return 0;
bt_clear_reg(bt, dreg);
@@ -3689,14 +4272,15 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* should be literally next instruction in
* caller program
*/
- WARN_ONCE(idx + 1 != subseq_idx, "verifier backtracking bug");
+ verifier_bug_if(idx + 1 != subseq_idx, env,
+ "extra insn from subprog");
/* r1-r5 are invalidated after subprog call,
* so for global func call it shouldn't be set
* anymore
*/
if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
- verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(env, "global subprog unexpected regs %x",
+ bt_reg_mask(bt));
return -EFAULT;
}
/* global subprog always sets R0 */
@@ -3710,16 +4294,17 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* the current frame should be zero by now
*/
if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
- verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(env, "static subprog unexpected regs %x",
+ bt_reg_mask(bt));
return -EFAULT;
}
/* we are now tracking register spills correctly,
* so any instance of leftover slots is a bug
*/
if (bt_stack_mask(bt) != 0) {
- verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt));
- WARN_ONCE(1, "verifier backtracking bug (subprog leftover stack slots)");
+ verifier_bug(env,
+ "static subprog leftover stack slots %llx",
+ bt_stack_mask(bt));
return -EFAULT;
}
/* propagate r1-r5 to the caller */
@@ -3742,13 +4327,13 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* not actually arguments passed directly to callback subprogs
*/
if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
- verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(env, "callback unexpected regs %x",
+ bt_reg_mask(bt));
return -EFAULT;
}
if (bt_stack_mask(bt) != 0) {
- verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt));
- WARN_ONCE(1, "verifier backtracking bug (callback leftover stack slots)");
+ verifier_bug(env, "callback leftover stack slots %llx",
+ bt_stack_mask(bt));
return -EFAULT;
}
/* clear r1-r5 in callback subprog's mask */
@@ -3767,11 +4352,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
/* regular helper call sets R0 */
bt_clear_reg(bt, BPF_REG_0);
if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
- /* if backtracing was looking for registers R1-R5
+ /* if backtracking was looking for registers R1-R5
* they should have been found already.
*/
- verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(env, "backtracking call unexpected regs %x",
+ bt_reg_mask(bt));
return -EFAULT;
}
} else if (opcode == BPF_EXIT) {
@@ -3789,8 +4374,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
for (i = BPF_REG_1; i <= BPF_REG_5; i++)
bt_clear_reg(bt, i);
if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
- verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(env, "backtracking exit unexpected regs %x",
+ bt_reg_mask(bt));
return -EFAULT;
}
@@ -3825,9 +4410,12 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* before it would be equally necessary to
* propagate it to dreg.
*/
- bt_set_reg(bt, dreg);
- bt_set_reg(bt, sreg);
- /* else dreg <cond> K
+ if (!hist || !(hist->flags & INSN_F_SRC_REG_STACK))
+ bt_set_reg(bt, sreg);
+ if (!hist || !(hist->flags & INSN_F_DST_REG_STACK))
+ bt_set_reg(bt, dreg);
+ } else if (BPF_SRC(insn->code) == BPF_K) {
+ /* dreg <cond> K
* Only dreg still needs precision before
* this insn, so for the K-based conditional
* there is nothing new to be marked.
@@ -3845,6 +4433,10 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
/* to be analyzed */
return -ENOTSUPP;
}
+ /* Propagate precision marks to linked registers, to account for
+ * registers marked as precise in this function.
+ */
+ bt_sync_linked_regs(bt, hist);
return 0;
}
@@ -3972,96 +4564,6 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_
}
}
-static bool idset_contains(struct bpf_idset *s, u32 id)
-{
- u32 i;
-
- for (i = 0; i < s->count; ++i)
- if (s->ids[i] == id)
- return true;
-
- return false;
-}
-
-static int idset_push(struct bpf_idset *s, u32 id)
-{
- if (WARN_ON_ONCE(s->count >= ARRAY_SIZE(s->ids)))
- return -EFAULT;
- s->ids[s->count++] = id;
- return 0;
-}
-
-static void idset_reset(struct bpf_idset *s)
-{
- s->count = 0;
-}
-
-/* Collect a set of IDs for all registers currently marked as precise in env->bt.
- * Mark all registers with these IDs as precise.
- */
-static int mark_precise_scalar_ids(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
-{
- struct bpf_idset *precise_ids = &env->idset_scratch;
- struct backtrack_state *bt = &env->bt;
- struct bpf_func_state *func;
- struct bpf_reg_state *reg;
- DECLARE_BITMAP(mask, 64);
- int i, fr;
-
- idset_reset(precise_ids);
-
- for (fr = bt->frame; fr >= 0; fr--) {
- func = st->frame[fr];
-
- bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr));
- for_each_set_bit(i, mask, 32) {
- reg = &func->regs[i];
- if (!reg->id || reg->type != SCALAR_VALUE)
- continue;
- if (idset_push(precise_ids, reg->id))
- return -EFAULT;
- }
-
- bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
- for_each_set_bit(i, mask, 64) {
- if (i >= func->allocated_stack / BPF_REG_SIZE)
- break;
- if (!is_spilled_scalar_reg(&func->stack[i]))
- continue;
- reg = &func->stack[i].spilled_ptr;
- if (!reg->id)
- continue;
- if (idset_push(precise_ids, reg->id))
- return -EFAULT;
- }
- }
-
- for (fr = 0; fr <= st->curframe; ++fr) {
- func = st->frame[fr];
-
- for (i = BPF_REG_0; i < BPF_REG_10; ++i) {
- reg = &func->regs[i];
- if (!reg->id)
- continue;
- if (!idset_contains(precise_ids, reg->id))
- continue;
- bt_set_frame_reg(bt, fr, i);
- }
- for (i = 0; i < func->allocated_stack / BPF_REG_SIZE; ++i) {
- if (!is_spilled_scalar_reg(&func->stack[i]))
- continue;
- reg = &func->stack[i].spilled_ptr;
- if (!reg->id)
- continue;
- if (!idset_contains(precise_ids, reg->id))
- continue;
- bt_set_frame_slot(bt, fr, i);
- }
- }
-
- return 0;
-}
-
/*
* __mark_chain_precision() backtracks BPF program instruction sequence and
* chain of verifier states making sure that register *regno* (if regno >= 0)
@@ -4069,7 +4571,7 @@ static int mark_precise_scalar_ids(struct bpf_verifier_env *env, struct bpf_veri
* SCALARS, as well as any other registers and slots that contribute to
* a tracked state of given registers/stack slots, depending on specific BPF
* assembly instructions (see backtrack_insns() for exact instruction handling
- * logic). This backtracking relies on recorded jmp_history and is able to
+ * logic). This backtracking relies on recorded insn_hist and is able to
* traverse entire chain of parent states. This process ends only when all the
* necessary registers/slots and their transitive dependencies are marked as
* precise.
@@ -4186,39 +4688,15 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
for (;;) {
DECLARE_BITMAP(mask, 64);
- u32 history = st->jmp_history_cnt;
- struct bpf_jmp_history_entry *hist;
+ u32 hist_start = st->insn_hist_start;
+ u32 hist_end = st->insn_hist_end;
+ struct bpf_insn_hist_entry *hist;
if (env->log.level & BPF_LOG_LEVEL2) {
verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n",
bt->frame, last_idx, first_idx, subseq_idx);
}
- /* If some register with scalar ID is marked as precise,
- * make sure that all registers sharing this ID are also precise.
- * This is needed to estimate effect of find_equal_scalars().
- * Do this at the last instruction of each state,
- * bpf_reg_state::id fields are valid for these instructions.
- *
- * Allows to track precision in situation like below:
- *
- * r2 = unknown value
- * ...
- * --- state #0 ---
- * ...
- * r1 = r2 // r1 and r2 now share the same ID
- * ...
- * --- state #1 {r1.id = A, r2.id = A} ---
- * ...
- * if (r2 > 10) goto exit; // find_equal_scalars() assigns range to r1
- * ...
- * --- state #2 {r1.id = A, r2.id = A} ---
- * r3 = r10
- * r3 += r1 // need to mark both r1 and r2
- */
- if (mark_precise_scalar_ids(env, st))
- return -EFAULT;
-
if (last_idx < 0) {
/* we are at the entry into subprog, which
* is expected for global funcs, but only if
@@ -4240,9 +4718,8 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
return 0;
}
- verbose(env, "BUG backtracking func entry subprog %d reg_mask %x stack_mask %llx\n",
- st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt));
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(env, "backtracking func entry subprog %d reg_mask %x stack_mask %llx",
+ st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt));
return -EFAULT;
}
@@ -4251,7 +4728,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
err = 0;
skip_first = false;
} else {
- hist = get_jmp_hist_entry(st, history, i);
+ hist = get_insn_hist_entry(env, hist_start, hist_end, i);
err = backtrack_insn(env, i, subseq_idx, hist, bt);
}
if (err == -ENOTSUPP) {
@@ -4268,7 +4745,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
*/
return 0;
subseq_idx = i;
- i = get_prev_insn_idx(st, i, &history);
+ i = get_prev_insn_idx(env, st, i, hist_start, &hist_end);
if (i == -ENOENT)
break;
if (i >= env->prog->len) {
@@ -4278,8 +4755,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
* It means the backtracking missed the spot where
* particular register was initialized with a constant.
*/
- verbose(env, "BUG backtracking idx %d\n", i);
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(env, "backtracking idx %d", i);
return -EFAULT;
}
}
@@ -4304,12 +4780,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
for_each_set_bit(i, mask, 64) {
- if (i >= func->allocated_stack / BPF_REG_SIZE) {
- verbose(env, "BUG backtracking (stack slot %d, total slots %d)\n",
- i, func->allocated_stack / BPF_REG_SIZE);
- WARN_ONCE(1, "verifier backtracking bug (stack slot out of bounds)");
+ if (verifier_bug_if(i >= func->allocated_stack / BPF_REG_SIZE,
+ env, "stack slot %d, total slots %d",
+ i, func->allocated_stack / BPF_REG_SIZE))
return -EFAULT;
- }
if (!is_spilled_scalar_reg(&func->stack[i])) {
bt_clear_frame_slot(bt, fr, i);
@@ -4329,7 +4803,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
bt_frame_stack_mask(bt, fr));
verbose(env, "stack=%s: ", env->tmp_str_buf);
- print_verifier_state(env, func, true);
+ print_verifier_state(env, st, fr, true);
}
}
@@ -4424,10 +4898,22 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
static void assign_scalar_id_before_mov(struct bpf_verifier_env *env,
struct bpf_reg_state *src_reg)
{
- if (src_reg->type == SCALAR_VALUE && !src_reg->id &&
- !tnum_is_const(src_reg->var_off))
+ if (src_reg->type != SCALAR_VALUE)
+ return;
+
+ if (src_reg->id & BPF_ADD_CONST) {
+ /*
+ * The verifier is processing rX = rY insn and
+ * rY->id has special linked register already.
+ * Cleared it, since multiple rX += const are not supported.
+ */
+ src_reg->id = 0;
+ src_reg->off = 0;
+ }
+
+ if (!src_reg->id && !tnum_is_const(src_reg->var_off))
/* Ensure that src_reg has a valid ID that will be copied to
- * dst_reg and then will be used by find_equal_scalars() to
+ * dst_reg and then will be used by sync_linked_regs() to
* propagate min/max range.
*/
src_reg->id = ++env->id_gen;
@@ -4473,6 +4959,31 @@ static int get_reg_width(struct bpf_reg_state *reg)
return fls64(reg->umax_value);
}
+/* See comment for mark_fastcall_pattern_for_call() */
+static void check_fastcall_stack_contract(struct bpf_verifier_env *env,
+ struct bpf_func_state *state, int insn_idx, int off)
+{
+ struct bpf_subprog_info *subprog = &env->subprog_info[state->subprogno];
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
+ int i;
+
+ if (subprog->fastcall_stack_off <= off || aux[insn_idx].fastcall_pattern)
+ return;
+ /* access to the region [max_stack_depth .. fastcall_stack_off)
+ * from something that is not a part of the fastcall pattern,
+ * disable fastcall rewrites for current subprogram by setting
+ * fastcall_stack_off to a value smaller than any possible offset.
+ */
+ subprog->fastcall_stack_off = S16_MIN;
+ /* reset fastcall aux flags within subprogram,
+ * happens at most once per subprogram
+ */
+ for (i = subprog->start; i < (subprog + 1)->start; ++i) {
+ aux[i].fastcall_spills_num = 0;
+ aux[i].fastcall_pattern = 0;
+ }
+}
+
/* check_stack_{read,write}_fixed_off functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
@@ -4493,6 +5004,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
*/
if (!env->allow_ptr_leaks &&
is_spilled_reg(&state->stack[spi]) &&
+ !is_spilled_scalar_reg(&state->stack[spi]) &&
size != BPF_REG_SIZE) {
verbose(env, "attempt to corrupt spilled pointer on stack\n");
return -EACCES;
@@ -4521,6 +5033,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
if (err)
return err;
+ check_fastcall_stack_contract(env, state, insn_idx, off);
mark_stack_slot_scratched(env, spi);
if (reg && !(off % BPF_REG_SIZE) && reg->type == SCALAR_VALUE && env->bpf_capable) {
bool reg_value_fits;
@@ -4535,11 +5048,12 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
state->stack[spi].spilled_ptr.id = 0;
} else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) &&
env->bpf_capable) {
- struct bpf_reg_state fake_reg = {};
+ struct bpf_reg_state *tmp_reg = &env->fake_reg[0];
- __mark_reg_known(&fake_reg, insn->imm);
- fake_reg.type = SCALAR_VALUE;
- save_register_state(env, state, spi, &fake_reg, size);
+ memset(tmp_reg, 0, sizeof(*tmp_reg));
+ __mark_reg_known(tmp_reg, insn->imm);
+ tmp_reg->type = SCALAR_VALUE;
+ save_register_state(env, state, spi, tmp_reg, size);
} else if (reg && is_spillable_regtype(reg->type)) {
/* register containing pointer is being spilled into stack */
if (size != BPF_REG_SIZE) {
@@ -4595,7 +5109,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
}
if (insn_flags)
- return push_jmp_history(env, env->cur_state, insn_flags);
+ return push_insn_history(env, env->cur_state, insn_flags, 0);
return 0;
}
@@ -4654,6 +5168,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
return err;
}
+ check_fastcall_stack_contract(env, state, insn_idx, min_off);
/* Variable offset writes destroy any spilled pointers in range. */
for (i = min_off; i < max_off; i++) {
u8 new_type, *stype;
@@ -4792,6 +5307,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
reg = &reg_state->stack[spi].spilled_ptr;
mark_stack_slot_scratched(env, spi);
+ check_fastcall_stack_contract(env, state, env->insn_idx, off);
if (is_spilled_reg(&reg_state->stack[spi])) {
u8 spill_size = 1;
@@ -4900,7 +5416,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
insn_flags = 0; /* we are not restoring spilled register */
}
if (insn_flags)
- return push_jmp_history(env, env->cur_state, insn_flags);
+ return push_insn_history(env, env->cur_state, insn_flags, 0);
return 0;
}
@@ -4912,7 +5428,7 @@ enum bpf_access_src {
static int check_stack_range_initialized(struct bpf_verifier_env *env,
int regno, int off, int access_size,
bool zero_size_allowed,
- enum bpf_access_src type,
+ enum bpf_access_type type,
struct bpf_call_arg_meta *meta);
static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
@@ -4945,13 +5461,14 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env,
/* Note that we pass a NULL meta, so raw access will not be permitted.
*/
err = check_stack_range_initialized(env, ptr_regno, off, size,
- false, ACCESS_DIRECT, NULL);
+ false, BPF_READ, NULL);
if (err)
return err;
min_off = reg->smin_value + off;
max_off = reg->smax_value + off;
mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno);
+ check_fastcall_stack_contract(env, ptr_state, env->insn_idx, min_off);
return 0;
}
@@ -5274,7 +5791,8 @@ bad_type:
static bool in_sleepable(struct bpf_verifier_env *env)
{
- return env->prog->sleepable;
+ return env->prog->sleepable ||
+ (env->cur_state && env->cur_state->in_sleepable);
}
/* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock()
@@ -5283,13 +5801,15 @@ static bool in_sleepable(struct bpf_verifier_env *env)
static bool in_rcu_cs(struct bpf_verifier_env *env)
{
return env->cur_state->active_rcu_lock ||
- env->cur_state->active_lock.ptr ||
+ env->cur_state->active_locks ||
!in_sleepable(env);
}
/* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */
BTF_SET_START(rcu_protected_types)
+#ifdef CONFIG_NET
BTF_ID(struct, prog_test_ref_kfunc)
+#endif
#ifdef CONFIG_CGROUPS
BTF_ID(struct, cgroup)
#endif
@@ -5297,6 +5817,9 @@ BTF_ID(struct, cgroup)
BTF_ID(struct, bpf_cpumask)
#endif
BTF_ID(struct, task_struct)
+#ifdef CONFIG_CRYPTO
+BTF_ID(struct, bpf_crypto_ctx)
+#endif
BTF_SET_END(rcu_protected_types)
static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
@@ -5350,6 +5873,22 @@ static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr
return ret;
}
+static int mark_uptr_ld_reg(struct bpf_verifier_env *env, u32 regno,
+ struct btf_field *field)
+{
+ struct bpf_reg_state *reg;
+ const struct btf_type *t;
+
+ t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id);
+ mark_reg_known_zero(env, cur_regs(env), regno);
+ reg = reg_state(env, regno);
+ reg->type = PTR_TO_MEM | PTR_MAYBE_NULL;
+ reg->mem_size = t->size;
+ reg->id = ++env->id_gen;
+
+ return 0;
+}
+
static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
int value_regno, int insn_idx,
struct btf_field *kptr_field)
@@ -5378,16 +5917,20 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
verbose(env, "store to referenced kptr disallowed\n");
return -EACCES;
}
+ if (class != BPF_LDX && kptr_field->type == BPF_UPTR) {
+ verbose(env, "store to uptr disallowed\n");
+ return -EACCES;
+ }
if (class == BPF_LDX) {
- val_reg = reg_state(env, value_regno);
+ if (kptr_field->type == BPF_UPTR)
+ return mark_uptr_ld_reg(env, value_regno, kptr_field);
+
/* We can simply mark the value_regno receiving the pointer
* value from map as PTR_TO_BTF_ID, with the correct type.
*/
mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf,
kptr_field->kptr.btf_id, btf_ld_kptr_type(env, kptr_field));
- /* For mark_ptr_or_null_reg */
- val_reg->id = ++env->id_gen;
} else if (class == BPF_STX) {
val_reg = reg_state(env, value_regno);
if (!register_is_null(val_reg) &&
@@ -5434,27 +5977,32 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
* this program. To check that [x1, x2) overlaps with [y1, y2),
* it is sufficient to check x1 < y2 && y1 < x2.
*/
- if (reg->smin_value + off < p + btf_field_type_size(field->type) &&
+ if (reg->smin_value + off < p + field->size &&
p < reg->umax_value + off + size) {
switch (field->type) {
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
if (src != ACCESS_DIRECT) {
- verbose(env, "kptr cannot be accessed indirectly by helper\n");
+ verbose(env, "%s cannot be accessed indirectly by helper\n",
+ btf_field_type_name(field->type));
return -EACCES;
}
if (!tnum_is_const(reg->var_off)) {
- verbose(env, "kptr access cannot have variable offset\n");
+ verbose(env, "%s access cannot have variable offset\n",
+ btf_field_type_name(field->type));
return -EACCES;
}
if (p != off + reg->var_off.value) {
- verbose(env, "kptr access misaligned expected=%u off=%llu\n",
+ verbose(env, "%s access misaligned expected=%u off=%llu\n",
+ btf_field_type_name(field->type),
p, off + reg->var_off.value);
return -EACCES;
}
if (size != bpf_size_to_bytes(BPF_DW)) {
- verbose(env, "kptr access size must be BPF_DW\n");
+ verbose(env, "%s access size must be BPF_DW\n",
+ btf_field_type_name(field->type));
return -EACCES;
}
break;
@@ -5556,16 +6104,10 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
/* check access to 'struct bpf_context' fields. Supports fixed offsets only */
static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
- enum bpf_access_type t, enum bpf_reg_type *reg_type,
- struct btf **btf, u32 *btf_id)
+ enum bpf_access_type t, struct bpf_insn_access_aux *info)
{
- struct bpf_insn_access_aux info = {
- .reg_type = *reg_type,
- .log = &env->log,
- };
-
if (env->ops->is_valid_access &&
- env->ops->is_valid_access(off, size, t, env->prog, &info)) {
+ env->ops->is_valid_access(off, size, t, env->prog, info)) {
/* A non zero info.ctx_field_size indicates that this field is a
* candidate for later verifier transformation to load the whole
* field and then apply a mask when accessed with a narrower
@@ -5573,13 +6115,15 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
* will only allow for whole field access and rejects any other
* type of narrower access.
*/
- *reg_type = info.reg_type;
-
- if (base_type(*reg_type) == PTR_TO_BTF_ID) {
- *btf = info.btf;
- *btf_id = info.btf_id;
+ if (base_type(info->reg_type) == PTR_TO_BTF_ID) {
+ if (info->ref_obj_id &&
+ !find_reference_state(env->cur_state, info->ref_obj_id)) {
+ verbose(env, "invalid bpf_context access off=%d. Reference may already be released\n",
+ off);
+ return -EACCES;
+ }
} else {
- env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
+ env->insn_aux_data[insn_idx].ctx_field_size = info->ctx_field_size;
}
/* remember the offset of last byte accessed in ctx */
if (env->prog->aux->max_ctx_offset < off + size)
@@ -5689,6 +6233,26 @@ static bool is_arena_reg(struct bpf_verifier_env *env, int regno)
return reg->type == PTR_TO_ARENA;
}
+/* Return false if @regno contains a pointer whose type isn't supported for
+ * atomic instruction @insn.
+ */
+static bool atomic_ptr_type_ok(struct bpf_verifier_env *env, int regno,
+ struct bpf_insn *insn)
+{
+ if (is_ctx_reg(env, regno))
+ return false;
+ if (is_pkt_reg(env, regno))
+ return false;
+ if (is_flow_key_reg(env, regno))
+ return false;
+ if (is_sk_reg(env, regno))
+ return false;
+ if (is_arena_reg(env, regno))
+ return bpf_jit_supports_insn(insn, true);
+
+ return true;
+}
+
static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
#ifdef CONFIG_NET
[PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
@@ -5705,7 +6269,8 @@ static bool is_trusted_reg(const struct bpf_reg_state *reg)
return true;
/* Types listed in the reg2btf_ids are always trusted */
- if (reg2btf_ids[base_type(reg->type)])
+ if (reg2btf_ids[base_type(reg->type)] &&
+ !bpf_type_has_unsafe_modifiers(reg->type))
return true;
/* If a register is not referenced, it is trusted if it has the
@@ -5845,6 +6410,34 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
strict);
}
+static enum priv_stack_mode bpf_enable_priv_stack(struct bpf_prog *prog)
+{
+ if (!bpf_jit_supports_private_stack())
+ return NO_PRIV_STACK;
+
+ /* bpf_prog_check_recur() checks all prog types that use bpf trampoline
+ * while kprobe/tp/perf_event/raw_tp don't use trampoline hence checked
+ * explicitly.
+ */
+ switch (prog->type) {
+ case BPF_PROG_TYPE_KPROBE:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ return PRIV_STACK_ADAPTIVE;
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_LSM:
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ if (prog->aux->priv_stack_requested || bpf_prog_check_recur(prog))
+ return PRIV_STACK_ADAPTIVE;
+ fallthrough;
+ default:
+ break;
+ }
+
+ return NO_PRIV_STACK;
+}
+
static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
{
if (env->prog->jit_requested)
@@ -5862,17 +6455,20 @@ static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
* Since recursion is prevented by check_cfg() this algorithm
* only needs a local stack of MAX_CALL_FRAMES to remember callsites
*/
-static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
+static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
+ bool priv_stack_supported)
{
struct bpf_subprog_info *subprog = env->subprog_info;
struct bpf_insn *insn = env->prog->insnsi;
- int depth = 0, frame = 0, i, subprog_end;
+ int depth = 0, frame = 0, i, subprog_end, subprog_depth;
bool tail_call_reachable = false;
int ret_insn[MAX_CALL_FRAMES];
int ret_prog[MAX_CALL_FRAMES];
int j;
i = subprog[idx].start;
+ if (!priv_stack_supported)
+ subprog[idx].priv_stack_mode = NO_PRIV_STACK;
process_func:
/* protect against potential stack overflow that might happen when
* bpf2bpf calls get combined with tailcalls. Limit the caller's stack
@@ -5899,11 +6495,31 @@ process_func:
depth);
return -EACCES;
}
- depth += round_up_stack_depth(env, subprog[idx].stack_depth);
- if (depth > MAX_BPF_STACK) {
- verbose(env, "combined stack size of %d calls is %d. Too large\n",
- frame + 1, depth);
- return -EACCES;
+
+ subprog_depth = round_up_stack_depth(env, subprog[idx].stack_depth);
+ if (priv_stack_supported) {
+ /* Request private stack support only if the subprog stack
+ * depth is no less than BPF_PRIV_STACK_MIN_SIZE. This is to
+ * avoid jit penalty if the stack usage is small.
+ */
+ if (subprog[idx].priv_stack_mode == PRIV_STACK_UNKNOWN &&
+ subprog_depth >= BPF_PRIV_STACK_MIN_SIZE)
+ subprog[idx].priv_stack_mode = PRIV_STACK_ADAPTIVE;
+ }
+
+ if (subprog[idx].priv_stack_mode == PRIV_STACK_ADAPTIVE) {
+ if (subprog_depth > MAX_BPF_STACK) {
+ verbose(env, "stack size of subprog %d is %d. Too large\n",
+ idx, subprog_depth);
+ return -EACCES;
+ }
+ } else {
+ depth += subprog_depth;
+ if (depth > MAX_BPF_STACK) {
+ verbose(env, "combined stack size of %d calls is %d. Too large\n",
+ frame + 1, depth);
+ return -EACCES;
+ }
}
continue_func:
subprog_end = subprog[idx + 1].start;
@@ -5940,26 +6556,25 @@ continue_func:
/* find the callee */
next_insn = i + insn[i].imm + 1;
sidx = find_subprog(env, next_insn);
- if (sidx < 0) {
- WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
- next_insn);
+ if (verifier_bug_if(sidx < 0, env, "callee not found at insn %d", next_insn))
return -EFAULT;
- }
if (subprog[sidx].is_async_cb) {
if (subprog[sidx].has_tail_call) {
- verbose(env, "verifier bug. subprog has tail_call and async cb\n");
+ verifier_bug(env, "subprog has tail_call and async cb");
return -EFAULT;
}
/* async callbacks don't increase bpf prog stack size unless called directly */
if (!bpf_pseudo_call(insn + i))
continue;
if (subprog[sidx].is_exception_cb) {
- verbose(env, "insn %d cannot call exception cb directly\n", i);
+ verbose(env, "insn %d cannot call exception cb directly", i);
return -EINVAL;
}
}
i = next_insn;
idx = sidx;
+ if (!priv_stack_supported)
+ subprog[idx].priv_stack_mode = NO_PRIV_STACK;
if (subprog[idx].has_tail_call)
tail_call_reachable = true;
@@ -5993,7 +6608,8 @@ continue_func:
*/
if (frame == 0)
return 0;
- depth -= round_up_stack_depth(env, subprog[idx].stack_depth);
+ if (subprog[idx].priv_stack_mode != PRIV_STACK_ADAPTIVE)
+ depth -= round_up_stack_depth(env, subprog[idx].stack_depth);
frame--;
i = ret_insn[frame];
idx = ret_prog[frame];
@@ -6002,17 +6618,45 @@ continue_func:
static int check_max_stack_depth(struct bpf_verifier_env *env)
{
+ enum priv_stack_mode priv_stack_mode = PRIV_STACK_UNKNOWN;
struct bpf_subprog_info *si = env->subprog_info;
+ bool priv_stack_supported;
int ret;
for (int i = 0; i < env->subprog_cnt; i++) {
+ if (si[i].has_tail_call) {
+ priv_stack_mode = NO_PRIV_STACK;
+ break;
+ }
+ }
+
+ if (priv_stack_mode == PRIV_STACK_UNKNOWN)
+ priv_stack_mode = bpf_enable_priv_stack(env->prog);
+
+ /* All async_cb subprogs use normal kernel stack. If a particular
+ * subprog appears in both main prog and async_cb subtree, that
+ * subprog will use normal kernel stack to avoid potential nesting.
+ * The reverse subprog traversal ensures when main prog subtree is
+ * checked, the subprogs appearing in async_cb subtrees are already
+ * marked as using normal kernel stack, so stack size checking can
+ * be done properly.
+ */
+ for (int i = env->subprog_cnt - 1; i >= 0; i--) {
if (!i || si[i].is_async_cb) {
- ret = check_max_stack_depth_subprog(env, i);
+ priv_stack_supported = !i && priv_stack_mode == PRIV_STACK_ADAPTIVE;
+ ret = check_max_stack_depth_subprog(env, i, priv_stack_supported);
if (ret < 0)
return ret;
}
- continue;
}
+
+ for (int i = 0; i < env->subprog_cnt; i++) {
+ if (si[i].priv_stack_mode == PRIV_STACK_ADAPTIVE) {
+ env->prog->aux->jits_use_priv_stack = true;
+ break;
+ }
+ }
+
return 0;
}
@@ -6023,11 +6667,8 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env,
int start = idx + insn->imm + 1, subprog;
subprog = find_subprog(env, start);
- if (subprog < 0) {
- WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
- start);
+ if (verifier_bug_if(subprog < 0, env, "get stack depth: no program at insn %d", start))
return -EFAULT;
- }
return env->subprog_info[subprog].stack_depth;
}
#endif
@@ -6196,10 +6837,10 @@ static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size)
/* both of s64_max/s64_min positive or negative */
if ((s64_max >= 0) == (s64_min >= 0)) {
- reg->smin_value = reg->s32_min_value = s64_min;
- reg->smax_value = reg->s32_max_value = s64_max;
- reg->umin_value = reg->u32_min_value = s64_min;
- reg->umax_value = reg->u32_max_value = s64_max;
+ reg->s32_min_value = reg->smin_value = s64_min;
+ reg->s32_max_value = reg->smax_value = s64_max;
+ reg->u32_min_value = reg->umin_value = s64_min;
+ reg->u32_max_value = reg->umax_value = s64_max;
reg->var_off = tnum_range(s64_min, s64_max);
return;
}
@@ -6220,6 +6861,7 @@ static void set_sext32_default_val(struct bpf_reg_state *reg, int size)
}
reg->u32_min_value = 0;
reg->u32_max_value = U32_MAX;
+ reg->var_off = tnum_subreg(tnum_unknown);
}
static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size)
@@ -6264,6 +6906,7 @@ static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size)
reg->s32_max_value = s32_max;
reg->u32_min_value = (u32)s32_min;
reg->u32_max_value = (u32)s32_max;
+ reg->var_off = tnum_subreg(tnum_range(s32_min, s32_max));
return;
}
@@ -6325,6 +6968,7 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val,
#define BTF_TYPE_SAFE_RCU(__type) __PASTE(__type, __safe_rcu)
#define BTF_TYPE_SAFE_RCU_OR_NULL(__type) __PASTE(__type, __safe_rcu_or_null)
#define BTF_TYPE_SAFE_TRUSTED(__type) __PASTE(__type, __safe_trusted)
+#define BTF_TYPE_SAFE_TRUSTED_OR_NULL(__type) __PASTE(__type, __safe_trusted_or_null)
/*
* Allow list few fields as RCU trusted or full trusted.
@@ -6388,7 +7032,7 @@ BTF_TYPE_SAFE_TRUSTED(struct dentry) {
struct inode *d_inode;
};
-BTF_TYPE_SAFE_TRUSTED(struct socket) {
+BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket) {
struct sock *sk;
};
@@ -6423,11 +7067,20 @@ static bool type_is_trusted(struct bpf_verifier_env *env,
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm));
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file));
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry));
- BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket));
return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted");
}
+static bool type_is_trusted_or_null(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id)
+{
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket));
+
+ return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id,
+ "__safe_trusted_or_null");
+}
+
static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
struct bpf_reg_state *regs,
int regno, int off, int size,
@@ -6536,6 +7189,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
*/
if (type_is_trusted(env, reg, field_name, btf_id)) {
flag |= PTR_TRUSTED;
+ } else if (type_is_trusted_or_null(env, reg, field_name, btf_id)) {
+ flag |= PTR_TRUSTED | PTR_MAYBE_NULL;
} else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) {
if (type_is_rcu(env, reg, field_name, btf_id)) {
/* ignore __rcu tag and mark it MEM_RCU */
@@ -6667,7 +7322,7 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env,
static int check_stack_access_within_bounds(
struct bpf_verifier_env *env,
int regno, int off, int access_size,
- enum bpf_access_src src, enum bpf_access_type type)
+ enum bpf_access_type type)
{
struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = regs + regno;
@@ -6676,10 +7331,7 @@ static int check_stack_access_within_bounds(
int err;
char *err_extra;
- if (src == ACCESS_HELPER)
- /* We don't know if helpers are reading or writing (or both). */
- err_extra = " indirect access to";
- else if (type == BPF_READ)
+ if (type == BPF_READ)
err_extra = " read from";
else
err_extra = " write to";
@@ -6727,6 +7379,17 @@ static int check_stack_access_within_bounds(
return grow_stack_state(env, state, -min_off /* size */);
}
+static bool get_func_retval_range(struct bpf_prog *prog,
+ struct bpf_retval_range *range)
+{
+ if (prog->type == BPF_PROG_TYPE_LSM &&
+ prog->expected_attach_type == BPF_LSM_MAC &&
+ !bpf_lsm_get_retval_range(prog, range)) {
+ return true;
+ }
+ return false;
+}
+
/* check whether memory at (regno + off) is accessible for t = (read | write)
* if t==write, value_regno is a register which value is stored into memory
* if t==read, value_regno is a register which will receive the value from memory
@@ -6781,7 +7444,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
return err;
if (tnum_is_const(reg->var_off))
kptr_field = btf_record_find(reg->map_ptr->record,
- off + reg->var_off.value, BPF_KPTR);
+ off + reg->var_off.value, BPF_KPTR | BPF_UPTR);
if (kptr_field) {
err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field);
} else if (t == BPF_READ && value_regno >= 0) {
@@ -6831,9 +7494,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem))
mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
- enum bpf_reg_type reg_type = SCALAR_VALUE;
- struct btf *btf = NULL;
- u32 btf_id = 0;
+ struct bpf_retval_range range;
+ struct bpf_insn_access_aux info = {
+ .reg_type = SCALAR_VALUE,
+ .is_ldsx = is_ldsx,
+ .log = &env->log,
+ };
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
@@ -6845,8 +7511,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (err < 0)
return err;
- err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf,
- &btf_id);
+ err = check_ctx_access(env, insn_idx, off, size, t, &info);
if (err)
verbose_linfo(env, insn_idx, "; ");
if (!err && t == BPF_READ && value_regno >= 0) {
@@ -6854,12 +7519,19 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
* PTR_TO_PACKET[_META,_END]. In the latter
* case, we know the offset is zero.
*/
- if (reg_type == SCALAR_VALUE) {
- mark_reg_unknown(env, regs, value_regno);
+ if (info.reg_type == SCALAR_VALUE) {
+ if (info.is_retval && get_func_retval_range(env->prog, &range)) {
+ err = __mark_reg_s32_range(env, regs, value_regno,
+ range.minval, range.maxval);
+ if (err)
+ return err;
+ } else {
+ mark_reg_unknown(env, regs, value_regno);
+ }
} else {
mark_reg_known_zero(env, regs,
value_regno);
- if (type_may_be_null(reg_type))
+ if (type_may_be_null(info.reg_type))
regs[value_regno].id = ++env->id_gen;
/* A load of ctx field could have different
* actual load size with the one encoded in the
@@ -6867,17 +7539,18 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
* a sub-register.
*/
regs[value_regno].subreg_def = DEF_NOT_SUBREG;
- if (base_type(reg_type) == PTR_TO_BTF_ID) {
- regs[value_regno].btf = btf;
- regs[value_regno].btf_id = btf_id;
+ if (base_type(info.reg_type) == PTR_TO_BTF_ID) {
+ regs[value_regno].btf = info.btf;
+ regs[value_regno].btf_id = info.btf_id;
+ regs[value_regno].ref_obj_id = info.ref_obj_id;
}
}
- regs[value_regno].type = reg_type;
+ regs[value_regno].type = info.reg_type;
}
} else if (reg->type == PTR_TO_STACK) {
/* Basic bounds checks. */
- err = check_stack_access_within_bounds(env, regno, off, size, ACCESS_DIRECT, t);
+ err = check_stack_access_within_bounds(env, regno, off, size, t);
if (err)
return err;
@@ -6972,27 +7645,75 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
return err;
}
-static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
+static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type,
+ bool allow_trust_mismatch);
+
+static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ bool strict_alignment_once, bool is_ldsx,
+ bool allow_trust_mismatch, const char *ctx)
{
- int load_reg;
+ struct bpf_reg_state *regs = cur_regs(env);
+ enum bpf_reg_type src_reg_type;
int err;
- switch (insn->imm) {
- case BPF_ADD:
- case BPF_ADD | BPF_FETCH:
- case BPF_AND:
- case BPF_AND | BPF_FETCH:
- case BPF_OR:
- case BPF_OR | BPF_FETCH:
- case BPF_XOR:
- case BPF_XOR | BPF_FETCH:
- case BPF_XCHG:
- case BPF_CMPXCHG:
- break;
- default:
- verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm);
- return -EINVAL;
- }
+ /* check src operand */
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check dst operand */
+ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
+ if (err)
+ return err;
+
+ src_reg_type = regs[insn->src_reg].type;
+
+ /* Check if (src_reg + off) is readable. The state of dst_reg will be
+ * updated by this call.
+ */
+ err = check_mem_access(env, env->insn_idx, insn->src_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ, insn->dst_reg,
+ strict_alignment_once, is_ldsx);
+ err = err ?: save_aux_ptr_type(env, src_reg_type,
+ allow_trust_mismatch);
+ err = err ?: reg_bounds_sanity_check(env, &regs[insn->dst_reg], ctx);
+
+ return err;
+}
+
+static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ bool strict_alignment_once)
+{
+ struct bpf_reg_state *regs = cur_regs(env);
+ enum bpf_reg_type dst_reg_type;
+ int err;
+
+ /* check src1 operand */
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check src2 operand */
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ dst_reg_type = regs[insn->dst_reg].type;
+
+ /* Check if (dst_reg + off) is writeable. */
+ err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg,
+ strict_alignment_once, false);
+ err = err ?: save_aux_ptr_type(env, dst_reg_type, false);
+
+ return err;
+}
+
+static int check_atomic_rmw(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ int load_reg;
+ int err;
if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) {
verbose(env, "invalid atomic operand size\n");
@@ -7028,11 +7749,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
return -EACCES;
}
- if (is_ctx_reg(env, insn->dst_reg) ||
- is_pkt_reg(env, insn->dst_reg) ||
- is_flow_key_reg(env, insn->dst_reg) ||
- is_sk_reg(env, insn->dst_reg) ||
- is_arena_reg(env, insn->dst_reg)) {
+ if (!atomic_ptr_type_ok(env, insn->dst_reg, insn)) {
verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
insn->dst_reg,
reg_type_str(env, reg_state(env, insn->dst_reg)->type));
@@ -7059,23 +7776,101 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
/* Check whether we can read the memory, with second call for fetch
* case to simulate the register fill.
*/
- err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
+ err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ, -1, true, false);
if (!err && load_reg >= 0)
- err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
- BPF_SIZE(insn->code), BPF_READ, load_reg,
- true, false);
+ err = check_mem_access(env, env->insn_idx, insn->dst_reg,
+ insn->off, BPF_SIZE(insn->code),
+ BPF_READ, load_reg, true, false);
if (err)
return err;
+ if (is_arena_reg(env, insn->dst_reg)) {
+ err = save_aux_ptr_type(env, PTR_TO_ARENA, false);
+ if (err)
+ return err;
+ }
/* Check whether we can write into the same memory. */
- err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
+ err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE, -1, true, false);
if (err)
return err;
return 0;
}
+static int check_atomic_load(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ int err;
+
+ err = check_load_mem(env, insn, true, false, false, "atomic_load");
+ if (err)
+ return err;
+
+ if (!atomic_ptr_type_ok(env, insn->src_reg, insn)) {
+ verbose(env, "BPF_ATOMIC loads from R%d %s is not allowed\n",
+ insn->src_reg,
+ reg_type_str(env, reg_state(env, insn->src_reg)->type));
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+static int check_atomic_store(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ int err;
+
+ err = check_store_reg(env, insn, true);
+ if (err)
+ return err;
+
+ if (!atomic_ptr_type_ok(env, insn->dst_reg, insn)) {
+ verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
+ insn->dst_reg,
+ reg_type_str(env, reg_state(env, insn->dst_reg)->type));
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+ switch (insn->imm) {
+ case BPF_ADD:
+ case BPF_ADD | BPF_FETCH:
+ case BPF_AND:
+ case BPF_AND | BPF_FETCH:
+ case BPF_OR:
+ case BPF_OR | BPF_FETCH:
+ case BPF_XOR:
+ case BPF_XOR | BPF_FETCH:
+ case BPF_XCHG:
+ case BPF_CMPXCHG:
+ return check_atomic_rmw(env, insn);
+ case BPF_LOAD_ACQ:
+ if (BPF_SIZE(insn->code) == BPF_DW && BITS_PER_LONG != 64) {
+ verbose(env,
+ "64-bit load-acquires are only supported on 64-bit arches\n");
+ return -EOPNOTSUPP;
+ }
+ return check_atomic_load(env, insn);
+ case BPF_STORE_REL:
+ if (BPF_SIZE(insn->code) == BPF_DW && BITS_PER_LONG != 64) {
+ verbose(env,
+ "64-bit store-releases are only supported on 64-bit arches\n");
+ return -EOPNOTSUPP;
+ }
+ return check_atomic_store(env, insn);
+ default:
+ verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n",
+ insn->imm);
+ return -EINVAL;
+ }
+}
+
/* When register 'regno' is used to read the stack (either directly or through
* a helper function) make sure that it's within stack boundary and, depending
* on the access type and privileges, that all elements of the stack are
@@ -7089,13 +7884,11 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
static int check_stack_range_initialized(
struct bpf_verifier_env *env, int regno, int off,
int access_size, bool zero_size_allowed,
- enum bpf_access_src type, struct bpf_call_arg_meta *meta)
+ enum bpf_access_type type, struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *reg = reg_state(env, regno);
struct bpf_func_state *state = func(env, reg);
int err, min_off, max_off, i, j, slot, spi;
- char *err_extra = type == ACCESS_HELPER ? " indirect" : "";
- enum bpf_access_type bounds_check_type;
/* Some accesses can write anything into the stack, others are
* read-only.
*/
@@ -7106,18 +7899,10 @@ static int check_stack_range_initialized(
return -EACCES;
}
- if (type == ACCESS_HELPER) {
- /* The bounds checks for writes are more permissive than for
- * reads. However, if raw_mode is not set, we'll do extra
- * checks below.
- */
- bounds_check_type = BPF_WRITE;
+ if (type == BPF_WRITE)
clobber = true;
- } else {
- bounds_check_type = BPF_READ;
- }
- err = check_stack_access_within_bounds(env, regno, off, access_size,
- type, bounds_check_type);
+
+ err = check_stack_access_within_bounds(env, regno, off, access_size, type);
if (err)
return err;
@@ -7134,8 +7919,8 @@ static int check_stack_range_initialized(
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "R%d%s variable offset stack access prohibited for !root, var_off=%s\n",
- regno, err_extra, tn_buf);
+ verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n",
+ regno, tn_buf);
return -EACCES;
}
/* Only initialized buffer on stack is allowed to be accessed
@@ -7188,7 +7973,7 @@ static int check_stack_range_initialized(
slot = -i - 1;
spi = slot / BPF_REG_SIZE;
if (state->allocated_stack <= slot) {
- verbose(env, "verifier bug: allocated_stack too small");
+ verbose(env, "allocated_stack too small\n");
return -EFAULT;
}
@@ -7216,14 +8001,14 @@ static int check_stack_range_initialized(
}
if (tnum_is_const(reg->var_off)) {
- verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n",
- err_extra, regno, min_off, i - min_off, access_size);
+ verbose(env, "invalid read from stack R%d off %d+%d size %d\n",
+ regno, min_off, i - min_off, access_size);
} else {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "invalid%s read from stack R%d var_off %s+%d size %d\n",
- err_extra, regno, tn_buf, i - min_off, access_size);
+ verbose(env, "invalid read from stack R%d var_off %s+%d size %d\n",
+ regno, tn_buf, i - min_off, access_size);
}
return -EACCES;
mark:
@@ -7243,7 +8028,8 @@ mark:
}
static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
- int access_size, bool zero_size_allowed,
+ int access_size, enum bpf_access_type access_type,
+ bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
@@ -7255,7 +8041,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
return check_packet_access(env, regno, reg->off, access_size,
zero_size_allowed);
case PTR_TO_MAP_KEY:
- if (meta && meta->raw_mode) {
+ if (access_type == BPF_WRITE) {
verbose(env, "R%d cannot write into %s\n", regno,
reg_type_str(env, reg->type));
return -EACCES;
@@ -7263,15 +8049,13 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
return check_mem_region_access(env, regno, reg->off, access_size,
reg->map_ptr->key_size, false);
case PTR_TO_MAP_VALUE:
- if (check_map_access_type(env, regno, reg->off, access_size,
- meta && meta->raw_mode ? BPF_WRITE :
- BPF_READ))
+ if (check_map_access_type(env, regno, reg->off, access_size, access_type))
return -EACCES;
return check_map_access(env, regno, reg->off, access_size,
zero_size_allowed, ACCESS_HELPER);
case PTR_TO_MEM:
if (type_is_rdonly_mem(reg->type)) {
- if (meta && meta->raw_mode) {
+ if (access_type == BPF_WRITE) {
verbose(env, "R%d cannot write into %s\n", regno,
reg_type_str(env, reg->type));
return -EACCES;
@@ -7282,7 +8066,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
zero_size_allowed);
case PTR_TO_BUF:
if (type_is_rdonly_mem(reg->type)) {
- if (meta && meta->raw_mode) {
+ if (access_type == BPF_WRITE) {
verbose(env, "R%d cannot write into %s\n", regno,
reg_type_str(env, reg->type));
return -EACCES;
@@ -7299,7 +8083,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
return check_stack_range_initialized(
env,
regno, reg->off, access_size,
- zero_size_allowed, ACCESS_HELPER, meta);
+ zero_size_allowed, access_type, meta);
case PTR_TO_BTF_ID:
return check_ptr_to_btf_access(env, regs, regno, reg->off,
access_size, BPF_READ, -1);
@@ -7310,7 +8094,6 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
* Dynamically check it now.
*/
if (!env->ops->convert_ctx_access) {
- enum bpf_access_type atype = meta && meta->raw_mode ? BPF_WRITE : BPF_READ;
int offset = access_size - 1;
/* Allow zero-byte read from PTR_TO_CTX */
@@ -7318,7 +8101,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
return zero_size_allowed ? 0 : -EACCES;
return check_mem_access(env, env->insn_idx, regno, offset, BPF_B,
- atype, -1, false, false);
+ access_type, -1, false, false);
}
fallthrough;
@@ -7343,6 +8126,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
*/
static int check_mem_size_reg(struct bpf_verifier_env *env,
struct bpf_reg_state *reg, u32 regno,
+ enum bpf_access_type access_type,
bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
{
@@ -7358,15 +8142,12 @@ static int check_mem_size_reg(struct bpf_verifier_env *env,
*/
meta->msize_max_value = reg->umax_value;
- /* The register is SCALAR_VALUE; the access check
- * happens using its boundaries.
+ /* The register is SCALAR_VALUE; the access check happens using
+ * its boundaries. For unprivileged variable accesses, disable
+ * raw mode so that the program is required to initialize all
+ * the memory that the helper could just partially fill up.
*/
if (!tnum_is_const(reg->var_off))
- /* For unprivileged variable accesses, disable raw
- * mode so that the program is required to
- * initialize all the memory that the helper could
- * just partially fill up.
- */
meta = NULL;
if (reg->smin_value < 0) {
@@ -7386,9 +8167,8 @@ static int check_mem_size_reg(struct bpf_verifier_env *env,
regno);
return -EACCES;
}
- err = check_helper_mem_access(env, regno - 1,
- reg->umax_value,
- zero_size_allowed, meta);
+ err = check_helper_mem_access(env, regno - 1, reg->umax_value,
+ access_type, zero_size_allowed, meta);
if (!err)
err = mark_chain_precision(env, regno);
return err;
@@ -7399,13 +8179,11 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg
{
bool may_be_null = type_may_be_null(reg->type);
struct bpf_reg_state saved_reg;
- struct bpf_call_arg_meta meta;
int err;
if (register_is_null(reg))
return 0;
- memset(&meta, 0, sizeof(meta));
/* Assuming that the register contains a value check if the memory
* access is safe. Temporarily save and restore the register's state as
* the conversion shouldn't be visible to a caller.
@@ -7415,10 +8193,8 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg
mark_ptr_not_null_reg(reg);
}
- err = check_helper_mem_access(env, regno, mem_size, true, &meta);
- /* Check access for BPF_WRITE */
- meta.raw_mode = true;
- err = err ?: check_helper_mem_access(env, regno, mem_size, true, &meta);
+ err = check_helper_mem_access(env, regno, mem_size, BPF_READ, true, NULL);
+ err = err ?: check_helper_mem_access(env, regno, mem_size, BPF_WRITE, true, NULL);
if (may_be_null)
*reg = saved_reg;
@@ -7444,16 +8220,21 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg
mark_ptr_not_null_reg(mem_reg);
}
- err = check_mem_size_reg(env, reg, regno, true, &meta);
- /* Check access for BPF_WRITE */
- meta.raw_mode = true;
- err = err ?: check_mem_size_reg(env, reg, regno, true, &meta);
+ err = check_mem_size_reg(env, reg, regno, BPF_READ, true, &meta);
+ err = err ?: check_mem_size_reg(env, reg, regno, BPF_WRITE, true, &meta);
if (may_be_null)
*mem_reg = saved_reg;
+
return err;
}
+enum {
+ PROCESS_SPIN_LOCK = (1 << 0),
+ PROCESS_RES_LOCK = (1 << 1),
+ PROCESS_LOCK_IRQ = (1 << 2),
+};
+
/* Implementation details:
* bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL.
* bpf_obj_new returns PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL.
@@ -7473,32 +8254,36 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg
* Since only one bpf_spin_lock is allowed the checks are simpler than
* reg_is_refcounted() logic. The verifier needs to remember only
* one spin_lock instead of array of acquired_refs.
- * cur_state->active_lock remembers which map value element or allocated
+ * env->cur_state->active_locks remembers which map value element or allocated
* object got locked and clears it after bpf_spin_unlock.
*/
-static int process_spin_lock(struct bpf_verifier_env *env, int regno,
- bool is_lock)
+static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags)
{
+ bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK;
+ const char *lock_str = is_res_lock ? "bpf_res_spin" : "bpf_spin";
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
struct bpf_verifier_state *cur = env->cur_state;
bool is_const = tnum_is_const(reg->var_off);
+ bool is_irq = flags & PROCESS_LOCK_IRQ;
u64 val = reg->var_off.value;
struct bpf_map *map = NULL;
struct btf *btf = NULL;
struct btf_record *rec;
+ u32 spin_lock_off;
+ int err;
if (!is_const) {
verbose(env,
- "R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n",
- regno);
+ "R%d doesn't have constant offset. %s_lock has to be at the constant offset\n",
+ regno, lock_str);
return -EINVAL;
}
if (reg->type == PTR_TO_MAP_VALUE) {
map = reg->map_ptr;
if (!map->btf) {
verbose(env,
- "map '%s' has to have BTF in order to use bpf_spin_lock\n",
- map->name);
+ "map '%s' has to have BTF in order to use %s_lock\n",
+ map->name, lock_str);
return -EINVAL;
}
} else {
@@ -7506,49 +8291,84 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
}
rec = reg_btf_record(reg);
- if (!btf_record_has_field(rec, BPF_SPIN_LOCK)) {
- verbose(env, "%s '%s' has no valid bpf_spin_lock\n", map ? "map" : "local",
- map ? map->name : "kptr");
+ if (!btf_record_has_field(rec, is_res_lock ? BPF_RES_SPIN_LOCK : BPF_SPIN_LOCK)) {
+ verbose(env, "%s '%s' has no valid %s_lock\n", map ? "map" : "local",
+ map ? map->name : "kptr", lock_str);
return -EINVAL;
}
- if (rec->spin_lock_off != val + reg->off) {
- verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock' that is at %d\n",
- val + reg->off, rec->spin_lock_off);
+ spin_lock_off = is_res_lock ? rec->res_spin_lock_off : rec->spin_lock_off;
+ if (spin_lock_off != val + reg->off) {
+ verbose(env, "off %lld doesn't point to 'struct %s_lock' that is at %d\n",
+ val + reg->off, lock_str, spin_lock_off);
return -EINVAL;
}
if (is_lock) {
- if (cur->active_lock.ptr) {
- verbose(env,
- "Locking two bpf_spin_locks are not allowed\n");
- return -EINVAL;
- }
+ void *ptr;
+ int type;
+
if (map)
- cur->active_lock.ptr = map;
+ ptr = map;
+ else
+ ptr = btf;
+
+ if (!is_res_lock && cur->active_locks) {
+ if (find_lock_state(env->cur_state, REF_TYPE_LOCK, 0, NULL)) {
+ verbose(env,
+ "Locking two bpf_spin_locks are not allowed\n");
+ return -EINVAL;
+ }
+ } else if (is_res_lock && cur->active_locks) {
+ if (find_lock_state(env->cur_state, REF_TYPE_RES_LOCK | REF_TYPE_RES_LOCK_IRQ, reg->id, ptr)) {
+ verbose(env, "Acquiring the same lock again, AA deadlock detected\n");
+ return -EINVAL;
+ }
+ }
+
+ if (is_res_lock && is_irq)
+ type = REF_TYPE_RES_LOCK_IRQ;
+ else if (is_res_lock)
+ type = REF_TYPE_RES_LOCK;
else
- cur->active_lock.ptr = btf;
- cur->active_lock.id = reg->id;
+ type = REF_TYPE_LOCK;
+ err = acquire_lock_state(env, env->insn_idx, type, reg->id, ptr);
+ if (err < 0) {
+ verbose(env, "Failed to acquire lock state\n");
+ return err;
+ }
} else {
void *ptr;
+ int type;
if (map)
ptr = map;
else
ptr = btf;
- if (!cur->active_lock.ptr) {
- verbose(env, "bpf_spin_unlock without taking a lock\n");
+ if (!cur->active_locks) {
+ verbose(env, "%s_unlock without taking a lock\n", lock_str);
return -EINVAL;
}
- if (cur->active_lock.ptr != ptr ||
- cur->active_lock.id != reg->id) {
- verbose(env, "bpf_spin_unlock of different lock\n");
+
+ if (is_res_lock && is_irq)
+ type = REF_TYPE_RES_LOCK_IRQ;
+ else if (is_res_lock)
+ type = REF_TYPE_RES_LOCK;
+ else
+ type = REF_TYPE_LOCK;
+ if (!find_lock_state(cur, type, reg->id, ptr)) {
+ verbose(env, "%s_unlock of different lock\n", lock_str);
+ return -EINVAL;
+ }
+ if (reg->id != cur->active_lock_id || ptr != cur->active_lock_ptr) {
+ verbose(env, "%s_unlock cannot be out of order\n", lock_str);
+ return -EINVAL;
+ }
+ if (release_lock_state(cur, type, reg->id, ptr)) {
+ verbose(env, "%s_unlock of different lock\n", lock_str);
return -EINVAL;
}
invalidate_non_owning_refs(env);
-
- cur->active_lock.ptr = NULL;
- cur->active_lock.id = 0;
}
return 0;
}
@@ -7582,7 +8402,7 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
return -EINVAL;
}
if (meta->map_ptr) {
- verbose(env, "verifier bug. Two map pointers in a timer helper\n");
+ verifier_bug(env, "Two map pointers in a timer helper");
return -EFAULT;
}
meta->map_uid = reg->map_uid;
@@ -7590,33 +8410,59 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
return 0;
}
+static int process_wq_func(struct bpf_verifier_env *env, int regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_map *map = reg->map_ptr;
+ u64 val = reg->var_off.value;
+
+ if (map->record->wq_off != val + reg->off) {
+ verbose(env, "off %lld doesn't point to 'struct bpf_wq' that is at %d\n",
+ val + reg->off, map->record->wq_off);
+ return -EINVAL;
+ }
+ meta->map.uid = reg->map_uid;
+ meta->map.ptr = map;
+ return 0;
+}
+
static int process_kptr_func(struct bpf_verifier_env *env, int regno,
struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
- struct bpf_map *map_ptr = reg->map_ptr;
struct btf_field *kptr_field;
+ struct bpf_map *map_ptr;
+ struct btf_record *rec;
u32 kptr_off;
+ if (type_is_ptr_alloc_obj(reg->type)) {
+ rec = reg_btf_record(reg);
+ } else { /* PTR_TO_MAP_VALUE */
+ map_ptr = reg->map_ptr;
+ if (!map_ptr->btf) {
+ verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n",
+ map_ptr->name);
+ return -EINVAL;
+ }
+ rec = map_ptr->record;
+ meta->map_ptr = map_ptr;
+ }
+
if (!tnum_is_const(reg->var_off)) {
verbose(env,
"R%d doesn't have constant offset. kptr has to be at the constant offset\n",
regno);
return -EINVAL;
}
- if (!map_ptr->btf) {
- verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n",
- map_ptr->name);
- return -EINVAL;
- }
- if (!btf_record_has_field(map_ptr->record, BPF_KPTR)) {
- verbose(env, "map '%s' has no valid kptr\n", map_ptr->name);
+
+ if (!btf_record_has_field(rec, BPF_KPTR)) {
+ verbose(env, "R%d has no valid kptr\n", regno);
return -EINVAL;
}
- meta->map_ptr = map_ptr;
kptr_off = reg->off + reg->var_off.value;
- kptr_field = btf_record_find(map_ptr->record, kptr_off, BPF_KPTR);
+ kptr_field = btf_record_find(rec, kptr_off, BPF_KPTR);
if (!kptr_field) {
verbose(env, "off=%d doesn't point to kptr\n", kptr_off);
return -EACCES;
@@ -7660,6 +8506,13 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
int err;
+ if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) {
+ verbose(env,
+ "arg#%d expected pointer to stack or const struct bpf_dynptr\n",
+ regno - 1);
+ return -EINVAL;
+ }
+
/* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an
* ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*):
*/
@@ -7710,7 +8563,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
if (!is_dynptr_reg_valid_init(env, reg)) {
verbose(env,
"Expected an initialized dynptr as arg #%d\n",
- regno);
+ regno - 1);
return -EINVAL;
}
@@ -7718,7 +8571,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) {
verbose(env,
"Expected a dynptr of type %s as arg #%d\n",
- dynptr_type_str(arg_to_dynptr_type(arg_type)), regno);
+ dynptr_type_str(arg_to_dynptr_type(arg_type)), regno - 1);
return -EINVAL;
}
@@ -7754,12 +8607,17 @@ static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta)
return meta->kfunc_flags & KF_ITER_DESTROY;
}
-static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg)
+static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg_idx,
+ const struct btf_param *arg)
{
/* btf_check_iter_kfuncs() guarantees that first argument of any iter
* kfunc is iter state pointer
*/
- return arg == 0 && is_iter_kfunc(meta);
+ if (is_iter_kfunc(meta))
+ return arg_idx == 0;
+
+ /* iter passed as an argument to a generic kfunc */
+ return btf_param_match_suffix(meta->btf, arg, "__iter");
}
static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx,
@@ -7767,21 +8625,32 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
const struct btf_type *t;
- const struct btf_param *arg;
- int spi, err, i, nr_slots;
- u32 btf_id;
+ int spi, err, i, nr_slots, btf_id;
+
+ if (reg->type != PTR_TO_STACK) {
+ verbose(env, "arg#%d expected pointer to an iterator on stack\n", regno - 1);
+ return -EINVAL;
+ }
- /* btf_check_iter_kfuncs() ensures we don't need to validate anything here */
- arg = &btf_params(meta->func_proto)[0];
- t = btf_type_skip_modifiers(meta->btf, arg->type, NULL); /* PTR */
- t = btf_type_skip_modifiers(meta->btf, t->type, &btf_id); /* STRUCT */
+ /* For iter_{new,next,destroy} functions, btf_check_iter_kfuncs()
+ * ensures struct convention, so we wouldn't need to do any BTF
+ * validation here. But given iter state can be passed as a parameter
+ * to any kfunc, if arg has "__iter" suffix, we need to be a bit more
+ * conservative here.
+ */
+ btf_id = btf_check_iter_arg(meta->btf, meta->func_proto, regno - 1);
+ if (btf_id < 0) {
+ verbose(env, "expected valid iter pointer as arg #%d\n", regno - 1);
+ return -EINVAL;
+ }
+ t = btf_type_by_id(meta->btf, btf_id);
nr_slots = t->size / BPF_REG_SIZE;
if (is_iter_new_kfunc(meta)) {
/* bpf_iter_<type>_new() expects pointer to uninit iter state */
if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) {
verbose(env, "expected uninitialized iter_%s as arg #%d\n",
- iter_type_str(meta->btf, btf_id), regno);
+ iter_type_str(meta->btf, btf_id), regno - 1);
return -EINVAL;
}
@@ -7796,14 +8665,16 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
if (err)
return err;
} else {
- /* iter_next() or iter_destroy() expect initialized iter state*/
+ /* iter_next() or iter_destroy(), as well as any kfunc
+ * accepting iter argument, expect initialized iter state
+ */
err = is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots);
switch (err) {
case 0:
break;
case -EINVAL:
verbose(env, "expected an initialized iter_%s as arg #%d\n",
- iter_type_str(meta->btf, btf_id), regno);
+ iter_type_str(meta->btf, btf_id), regno - 1);
return err;
case -EPROTO:
verbose(env, "expected an RCU CS when using %s\n", meta->func_name);
@@ -7844,10 +8715,12 @@ static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env,
{
struct bpf_verifier_state_list *sl;
struct bpf_verifier_state *st;
+ struct list_head *pos, *head;
/* Explored states are pushed in stack order, most recent states come first */
- sl = *explored_state(env, insn_idx);
- for (; sl; sl = sl->next) {
+ head = explored_state(env, insn_idx);
+ list_for_each(pos, head) {
+ sl = container_of(pos, struct bpf_verifier_state_list, node);
/* If st->branches != 0 state is a part of current DFS verification path,
* hence cur & st for a loop.
*/
@@ -7910,6 +8783,15 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env,
return 0;
}
+static struct bpf_reg_state *get_iter_from_state(struct bpf_verifier_state *cur_st,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ int iter_frameno = meta->iter.frameno;
+ int iter_spi = meta->iter.spi;
+
+ return &cur_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+}
+
/* process_iter_next_call() is called when verifier gets to iterator's next
* "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer
* to it as just "iter_next()" in comments below.
@@ -7994,12 +8876,10 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st;
struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr;
struct bpf_reg_state *cur_iter, *queued_iter;
- int iter_frameno = meta->iter.frameno;
- int iter_spi = meta->iter.spi;
BTF_TYPE_EMIT(struct bpf_iter);
- cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+ cur_iter = get_iter_from_state(cur_st, meta);
if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE &&
cur_iter->iter.state != BPF_ITER_STATE_DRAINED) {
@@ -8027,7 +8907,7 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
if (!queued_st)
return -ENOMEM;
- queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+ queued_iter = get_iter_from_state(queued_st, meta);
queued_iter->iter.state = BPF_ITER_STATE_ACTIVE;
queued_iter->iter.depth++;
if (prev_st)
@@ -8051,6 +8931,12 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type)
type == ARG_CONST_SIZE_OR_ZERO;
}
+static bool arg_type_is_raw_mem(enum bpf_arg_type type)
+{
+ return base_type(type) == ARG_PTR_TO_MEM &&
+ type & MEM_UNINIT;
+}
+
static bool arg_type_is_release(enum bpf_arg_type type)
{
return type & OBJ_RELEASE;
@@ -8061,16 +8947,6 @@ static bool arg_type_is_dynptr(enum bpf_arg_type type)
return base_type(type) == ARG_PTR_TO_DYNPTR;
}
-static int int_ptr_type_to_size(enum bpf_arg_type type)
-{
- if (type == ARG_PTR_TO_INT)
- return sizeof(u32);
- else if (type == ARG_PTR_TO_LONG)
- return sizeof(u64);
-
- return -EINVAL;
-}
-
static int resolve_map_arg_type(struct bpf_verifier_env *env,
const struct bpf_call_arg_meta *meta,
enum bpf_arg_type *arg_type)
@@ -8143,16 +9019,6 @@ static const struct bpf_reg_types mem_types = {
},
};
-static const struct bpf_reg_types int_ptr_types = {
- .types = {
- PTR_TO_STACK,
- PTR_TO_PACKET,
- PTR_TO_PACKET_META,
- PTR_TO_MAP_KEY,
- PTR_TO_MAP_VALUE,
- },
-};
-
static const struct bpf_reg_types spin_lock_types = {
.types = {
PTR_TO_MAP_VALUE,
@@ -8183,7 +9049,12 @@ static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
-static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types kptr_xchg_dest_types = {
+ .types = {
+ PTR_TO_MAP_VALUE,
+ PTR_TO_BTF_ID | MEM_ALLOC
+ }
+};
static const struct bpf_reg_types dynptr_types = {
.types = {
PTR_TO_STACK,
@@ -8208,14 +9079,12 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types,
[ARG_PTR_TO_MEM] = &mem_types,
[ARG_PTR_TO_RINGBUF_MEM] = &ringbuf_mem_types,
- [ARG_PTR_TO_INT] = &int_ptr_types,
- [ARG_PTR_TO_LONG] = &int_ptr_types,
[ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types,
[ARG_PTR_TO_FUNC] = &func_ptr_types,
[ARG_PTR_TO_STACK] = &stack_ptr_types,
[ARG_PTR_TO_CONST_STR] = &const_str_ptr_types,
[ARG_PTR_TO_TIMER] = &timer_types,
- [ARG_PTR_TO_KPTR] = &kptr_types,
+ [ARG_KPTR_XCHG_DEST] = &kptr_xchg_dest_types,
[ARG_PTR_TO_DYNPTR] = &dynptr_types,
};
@@ -8254,7 +9123,8 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
if (base_type(arg_type) == ARG_PTR_TO_MEM)
type &= ~DYNPTR_TYPE_FLAG_MASK;
- if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type)) {
+ /* Local kptr types are allowed as the source argument of bpf_kptr_xchg */
+ if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type) && regno == BPF_REG_2) {
type &= ~MEM_ALLOC;
type &= ~MEM_PERCPU;
}
@@ -8347,7 +9217,8 @@ found:
verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
return -EFAULT;
}
- if (meta->func_id == BPF_FUNC_kptr_xchg) {
+ /* Check if local kptr in src arg matches kptr in dst arg */
+ if (meta->func_id == BPF_FUNC_kptr_xchg && regno == BPF_REG_2) {
if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
return -EACCES;
}
@@ -8566,6 +9437,69 @@ static int check_reg_const_str(struct bpf_verifier_env *env,
return 0;
}
+/* Returns constant key value in `value` if possible, else negative error */
+static int get_constant_map_key(struct bpf_verifier_env *env,
+ struct bpf_reg_state *key,
+ u32 key_size,
+ s64 *value)
+{
+ struct bpf_func_state *state = func(env, key);
+ struct bpf_reg_state *reg;
+ int slot, spi, off;
+ int spill_size = 0;
+ int zero_size = 0;
+ int stack_off;
+ int i, err;
+ u8 *stype;
+
+ if (!env->bpf_capable)
+ return -EOPNOTSUPP;
+ if (key->type != PTR_TO_STACK)
+ return -EOPNOTSUPP;
+ if (!tnum_is_const(key->var_off))
+ return -EOPNOTSUPP;
+
+ stack_off = key->off + key->var_off.value;
+ slot = -stack_off - 1;
+ spi = slot / BPF_REG_SIZE;
+ off = slot % BPF_REG_SIZE;
+ stype = state->stack[spi].slot_type;
+
+ /* First handle precisely tracked STACK_ZERO */
+ for (i = off; i >= 0 && stype[i] == STACK_ZERO; i--)
+ zero_size++;
+ if (zero_size >= key_size) {
+ *value = 0;
+ return 0;
+ }
+
+ /* Check that stack contains a scalar spill of expected size */
+ if (!is_spilled_scalar_reg(&state->stack[spi]))
+ return -EOPNOTSUPP;
+ for (i = off; i >= 0 && stype[i] == STACK_SPILL; i--)
+ spill_size++;
+ if (spill_size != key_size)
+ return -EOPNOTSUPP;
+
+ reg = &state->stack[spi].spilled_ptr;
+ if (!tnum_is_const(reg->var_off))
+ /* Stack value not statically known */
+ return -EOPNOTSUPP;
+
+ /* We are relying on a constant value. So mark as precise
+ * to prevent pruning on it.
+ */
+ bt_set_frame_slot(&env->bt, key->frameno, spi);
+ err = mark_chain_precision_batch(env);
+ if (err < 0)
+ return err;
+
+ *value = reg->var_off.value;
+ return 0;
+}
+
+static bool can_elide_value_nullness(enum bpf_map_type type);
+
static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
struct bpf_call_arg_meta *meta,
const struct bpf_func_proto *fn,
@@ -8576,6 +9510,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
enum bpf_arg_type arg_type = fn->arg_type[arg];
enum bpf_reg_type type = reg->type;
u32 *arg_btf_id = NULL;
+ u32 key_size;
int err = 0;
if (arg_type == ARG_DONTCARE)
@@ -8658,7 +9593,7 @@ skip_type_check:
meta->release_regno = regno;
}
- if (reg->ref_obj_id) {
+ if (reg->ref_obj_id && base_type(arg_type) != ARG_KPTR_XCHG_DEST) {
if (meta->ref_obj_id) {
verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
regno, reg->ref_obj_id,
@@ -8709,9 +9644,20 @@ skip_type_check:
verbose(env, "invalid map_ptr to access map->key\n");
return -EACCES;
}
- err = check_helper_mem_access(env, regno,
- meta->map_ptr->key_size, false,
- NULL);
+ key_size = meta->map_ptr->key_size;
+ err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL);
+ if (err)
+ return err;
+ if (can_elide_value_nullness(meta->map_ptr->map_type)) {
+ err = get_constant_map_key(env, reg, key_size, &meta->const_map_key);
+ if (err < 0) {
+ meta->const_map_key = -1;
+ if (err == -EOPNOTSUPP)
+ err = 0;
+ else
+ return err;
+ }
+ }
break;
case ARG_PTR_TO_MAP_VALUE:
if (type_may_be_null(arg_type) && register_is_null(reg))
@@ -8726,9 +9672,9 @@ skip_type_check:
return -EACCES;
}
meta->raw_mode = arg_type & MEM_UNINIT;
- err = check_helper_mem_access(env, regno,
- meta->map_ptr->value_size, false,
- meta);
+ err = check_helper_mem_access(env, regno, meta->map_ptr->value_size,
+ arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ,
+ false, meta);
break;
case ARG_PTR_TO_PERCPU_BTF_ID:
if (!reg->btf_id) {
@@ -8744,11 +9690,11 @@ skip_type_check:
return -EACCES;
}
if (meta->func_id == BPF_FUNC_spin_lock) {
- err = process_spin_lock(env, regno, true);
+ err = process_spin_lock(env, regno, PROCESS_SPIN_LOCK);
if (err)
return err;
} else if (meta->func_id == BPF_FUNC_spin_unlock) {
- err = process_spin_lock(env, regno, false);
+ err = process_spin_lock(env, regno, 0);
if (err)
return err;
} else {
@@ -8770,16 +9716,26 @@ skip_type_check:
*/
meta->raw_mode = arg_type & MEM_UNINIT;
if (arg_type & MEM_FIXED_SIZE) {
- err = check_helper_mem_access(env, regno,
- fn->arg_size[arg], false,
- meta);
+ err = check_helper_mem_access(env, regno, fn->arg_size[arg],
+ arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ,
+ false, meta);
+ if (err)
+ return err;
+ if (arg_type & MEM_ALIGNED)
+ err = check_ptr_alignment(env, reg, 0, fn->arg_size[arg], true);
}
break;
case ARG_CONST_SIZE:
- err = check_mem_size_reg(env, reg, regno, false, meta);
+ err = check_mem_size_reg(env, reg, regno,
+ fn->arg_type[arg - 1] & MEM_WRITE ?
+ BPF_WRITE : BPF_READ,
+ false, meta);
break;
case ARG_CONST_SIZE_OR_ZERO:
- err = check_mem_size_reg(env, reg, regno, true, meta);
+ err = check_mem_size_reg(env, reg, regno,
+ fn->arg_type[arg - 1] & MEM_WRITE ?
+ BPF_WRITE : BPF_READ,
+ true, meta);
break;
case ARG_PTR_TO_DYNPTR:
err = process_dynptr_func(env, regno, insn_idx, arg_type, 0);
@@ -8797,17 +9753,6 @@ skip_type_check:
if (err)
return err;
break;
- case ARG_PTR_TO_INT:
- case ARG_PTR_TO_LONG:
- {
- int size = int_ptr_type_to_size(arg_type);
-
- err = check_helper_mem_access(env, regno, size, false, meta);
- if (err)
- return err;
- err = check_ptr_alignment(env, reg, 0, size, true);
- break;
- }
case ARG_PTR_TO_CONST_STR:
{
err = check_reg_const_str(env, reg, regno);
@@ -8815,7 +9760,7 @@ skip_type_check:
return err;
break;
}
- case ARG_PTR_TO_KPTR:
+ case ARG_KPTR_XCHG_DEST:
err = process_kptr_func(env, regno, meta);
if (err)
return err;
@@ -8830,7 +9775,8 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id)
enum bpf_attach_type eatype = env->prog->expected_attach_type;
enum bpf_prog_type type = resolve_prog_type(env->prog);
- if (func_id != BPF_FUNC_map_update_elem)
+ if (func_id != BPF_FUNC_map_update_elem &&
+ func_id != BPF_FUNC_map_delete_elem)
return false;
/* It's not possible to get access to a locked struct sock in these
@@ -8841,6 +9787,11 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id)
if (eatype == BPF_TRACE_ITER)
return true;
break;
+ case BPF_PROG_TYPE_SOCK_OPS:
+ /* map_update allowed only via dedicated helpers with event type checks */
+ if (func_id == BPF_FUNC_map_delete_elem)
+ return true;
+ break;
case BPF_PROG_TYPE_SOCKET_FILTER:
case BPF_PROG_TYPE_SCHED_CLS:
case BPF_PROG_TYPE_SCHED_ACT:
@@ -8936,7 +9887,6 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_SOCKMAP:
if (func_id != BPF_FUNC_sk_redirect_map &&
func_id != BPF_FUNC_sock_map_update &&
- func_id != BPF_FUNC_map_delete_elem &&
func_id != BPF_FUNC_msg_redirect_map &&
func_id != BPF_FUNC_sk_select_reuseport &&
func_id != BPF_FUNC_map_lookup_elem &&
@@ -8946,7 +9896,6 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_SOCKHASH:
if (func_id != BPF_FUNC_sk_redirect_hash &&
func_id != BPF_FUNC_sock_hash_update &&
- func_id != BPF_FUNC_map_delete_elem &&
func_id != BPF_FUNC_msg_redirect_hash &&
func_id != BPF_FUNC_sk_select_reuseport &&
func_id != BPF_FUNC_map_lookup_elem &&
@@ -9003,7 +9952,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
goto error;
if (env->subprog_cnt > 1 && !allow_tail_call_in_subprogs(env)) {
- verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n");
+ verbose(env, "mixing of tail_calls and bpf-to-bpf calls is not supported\n");
return -EINVAL;
}
break;
@@ -9120,15 +10069,15 @@ static bool check_raw_mode_ok(const struct bpf_func_proto *fn)
{
int count = 0;
- if (fn->arg1_type == ARG_PTR_TO_UNINIT_MEM)
+ if (arg_type_is_raw_mem(fn->arg1_type))
count++;
- if (fn->arg2_type == ARG_PTR_TO_UNINIT_MEM)
+ if (arg_type_is_raw_mem(fn->arg2_type))
count++;
- if (fn->arg3_type == ARG_PTR_TO_UNINIT_MEM)
+ if (arg_type_is_raw_mem(fn->arg3_type))
count++;
- if (fn->arg4_type == ARG_PTR_TO_UNINIT_MEM)
+ if (arg_type_is_raw_mem(fn->arg4_type))
count++;
- if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM)
+ if (arg_type_is_raw_mem(fn->arg5_type))
count++;
/* We only support one arg being in raw mode at the moment,
@@ -9240,21 +10189,38 @@ static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range
reg->range = AT_PKT_END;
}
+static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id)
+{
+ int i;
+
+ for (i = 0; i < state->acquired_refs; i++) {
+ if (state->refs[i].type != REF_TYPE_PTR)
+ continue;
+ if (state->refs[i].id == ref_obj_id) {
+ release_reference_state(state, i);
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
/* The pointer with the specified id has released its reference to kernel
* resources. Identify all copies of the same pointer and clear the reference.
+ *
+ * This is the release function corresponding to acquire_reference(). Idempotent.
*/
-static int release_reference(struct bpf_verifier_env *env,
- int ref_obj_id)
+static int release_reference(struct bpf_verifier_env *env, int ref_obj_id)
{
+ struct bpf_verifier_state *vstate = env->cur_state;
struct bpf_func_state *state;
struct bpf_reg_state *reg;
int err;
- err = release_reference_state(cur_func(env), ref_obj_id);
+ err = release_reference_nomark(vstate, ref_obj_id);
if (err)
return err;
- bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
if (reg->ref_obj_id == ref_obj_id)
mark_reg_invalid(env, reg);
}));
@@ -9308,8 +10274,7 @@ static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int calls
}
if (state->frame[state->curframe + 1]) {
- verbose(env, "verifier bug. Frame %d already allocated\n",
- state->curframe + 1);
+ verifier_bug(env, "Frame %d already allocated", state->curframe + 1);
return -EFAULT;
}
@@ -9328,9 +10293,7 @@ static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int calls
callsite,
state->curframe + 1 /* frameno within this callchain */,
subprog /* subprog number within this prog */);
- /* Transfer references to the callee */
- err = copy_reference_state(callee, caller);
- err = err ?: set_callee_state_cb(env, caller, callee, callsite);
+ err = set_callee_state_cb(env, caller, callee, callsite);
if (err)
goto err_out;
@@ -9405,6 +10368,10 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
return -EINVAL;
}
} else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
+ ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_DYNPTR);
+ if (ret)
+ return ret;
+
ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0);
if (ret)
return ret;
@@ -9421,8 +10388,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
if (err)
return err;
} else {
- bpf_log(log, "verifier bug: unrecognized arg#%d type %d\n",
- i, arg->arg_type);
+ verifier_bug(env, "unrecognized arg#%d type %d", i, arg->arg_type);
return -EFAULT;
}
}
@@ -9484,24 +10450,25 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins
*/
env->subprog_info[subprog].is_cb = true;
if (bpf_pseudo_kfunc_call(insn) &&
- !is_sync_callback_calling_kfunc(insn->imm)) {
- verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
- func_id_name(insn->imm), insn->imm);
+ !is_callback_calling_kfunc(insn->imm)) {
+ verifier_bug(env, "kfunc %s#%d not marked as callback-calling",
+ func_id_name(insn->imm), insn->imm);
return -EFAULT;
} else if (!bpf_pseudo_kfunc_call(insn) &&
!is_callback_calling_function(insn->imm)) { /* helper */
- verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n",
- func_id_name(insn->imm), insn->imm);
+ verifier_bug(env, "helper %s#%d not marked as callback-calling",
+ func_id_name(insn->imm), insn->imm);
return -EFAULT;
}
if (is_async_callback_calling_insn(insn)) {
struct bpf_verifier_state *async_cb;
- /* there is no real recursion here. timer callbacks are async */
+ /* there is no real recursion here. timer and workqueue callbacks are async */
env->subprog_info[subprog].is_async_cb = true;
async_cb = push_async_cb(env, env->subprog_info[subprog].start,
- insn_idx, subprog);
+ insn_idx, subprog,
+ is_bpf_wq_set_callback_impl_kfunc(insn->imm));
if (!async_cb)
return -EFAULT;
callee = async_cb->frame[0];
@@ -9542,10 +10509,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
target_insn = *insn_idx + insn->imm + 1;
subprog = find_subprog(env, target_insn);
- if (subprog < 0) {
- verbose(env, "verifier bug. No program starts at insn %d\n", target_insn);
+ if (verifier_bug_if(subprog < 0, env, "target of func call at insn %d is not a program",
+ target_insn))
return -EFAULT;
- }
caller = state->frame[state->curframe];
err = btf_check_subprog_call(env, subprog, caller->regs);
@@ -9554,13 +10520,21 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (subprog_is_global(env, subprog)) {
const char *sub_name = subprog_name(env, subprog);
- /* Only global subprogs cannot be called with a lock held. */
- if (env->cur_state->active_lock.ptr) {
+ if (env->cur_state->active_locks) {
verbose(env, "global function calls are not allowed while holding a lock,\n"
"use static function instead\n");
return -EINVAL;
}
+ if (env->subprog_info[subprog].might_sleep &&
+ (env->cur_state->active_rcu_lock || env->cur_state->active_preempt_locks ||
+ env->cur_state->active_irq_id || !in_sleepable(env))) {
+ verbose(env, "global functions that may sleep are not allowed in non-sleepable context,\n"
+ "i.e., in a RCU/IRQ/preempt-disabled section, or in\n"
+ "a non-sleepable BPF program context\n");
+ return -EINVAL;
+ }
+
if (err) {
verbose(env, "Caller passes invalid args into func#%d ('%s')\n",
subprog, sub_name);
@@ -9569,6 +10543,8 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
verbose(env, "Func#%d ('%s') is global and assumed valid.\n",
subprog, sub_name);
+ if (env->subprog_info[subprog].changes_pkt_data)
+ clear_all_pkt_pointers(env);
/* mark global subprog for verifying after main prog */
subprog_aux(env, subprog)->called = true;
clear_caller_saved_regs(env, caller->regs);
@@ -9595,9 +10571,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "caller:\n");
- print_verifier_state(env, caller, true);
+ print_verifier_state(env, state, caller->frameno, true);
verbose(env, "callee:\n");
- print_verifier_state(env, state->frame[state->curframe], true);
+ print_verifier_state(env, state, state->curframe, true);
}
return 0;
@@ -9653,12 +10629,8 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
struct bpf_map *map;
int err;
- if (bpf_map_ptr_poisoned(insn_aux)) {
- verbose(env, "tail_call abusing map_ptr\n");
- return -EINVAL;
- }
-
- map = BPF_MAP_PTR(insn_aux->map_ptr_state);
+ /* valid map_ptr and poison value does not matter */
+ map = insn_aux->map_ptr_state.map_ptr;
if (!map->ops->map_set_for_each_callback_args ||
!map->ops->map_for_each_callback) {
verbose(env, "callback function not allowed for map\n");
@@ -9681,7 +10653,7 @@ static int set_loop_callback_state(struct bpf_verifier_env *env,
{
/* bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx,
* u64 flags);
- * callback_fn(u32 index, void *callback_ctx);
+ * callback_fn(u64 index, void *callback_ctx);
*/
callee->regs[BPF_REG_1].type = SCALAR_VALUE;
callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
@@ -9834,9 +10806,13 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env)
return is_rbtree_lock_required_kfunc(kfunc_btf_id);
}
-static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg)
+static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg,
+ bool return_32bit)
{
- return range.minval <= reg->smin_value && reg->smax_value <= range.maxval;
+ if (return_32bit)
+ return range.minval <= reg->s32_min_value && reg->s32_max_value <= range.maxval;
+ else
+ return range.minval <= reg->smin_value && reg->smax_value <= range.maxval;
}
static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
@@ -9873,8 +10849,8 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
if (err)
return err;
- /* enforce R0 return value range */
- if (!retval_range_within(callee->callback_ret_range, r0)) {
+ /* enforce R0 return value range, and bpf_callback_t returns 64bit */
+ if (!retval_range_within(callee->callback_ret_range, r0, false)) {
verbose_invalid_scalar(env, r0, callee->callback_ret_range,
"At callback return", "R0");
return -EINVAL;
@@ -9889,18 +10865,6 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
caller->regs[BPF_REG_0] = *r0;
}
- /* callback_fn frame should have released its own additions to parent's
- * reference state at this point, or check_reference_leak would
- * complain, hence it must be the same as the caller. There is no need
- * to copy it back.
- */
- if (!callee->in_callback_fn) {
- /* Transfer references to the caller */
- err = copy_reference_state(caller, callee);
- if (err)
- return err;
- }
-
/* for callbacks like bpf_loop or bpf_for_each_map_elem go back to callsite,
* there function call logic would reschedule callback visit. If iteration
* converges is_state_visited() would prune that visit eventually.
@@ -9913,9 +10877,9 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "returning from callee:\n");
- print_verifier_state(env, callee, true);
+ print_verifier_state(env, state, callee->frameno, true);
verbose(env, "to caller at %d:\n", *insn_idx);
- print_verifier_state(env, caller, true);
+ print_verifier_state(env, state, caller->frameno, true);
}
/* clear everything in the callee. In case of exceptional exits using
* bpf_throw, this will be done by copy_verifier_state for extra frames. */
@@ -10017,12 +10981,12 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
return -EACCES;
}
- if (!BPF_MAP_PTR(aux->map_ptr_state))
+ if (!aux->map_ptr_state.map_ptr)
bpf_map_ptr_store(aux, meta->map_ptr,
- !meta->map_ptr->bypass_spec_v1);
- else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr)
- bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON,
- !meta->map_ptr->bypass_spec_v1);
+ !meta->map_ptr->bypass_spec_v1, false);
+ else if (aux->map_ptr_state.map_ptr != meta->map_ptr)
+ bpf_map_ptr_store(aux, meta->map_ptr,
+ !meta->map_ptr->bypass_spec_v1, true);
return 0;
}
@@ -10065,15 +11029,23 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exit)
{
- struct bpf_func_state *state = cur_func(env);
+ struct bpf_verifier_state *state = env->cur_state;
+ enum bpf_prog_type type = resolve_prog_type(env->prog);
+ struct bpf_reg_state *reg = reg_state(env, BPF_REG_0);
bool refs_lingering = false;
int i;
- if (!exception_exit && state->frameno && !state->in_callback_fn)
+ if (!exception_exit && cur_func(env)->frameno)
return 0;
for (i = 0; i < state->acquired_refs; i++) {
- if (!exception_exit && state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
+ if (state->refs[i].type != REF_TYPE_PTR)
+ continue;
+ /* Allow struct_ops programs to return a referenced kptr back to
+ * kernel. Type checks are performed later in check_return_code.
+ */
+ if (type == BPF_PROG_TYPE_STRUCT_OPS && !exception_exit &&
+ reg->ref_obj_id == state->refs[i].id)
continue;
verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
state->refs[i].id, state->refs[i].insn_idx);
@@ -10082,6 +11054,39 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi
return refs_lingering ? -EINVAL : 0;
}
+static int check_resource_leak(struct bpf_verifier_env *env, bool exception_exit, bool check_lock, const char *prefix)
+{
+ int err;
+
+ if (check_lock && env->cur_state->active_locks) {
+ verbose(env, "%s cannot be used inside bpf_spin_lock-ed region\n", prefix);
+ return -EINVAL;
+ }
+
+ err = check_reference_leak(env, exception_exit);
+ if (err) {
+ verbose(env, "%s would lead to reference leak\n", prefix);
+ return err;
+ }
+
+ if (check_lock && env->cur_state->active_irq_id) {
+ verbose(env, "%s cannot be used inside bpf_local_irq_save-ed region\n", prefix);
+ return -EINVAL;
+ }
+
+ if (check_lock && env->cur_state->active_rcu_lock) {
+ verbose(env, "%s cannot be used inside bpf_rcu_read_lock-ed region\n", prefix);
+ return -EINVAL;
+ }
+
+ if (check_lock && env->cur_state->active_preempt_locks) {
+ verbose(env, "%s cannot be used inside bpf_preempt_disable-ed region\n", prefix);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
struct bpf_reg_state *regs)
{
@@ -10105,7 +11110,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr,
fmt_map_off);
if (err) {
- verbose(env, "verifier bug\n");
+ verbose(env, "failed to retrieve map value address\n");
return -EFAULT;
}
fmt = (char *)(long)fmt_addr + fmt_map_off;
@@ -10176,6 +11181,34 @@ static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno
state->callback_subprogno == subprogno);
}
+/* Returns whether or not the given map type can potentially elide
+ * lookup return value nullness check. This is possible if the key
+ * is statically known.
+ */
+static bool can_elide_value_nullness(enum bpf_map_type type)
+{
+ switch (type) {
+ case BPF_MAP_TYPE_ARRAY:
+ case BPF_MAP_TYPE_PERCPU_ARRAY:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int get_helper_proto(struct bpf_verifier_env *env, int func_id,
+ const struct bpf_func_proto **ptr)
+{
+ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID)
+ return -ERANGE;
+
+ if (!env->ops->get_func_proto)
+ return -EINVAL;
+
+ *ptr = env->ops->get_func_proto(func_id, env->prog);
+ return *ptr ? 0 : -EINVAL;
+}
+
static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
{
@@ -10192,18 +11225,16 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* find function prototype */
func_id = insn->imm;
- if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
- verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
- func_id);
+ err = get_helper_proto(env, insn->imm, &fn);
+ if (err == -ERANGE) {
+ verbose(env, "invalid func %s#%d\n", func_id_name(func_id), func_id);
return -EINVAL;
}
- if (env->ops->get_func_proto)
- fn = env->ops->get_func_proto(func_id, env->prog);
- if (!fn) {
- verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
- func_id);
- return -EINVAL;
+ if (err) {
+ verbose(env, "program of this type cannot use helper %s#%d\n",
+ func_id_name(func_id), func_id);
+ return err;
}
/* eBPF programs must be GPL compatible to use GPL-ed functions */
@@ -10223,7 +11254,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
/* With LD_ABS/IND some JITs save/restore skb from r1. */
- changes_data = bpf_helper_changes_pkt_data(fn->func);
+ changes_data = bpf_helper_changes_pkt_data(func_id);
if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n",
func_id_name(func_id), func_id);
@@ -10251,6 +11282,28 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
}
+ if (env->cur_state->active_preempt_locks) {
+ if (fn->might_sleep) {
+ verbose(env, "sleepable helper %s#%d in non-preemptible region\n",
+ func_id_name(func_id), func_id);
+ return -EINVAL;
+ }
+
+ if (in_sleepable(env) && is_storage_get_function(func_id))
+ env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
+ }
+
+ if (env->cur_state->active_irq_id) {
+ if (fn->might_sleep) {
+ verbose(env, "sleepable helper %s#%d in IRQ-disabled region\n",
+ func_id_name(func_id), func_id);
+ return -EINVAL;
+ }
+
+ if (in_sleepable(env) && is_storage_get_function(func_id))
+ env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
+ }
+
meta.func_id = func_id;
/* check args */
for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
@@ -10297,7 +11350,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
struct bpf_func_state *state;
struct bpf_reg_state *reg;
- err = release_reference_state(cur_func(env), ref_obj_id);
+ err = release_reference_nomark(env->cur_state, ref_obj_id);
if (!err) {
bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
if (reg->ref_obj_id == ref_obj_id) {
@@ -10328,11 +11381,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
switch (func_id) {
case BPF_FUNC_tail_call:
- err = check_reference_leak(env, false);
- if (err) {
- verbose(env, "tail_call would lead to reference leak\n");
+ err = check_resource_leak(env, false, true, "tail_call");
+ if (err)
return err;
- }
break;
case BPF_FUNC_get_local_storage:
/* check that flags argument in get_local_storage(map, flags) is 0,
@@ -10511,11 +11562,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
"kernel subsystem misconfigured verifier\n");
return -EINVAL;
}
+
+ if (func_id == BPF_FUNC_map_lookup_elem &&
+ can_elide_value_nullness(meta.map_ptr->map_type) &&
+ meta.const_map_key >= 0 &&
+ meta.const_map_key < meta.map_ptr->max_entries)
+ ret_flag &= ~PTR_MAYBE_NULL;
+
regs[BPF_REG_0].map_ptr = meta.map_ptr;
regs[BPF_REG_0].map_uid = meta.map_uid;
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag;
- if (!type_may_be_null(ret_type) &&
- btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) {
+ if (!type_may_be_null(ret_flag) &&
+ btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) {
regs[BPF_REG_0].id = ++env->id_gen;
}
break;
@@ -10632,7 +11690,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* For release_reference() */
regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
} else if (is_acquire_function(func_id, meta.map_ptr)) {
- int id = acquire_reference_state(env, insn_idx);
+ int id = acquire_reference(env, insn_idx);
if (id < 0)
return id;
@@ -10687,10 +11745,10 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* mark_btf_func_reg_size() is used when the reg size is determined by
* the BTF func_proto's return value size and argument.
*/
-static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
- size_t reg_size)
+static void __mark_btf_func_reg_size(struct bpf_verifier_env *env, struct bpf_reg_state *regs,
+ u32 regno, size_t reg_size)
{
- struct bpf_reg_state *reg = &cur_regs(env)[regno];
+ struct bpf_reg_state *reg = &regs[regno];
if (regno == BPF_REG_0) {
/* Function return value */
@@ -10708,6 +11766,12 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
}
}
+static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
+ size_t reg_size)
+{
+ return __mark_btf_func_reg_size(env, cur_regs(env), regno, reg_size);
+}
+
static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta)
{
return meta->kfunc_flags & KF_ACQUIRE;
@@ -10814,6 +11878,16 @@ static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param
return btf_param_match_suffix(btf, arg, "__str");
}
+static bool is_kfunc_arg_irq_flag(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__irq_flag");
+}
+
+static bool is_kfunc_arg_prog(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__prog");
+}
+
static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,
const struct btf_param *arg,
const char *name)
@@ -10839,14 +11913,18 @@ enum {
KF_ARG_LIST_NODE_ID,
KF_ARG_RB_ROOT_ID,
KF_ARG_RB_NODE_ID,
+ KF_ARG_WORKQUEUE_ID,
+ KF_ARG_RES_SPIN_LOCK_ID,
};
BTF_ID_LIST(kf_arg_btf_ids)
-BTF_ID(struct, bpf_dynptr_kern)
+BTF_ID(struct, bpf_dynptr)
BTF_ID(struct, bpf_list_head)
BTF_ID(struct, bpf_list_node)
BTF_ID(struct, bpf_rb_root)
BTF_ID(struct, bpf_rb_node)
+BTF_ID(struct, bpf_wq)
+BTF_ID(struct, bpf_res_spin_lock)
static bool __is_kfunc_ptr_arg_type(const struct btf *btf,
const struct btf_param *arg, int type)
@@ -10890,6 +11968,26 @@ static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_par
return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID);
}
+static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID);
+}
+
+static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID);
+}
+
+static bool is_rbtree_node_type(const struct btf_type *t)
+{
+ return t == btf_type_by_id(btf_vmlinux, kf_arg_btf_ids[KF_ARG_RB_NODE_ID]);
+}
+
+static bool is_list_node_type(const struct btf_type *t)
+{
+ return t == btf_type_by_id(btf_vmlinux, kf_arg_btf_ids[KF_ARG_LIST_NODE_ID]);
+}
+
static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf,
const struct btf_param *arg)
{
@@ -10959,6 +12057,9 @@ enum kfunc_ptr_arg_type {
KF_ARG_PTR_TO_NULL,
KF_ARG_PTR_TO_CONST_STR,
KF_ARG_PTR_TO_MAP,
+ KF_ARG_PTR_TO_WORKQUEUE,
+ KF_ARG_PTR_TO_IRQ_FLAG,
+ KF_ARG_PTR_TO_RES_SPIN_LOCK,
};
enum special_kfunc_type {
@@ -10969,6 +12070,8 @@ enum special_kfunc_type {
KF_bpf_list_push_back_impl,
KF_bpf_list_pop_front,
KF_bpf_list_pop_back,
+ KF_bpf_list_front,
+ KF_bpf_list_back,
KF_bpf_cast_to_kern_ctx,
KF_bpf_rdonly_cast,
KF_bpf_rcu_read_lock,
@@ -10976,6 +12079,9 @@ enum special_kfunc_type {
KF_bpf_rbtree_remove,
KF_bpf_rbtree_add_impl,
KF_bpf_rbtree_first,
+ KF_bpf_rbtree_root,
+ KF_bpf_rbtree_left,
+ KF_bpf_rbtree_right,
KF_bpf_dynptr_from_skb,
KF_bpf_dynptr_from_xdp,
KF_bpf_dynptr_slice,
@@ -10984,35 +12090,26 @@ enum special_kfunc_type {
KF_bpf_percpu_obj_new_impl,
KF_bpf_percpu_obj_drop_impl,
KF_bpf_throw,
+ KF_bpf_wq_set_callback_impl,
+ KF_bpf_preempt_disable,
+ KF_bpf_preempt_enable,
KF_bpf_iter_css_task_new,
+ KF_bpf_session_cookie,
+ KF_bpf_get_kmem_cache,
+ KF_bpf_local_irq_save,
+ KF_bpf_local_irq_restore,
+ KF_bpf_iter_num_new,
+ KF_bpf_iter_num_next,
+ KF_bpf_iter_num_destroy,
+ KF_bpf_set_dentry_xattr,
+ KF_bpf_remove_dentry_xattr,
+ KF_bpf_res_spin_lock,
+ KF_bpf_res_spin_unlock,
+ KF_bpf_res_spin_lock_irqsave,
+ KF_bpf_res_spin_unlock_irqrestore,
+ KF___bpf_trap,
};
-BTF_SET_START(special_kfunc_set)
-BTF_ID(func, bpf_obj_new_impl)
-BTF_ID(func, bpf_obj_drop_impl)
-BTF_ID(func, bpf_refcount_acquire_impl)
-BTF_ID(func, bpf_list_push_front_impl)
-BTF_ID(func, bpf_list_push_back_impl)
-BTF_ID(func, bpf_list_pop_front)
-BTF_ID(func, bpf_list_pop_back)
-BTF_ID(func, bpf_cast_to_kern_ctx)
-BTF_ID(func, bpf_rdonly_cast)
-BTF_ID(func, bpf_rbtree_remove)
-BTF_ID(func, bpf_rbtree_add_impl)
-BTF_ID(func, bpf_rbtree_first)
-BTF_ID(func, bpf_dynptr_from_skb)
-BTF_ID(func, bpf_dynptr_from_xdp)
-BTF_ID(func, bpf_dynptr_slice)
-BTF_ID(func, bpf_dynptr_slice_rdwr)
-BTF_ID(func, bpf_dynptr_clone)
-BTF_ID(func, bpf_percpu_obj_new_impl)
-BTF_ID(func, bpf_percpu_obj_drop_impl)
-BTF_ID(func, bpf_throw)
-#ifdef CONFIG_CGROUPS
-BTF_ID(func, bpf_iter_css_task_new)
-#endif
-BTF_SET_END(special_kfunc_set)
-
BTF_ID_LIST(special_kfunc_list)
BTF_ID(func, bpf_obj_new_impl)
BTF_ID(func, bpf_obj_drop_impl)
@@ -11021,6 +12118,8 @@ BTF_ID(func, bpf_list_push_front_impl)
BTF_ID(func, bpf_list_push_back_impl)
BTF_ID(func, bpf_list_pop_front)
BTF_ID(func, bpf_list_pop_back)
+BTF_ID(func, bpf_list_front)
+BTF_ID(func, bpf_list_back)
BTF_ID(func, bpf_cast_to_kern_ctx)
BTF_ID(func, bpf_rdonly_cast)
BTF_ID(func, bpf_rcu_read_lock)
@@ -11028,19 +12127,53 @@ BTF_ID(func, bpf_rcu_read_unlock)
BTF_ID(func, bpf_rbtree_remove)
BTF_ID(func, bpf_rbtree_add_impl)
BTF_ID(func, bpf_rbtree_first)
+BTF_ID(func, bpf_rbtree_root)
+BTF_ID(func, bpf_rbtree_left)
+BTF_ID(func, bpf_rbtree_right)
+#ifdef CONFIG_NET
BTF_ID(func, bpf_dynptr_from_skb)
BTF_ID(func, bpf_dynptr_from_xdp)
+#else
+BTF_ID_UNUSED
+BTF_ID_UNUSED
+#endif
BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
BTF_ID(func, bpf_dynptr_clone)
BTF_ID(func, bpf_percpu_obj_new_impl)
BTF_ID(func, bpf_percpu_obj_drop_impl)
BTF_ID(func, bpf_throw)
+BTF_ID(func, bpf_wq_set_callback_impl)
+BTF_ID(func, bpf_preempt_disable)
+BTF_ID(func, bpf_preempt_enable)
#ifdef CONFIG_CGROUPS
BTF_ID(func, bpf_iter_css_task_new)
#else
BTF_ID_UNUSED
#endif
+#ifdef CONFIG_BPF_EVENTS
+BTF_ID(func, bpf_session_cookie)
+#else
+BTF_ID_UNUSED
+#endif
+BTF_ID(func, bpf_get_kmem_cache)
+BTF_ID(func, bpf_local_irq_save)
+BTF_ID(func, bpf_local_irq_restore)
+BTF_ID(func, bpf_iter_num_new)
+BTF_ID(func, bpf_iter_num_next)
+BTF_ID(func, bpf_iter_num_destroy)
+#ifdef CONFIG_BPF_LSM
+BTF_ID(func, bpf_set_dentry_xattr)
+BTF_ID(func, bpf_remove_dentry_xattr)
+#else
+BTF_ID_UNUSED
+BTF_ID_UNUSED
+#endif
+BTF_ID(func, bpf_res_spin_lock)
+BTF_ID(func, bpf_res_spin_unlock)
+BTF_ID(func, bpf_res_spin_lock_irqsave)
+BTF_ID(func, bpf_res_spin_unlock_irqrestore)
+BTF_ID(func, __bpf_trap)
static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -11062,6 +12195,16 @@ static bool is_kfunc_bpf_rcu_read_unlock(struct bpf_kfunc_call_arg_meta *meta)
return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_unlock];
}
+static bool is_kfunc_bpf_preempt_disable(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_preempt_disable];
+}
+
+static bool is_kfunc_bpf_preempt_enable(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_preempt_enable];
+}
+
static enum kfunc_ptr_arg_type
get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
struct bpf_kfunc_call_arg_meta *meta,
@@ -11085,6 +12228,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno))
return KF_ARG_PTR_TO_CTX;
+ if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg))
+ return KF_ARG_PTR_TO_NULL;
+
if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_ALLOC_BTF_ID;
@@ -11094,7 +12240,7 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_dynptr(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_DYNPTR;
- if (is_kfunc_arg_iter(meta, argno))
+ if (is_kfunc_arg_iter(meta, argno, &args[argno]))
return KF_ARG_PTR_TO_ITER;
if (is_kfunc_arg_list_head(meta->btf, &args[argno]))
@@ -11115,6 +12261,15 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_map(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_MAP;
+ if (is_kfunc_arg_wq(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_WORKQUEUE;
+
+ if (is_kfunc_arg_irq_flag(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_IRQ_FLAG;
+
+ if (is_kfunc_arg_res_spin_lock(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_RES_SPIN_LOCK;
+
if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) {
if (!btf_type_is_struct(ref_t)) {
verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n",
@@ -11127,9 +12282,6 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))
return KF_ARG_PTR_TO_CALLBACK;
- if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg))
- return KF_ARG_PTR_TO_NULL;
-
if (argno + 1 < nargs &&
(is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) ||
is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1])))
@@ -11160,6 +12312,8 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
bool strict_type_match = false;
const struct btf *reg_btf;
const char *reg_ref_tname;
+ bool taking_projection;
+ bool struct_same;
u32 reg_ref_id;
if (base_type(reg->type) == PTR_TO_BTF_ID) {
@@ -11194,16 +12348,23 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
* btf_struct_ids_match() to walk the struct at the 0th offset, and
* resolve types.
*/
- if (is_kfunc_acquire(meta) ||
- (is_kfunc_release(meta) && reg->ref_obj_id) ||
+ if ((is_kfunc_release(meta) && reg->ref_obj_id) ||
btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id))
strict_type_match = true;
- WARN_ON_ONCE(is_kfunc_trusted_args(meta) && reg->off);
+ WARN_ON_ONCE(is_kfunc_release(meta) &&
+ (reg->off || !tnum_is_const(reg->var_off) ||
+ reg->var_off.value));
reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, &reg_ref_id);
reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off);
- if (!btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match)) {
+ struct_same = btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match);
+ /* If kfunc is accepting a projection type (ie. __sk_buff), it cannot
+ * actually use it -- it must cast to the underlying type. So we allow
+ * caller to pass in the underlying type.
+ */
+ taking_projection = btf_is_projection_of(ref_tname, reg_ref_tname);
+ if (!taking_projection && !struct_same) {
verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1,
btf_type_str(reg_ref_t), reg_ref_tname);
@@ -11212,12 +12373,65 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
return 0;
}
+static int process_irq_flag(struct bpf_verifier_env *env, int regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ int err, kfunc_class = IRQ_NATIVE_KFUNC;
+ bool irq_save;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save] ||
+ meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) {
+ irq_save = true;
+ if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])
+ kfunc_class = IRQ_LOCK_KFUNC;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_restore] ||
+ meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) {
+ irq_save = false;
+ if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore])
+ kfunc_class = IRQ_LOCK_KFUNC;
+ } else {
+ verbose(env, "verifier internal error: unknown irq flags kfunc\n");
+ return -EFAULT;
+ }
+
+ if (irq_save) {
+ if (!is_irq_flag_reg_valid_uninit(env, reg)) {
+ verbose(env, "expected uninitialized irq flag as arg#%d\n", regno - 1);
+ return -EINVAL;
+ }
+
+ err = check_mem_access(env, env->insn_idx, regno, 0, BPF_DW, BPF_WRITE, -1, false, false);
+ if (err)
+ return err;
+
+ err = mark_stack_slot_irq_flag(env, meta, reg, env->insn_idx, kfunc_class);
+ if (err)
+ return err;
+ } else {
+ err = is_irq_flag_reg_valid_init(env, reg);
+ if (err) {
+ verbose(env, "expected an initialized irq flag as arg#%d\n", regno - 1);
+ return err;
+ }
+
+ err = mark_irq_flag_read(env, reg);
+ if (err)
+ return err;
+
+ err = unmark_stack_slot_irq_flag(env, reg, kfunc_class);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+
static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- struct bpf_verifier_state *state = env->cur_state;
struct btf_record *rec = reg_btf_record(reg);
- if (!state->active_lock.ptr) {
+ if (!env->cur_state->active_locks) {
verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
return -EFAULT;
}
@@ -11236,12 +12450,11 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state
static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_obj_id)
{
- struct bpf_func_state *state, *unused;
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_func_state *unused;
struct bpf_reg_state *reg;
int i;
- state = cur_func(env);
-
if (!ref_obj_id) {
verbose(env, "verifier internal error: ref_obj_id is zero for "
"owning -> non-owning conversion\n");
@@ -11314,6 +12527,7 @@ static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_o
*/
static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
+ struct bpf_reference_state *s;
void *ptr;
u32 id;
@@ -11330,10 +12544,10 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_
}
id = reg->id;
- if (!env->cur_state->active_lock.ptr)
+ if (!env->cur_state->active_locks)
return -EINVAL;
- if (env->cur_state->active_lock.ptr != ptr ||
- env->cur_state->active_lock.id != id) {
+ s = find_lock_state(env->cur_state, REF_TYPE_LOCK_MASK, id, ptr);
+ if (!s) {
verbose(env, "held lock and object are not in the same allocation\n");
return -EINVAL;
}
@@ -11345,14 +12559,26 @@ static bool is_bpf_list_api_kfunc(u32 btf_id)
return btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
btf_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
btf_id == special_kfunc_list[KF_bpf_list_pop_front] ||
- btf_id == special_kfunc_list[KF_bpf_list_pop_back];
+ btf_id == special_kfunc_list[KF_bpf_list_pop_back] ||
+ btf_id == special_kfunc_list[KF_bpf_list_front] ||
+ btf_id == special_kfunc_list[KF_bpf_list_back];
}
static bool is_bpf_rbtree_api_kfunc(u32 btf_id)
{
return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] ||
btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
- btf_id == special_kfunc_list[KF_bpf_rbtree_first];
+ btf_id == special_kfunc_list[KF_bpf_rbtree_first] ||
+ btf_id == special_kfunc_list[KF_bpf_rbtree_root] ||
+ btf_id == special_kfunc_list[KF_bpf_rbtree_left] ||
+ btf_id == special_kfunc_list[KF_bpf_rbtree_right];
+}
+
+static bool is_bpf_iter_num_api_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_iter_num_new] ||
+ btf_id == special_kfunc_list[KF_bpf_iter_num_next] ||
+ btf_id == special_kfunc_list[KF_bpf_iter_num_destroy];
}
static bool is_bpf_graph_api_kfunc(u32 btf_id)
@@ -11361,17 +12587,47 @@ static bool is_bpf_graph_api_kfunc(u32 btf_id)
btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl];
}
+static bool is_bpf_res_spin_lock_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_res_spin_lock] ||
+ btf_id == special_kfunc_list[KF_bpf_res_spin_unlock] ||
+ btf_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] ||
+ btf_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore];
+}
+
+static bool kfunc_spin_allowed(u32 btf_id)
+{
+ return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id) ||
+ is_bpf_res_spin_lock_kfunc(btf_id);
+}
+
static bool is_sync_callback_calling_kfunc(u32 btf_id)
{
return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
}
+static bool is_async_callback_calling_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl];
+}
+
static bool is_bpf_throw_kfunc(struct bpf_insn *insn)
{
return bpf_pseudo_kfunc_call(insn) && insn->off == 0 &&
insn->imm == special_kfunc_list[KF_bpf_throw];
}
+static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl];
+}
+
+static bool is_callback_calling_kfunc(u32 btf_id)
+{
+ return is_sync_callback_calling_kfunc(btf_id) ||
+ is_async_callback_calling_kfunc(btf_id);
+}
+
static bool is_rbtree_lock_required_kfunc(u32 btf_id)
{
return is_bpf_rbtree_api_kfunc(btf_id);
@@ -11415,7 +12671,9 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
break;
case BPF_RB_NODE:
ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
- kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]);
+ kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_left] ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_right]);
break;
default:
verbose(env, "verifier internal error: unexpected graph node argument type %s\n",
@@ -11527,7 +12785,7 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env,
node_off = reg->off + reg->var_off.value;
field = reg_find_field_offset(reg, node_off, node_field_type);
- if (!field || field->offset != node_off) {
+ if (!field) {
verbose(env, "%s not found at offset=%u\n", node_type_name, node_off);
return -EINVAL;
}
@@ -11635,6 +12893,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if (is_kfunc_arg_ignore(btf, &args[i]))
continue;
+ if (is_kfunc_arg_prog(btf, &args[i])) {
+ /* Used to reject repeated use of __prog. */
+ if (meta->arg_prog) {
+ verbose(env, "Only 1 prog->aux argument supported per-kfunc\n");
+ return -EFAULT;
+ }
+ meta->arg_prog = true;
+ cur_aux(env)->arg_prog = regno;
+ continue;
+ }
+
if (btf_type_is_scalar(t)) {
if (reg->type != SCALAR_VALUE) {
verbose(env, "R%d is not a scalar\n", regno);
@@ -11716,6 +12985,34 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
case KF_ARG_PTR_TO_NULL:
continue;
case KF_ARG_PTR_TO_MAP:
+ if (!reg->map_ptr) {
+ verbose(env, "pointer in R%d isn't map pointer\n", regno);
+ return -EINVAL;
+ }
+ if (meta->map.ptr && reg->map_ptr->record->wq_off >= 0) {
+ /* Use map_uid (which is unique id of inner map) to reject:
+ * inner_map1 = bpf_map_lookup_elem(outer_map, key1)
+ * inner_map2 = bpf_map_lookup_elem(outer_map, key2)
+ * if (inner_map1 && inner_map2) {
+ * wq = bpf_map_lookup_elem(inner_map1);
+ * if (wq)
+ * // mismatch would have been allowed
+ * bpf_wq_init(wq, inner_map2);
+ * }
+ *
+ * Comparing map_ptr is enough to distinguish normal and outer maps.
+ */
+ if (meta->map.ptr != reg->map_ptr ||
+ meta->map.uid != reg->map_uid) {
+ verbose(env,
+ "workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n",
+ meta->map.uid, reg->map_uid);
+ return -EINVAL;
+ }
+ }
+ meta->map.ptr = reg->map_ptr;
+ meta->map.uid = reg->map_uid;
+ fallthrough;
case KF_ARG_PTR_TO_ALLOC_BTF_ID:
case KF_ARG_PTR_TO_BTF_ID:
if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta))
@@ -11731,12 +13028,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return -EINVAL;
}
}
-
fallthrough;
case KF_ARG_PTR_TO_CTX:
- /* Trusted arguments have the same offset checks as release arguments */
- arg_type |= OBJ_RELEASE;
- break;
case KF_ARG_PTR_TO_DYNPTR:
case KF_ARG_PTR_TO_ITER:
case KF_ARG_PTR_TO_LIST_HEAD:
@@ -11748,7 +13041,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
case KF_ARG_PTR_TO_CALLBACK:
case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
case KF_ARG_PTR_TO_CONST_STR:
- /* Trusted by default */
+ case KF_ARG_PTR_TO_WORKQUEUE:
+ case KF_ARG_PTR_TO_IRQ_FLAG:
+ case KF_ARG_PTR_TO_RES_SPIN_LOCK:
break;
default:
WARN_ON_ONCE(1);
@@ -11764,7 +13059,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
switch (kf_arg_type) {
case KF_ARG_PTR_TO_CTX:
if (reg->type != PTR_TO_CTX) {
- verbose(env, "arg#%d expected pointer to ctx, but got %s\n", i, btf_type_str(t));
+ verbose(env, "arg#%d expected pointer to ctx, but got %s\n",
+ i, reg_type_str(env, reg->type));
return -EINVAL;
}
@@ -11804,12 +13100,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR;
int clone_ref_obj_id = 0;
- if (reg->type != PTR_TO_STACK &&
- reg->type != CONST_PTR_TO_DYNPTR) {
- verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i);
- return -EINVAL;
- }
-
if (reg->type == CONST_PTR_TO_DYNPTR)
dynptr_arg_type |= MEM_RDONLY;
@@ -11908,22 +13198,22 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return ret;
break;
case KF_ARG_PTR_TO_RB_NODE:
- if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_remove]) {
- if (!type_is_non_owning_ref(reg->type) || reg->ref_obj_id) {
- verbose(env, "rbtree_remove node input must be non-owning ref\n");
+ if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to allocated object\n", i);
return -EINVAL;
}
- if (in_rbtree_lock_required_cb(env)) {
- verbose(env, "rbtree_remove not allowed in rbtree cb\n");
+ if (!reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
return -EINVAL;
}
} else {
- if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
- verbose(env, "arg#%d expected pointer to allocated object\n", i);
+ if (!type_is_non_owning_ref(reg->type) && !reg->ref_obj_id) {
+ verbose(env, "%s can only take non-owning or refcounted bpf_rb_node pointer\n", func_name);
return -EINVAL;
}
- if (!reg->ref_obj_id) {
- verbose(env, "allocated object must be referenced\n");
+ if (in_rbtree_lock_required_cb(env)) {
+ verbose(env, "%s not allowed in rbtree cb\n", func_name);
return -EINVAL;
}
}
@@ -12034,6 +13324,46 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if (ret)
return ret;
break;
+ case KF_ARG_PTR_TO_WORKQUEUE:
+ if (reg->type != PTR_TO_MAP_VALUE) {
+ verbose(env, "arg#%d doesn't point to a map value\n", i);
+ return -EINVAL;
+ }
+ ret = process_wq_func(env, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_IRQ_FLAG:
+ if (reg->type != PTR_TO_STACK) {
+ verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i);
+ return -EINVAL;
+ }
+ ret = process_irq_flag(env, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_RES_SPIN_LOCK:
+ {
+ int flags = PROCESS_RES_LOCK;
+
+ if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d doesn't point to map value or allocated object\n", i);
+ return -EINVAL;
+ }
+
+ if (!is_bpf_res_spin_lock_kfunc(meta->func_id))
+ return -EFAULT;
+ if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock] ||
+ meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])
+ flags |= PROCESS_SPIN_LOCK;
+ if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] ||
+ meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore])
+ flags |= PROCESS_LOCK_IRQ;
+ ret = process_spin_lock(env, regno, flags);
+ if (ret < 0)
+ return ret;
+ break;
+ }
}
}
@@ -12088,21 +13418,192 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env,
return 0;
}
+/* check special kfuncs and return:
+ * 1 - not fall-through to 'else' branch, continue verification
+ * 0 - fall-through to 'else' branch
+ * < 0 - not fall-through to 'else' branch, return error
+ */
+static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta,
+ struct bpf_reg_state *regs, struct bpf_insn_aux_data *insn_aux,
+ const struct btf_type *ptr_type, struct btf *desc_btf)
+{
+ const struct btf_type *ret_t;
+ int err = 0;
+
+ if (meta->btf != btf_vmlinux)
+ return 0;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
+ meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ struct btf_struct_meta *struct_meta;
+ struct btf *ret_btf;
+ u32 ret_btf_id;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
+ return -ENOMEM;
+
+ if (((u64)(u32)meta->arg_constant.value) != meta->arg_constant.value) {
+ verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
+ return -EINVAL;
+ }
+
+ ret_btf = env->prog->aux->btf;
+ ret_btf_id = meta->arg_constant.value;
+
+ /* This may be NULL due to user not supplying a BTF */
+ if (!ret_btf) {
+ verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n");
+ return -EINVAL;
+ }
+
+ ret_t = btf_type_by_id(ret_btf, ret_btf_id);
+ if (!ret_t || !__btf_type_is_struct(ret_t)) {
+ verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n");
+ return -EINVAL;
+ }
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) {
+ verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n",
+ ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE);
+ return -EINVAL;
+ }
+
+ if (!bpf_global_percpu_ma_set) {
+ mutex_lock(&bpf_percpu_ma_lock);
+ if (!bpf_global_percpu_ma_set) {
+ /* Charge memory allocated with bpf_global_percpu_ma to
+ * root memcg. The obj_cgroup for root memcg is NULL.
+ */
+ err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL);
+ if (!err)
+ bpf_global_percpu_ma_set = true;
+ }
+ mutex_unlock(&bpf_percpu_ma_lock);
+ if (err)
+ return err;
+ }
+
+ mutex_lock(&bpf_percpu_ma_lock);
+ err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size);
+ mutex_unlock(&bpf_percpu_ma_lock);
+ if (err)
+ return err;
+ }
+
+ struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id);
+ if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) {
+ verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n");
+ return -EINVAL;
+ }
+
+ if (struct_meta) {
+ verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n");
+ return -EINVAL;
+ }
+ }
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[BPF_REG_0].btf = ret_btf;
+ regs[BPF_REG_0].btf_id = ret_btf_id;
+ if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl])
+ regs[BPF_REG_0].type |= MEM_PERCPU;
+
+ insn_aux->obj_new_size = ret_t->size;
+ insn_aux->kptr_struct_meta = struct_meta;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[BPF_REG_0].btf = meta->arg_btf;
+ regs[BPF_REG_0].btf_id = meta->arg_btf_id;
+
+ insn_aux->kptr_struct_meta =
+ btf_find_struct_meta(meta->arg_btf,
+ meta->arg_btf_id);
+ } else if (is_list_node_type(ptr_type)) {
+ struct btf_field *field = meta->arg_list_head.field;
+
+ mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
+ } else if (is_rbtree_node_type(ptr_type)) {
+ struct btf_field *field = meta->arg_rbtree_root.field;
+
+ mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED;
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].btf_id = meta->ret_btf_id;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
+ ret_t = btf_type_by_id(desc_btf, meta->arg_constant.value);
+ if (!ret_t || !btf_type_is_struct(ret_t)) {
+ verbose(env,
+ "kfunc bpf_rdonly_cast type ID argument must be of a struct\n");
+ return -EINVAL;
+ }
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].btf_id = meta->arg_constant.value;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice] ||
+ meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) {
+ enum bpf_type_flag type_flag = get_dynptr_type_flag(meta->initialized_dynptr.type);
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+
+ if (!meta->arg_constant.found) {
+ verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n");
+ return -EFAULT;
+ }
+
+ regs[BPF_REG_0].mem_size = meta->arg_constant.value;
+
+ /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */
+ regs[BPF_REG_0].type = PTR_TO_MEM | type_flag;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice]) {
+ regs[BPF_REG_0].type |= MEM_RDONLY;
+ } else {
+ /* this will set env->seen_direct_write to true */
+ if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) {
+ verbose(env, "the prog does not allow writes to packet data\n");
+ return -EINVAL;
+ }
+ }
+
+ if (!meta->initialized_dynptr.id) {
+ verbose(env, "verifier internal error: no dynptr id\n");
+ return -EFAULT;
+ }
+ regs[BPF_REG_0].dynptr_id = meta->initialized_dynptr.id;
+
+ /* we don't need to set BPF_REG_0's ref obj id
+ * because packet slices are not refcounted (see
+ * dynptr_type_refcounted)
+ */
+ } else {
+ return 0;
+ }
+
+ return 1;
+}
+
static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name);
static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
{
- const struct btf_type *t, *ptr_type;
+ bool sleepable, rcu_lock, rcu_unlock, preempt_disable, preempt_enable;
u32 i, nargs, ptr_type_id, release_ref_obj_id;
struct bpf_reg_state *regs = cur_regs(env);
const char *func_name, *ptr_type_name;
- bool sleepable, rcu_lock, rcu_unlock;
+ const struct btf_type *t, *ptr_type;
struct bpf_kfunc_call_arg_meta meta;
struct bpf_insn_aux_data *insn_aux;
int err, insn_idx = *insn_idx_p;
const struct btf_param *args;
- const struct btf_type *ret_t;
struct btf *desc_btf;
/* skip for now, but return error when we find this in fixup_kfunc_call */
@@ -12119,6 +13620,36 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
insn_aux->is_iter_next = is_iter_next_kfunc(&meta);
+ if (!insn->off &&
+ (insn->imm == special_kfunc_list[KF_bpf_res_spin_lock] ||
+ insn->imm == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])) {
+ struct bpf_verifier_state *branch;
+ struct bpf_reg_state *regs;
+
+ branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false);
+ if (!branch) {
+ verbose(env, "failed to push state for failed lock acquisition\n");
+ return -ENOMEM;
+ }
+
+ regs = branch->frame[branch->curframe]->regs;
+
+ /* Clear r0-r5 registers in forked state */
+ for (i = 0; i < CALLER_SAVED_REGS; i++)
+ mark_reg_not_init(env, regs, caller_saved[i]);
+
+ mark_reg_unknown(env, regs, BPF_REG_0);
+ err = __mark_reg_s32_range(env, regs, BPF_REG_0, -MAX_ERRNO, -1);
+ if (err) {
+ verbose(env, "failed to mark s32 range for retval in forked state for lock\n");
+ return err;
+ }
+ __mark_btf_func_reg_size(env, regs, BPF_REG_0, sizeof(u32));
+ } else if (!insn->off && insn->imm == special_kfunc_list[KF___bpf_trap]) {
+ verbose(env, "unexpected __bpf_trap() due to uninitialized variable?\n");
+ return -EFAULT;
+ }
+
if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) {
verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n");
return -EACCES;
@@ -12145,9 +13676,27 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
}
+ if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie]) {
+ meta.r0_size = sizeof(u64);
+ meta.r0_rdonly = false;
+ }
+
+ if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) {
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_timer_callback_state);
+ if (err) {
+ verbose(env, "kfunc %s#%d failed callback verification\n",
+ func_name, meta.func_id);
+ return err;
+ }
+ }
+
rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
+ preempt_disable = is_kfunc_bpf_preempt_disable(&meta);
+ preempt_enable = is_kfunc_bpf_preempt_enable(&meta);
+
if (env->cur_state->active_rcu_lock) {
struct bpf_func_state *state;
struct bpf_reg_state *reg;
@@ -12180,6 +13729,27 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EINVAL;
}
+ if (env->cur_state->active_preempt_locks) {
+ if (preempt_disable) {
+ env->cur_state->active_preempt_locks++;
+ } else if (preempt_enable) {
+ env->cur_state->active_preempt_locks--;
+ } else if (sleepable) {
+ verbose(env, "kernel func %s is sleepable within non-preemptible region\n", func_name);
+ return -EACCES;
+ }
+ } else if (preempt_disable) {
+ env->cur_state->active_preempt_locks++;
+ } else if (preempt_enable) {
+ verbose(env, "unmatched attempt to enable preemption (kernel function %s)\n", func_name);
+ return -EINVAL;
+ }
+
+ if (env->cur_state->active_irq_id && sleepable) {
+ verbose(env, "kernel func %s is sleepable within IRQ-disabled region\n", func_name);
+ return -EACCES;
+ }
+
/* In case of release function, we get register number of refcounted
* PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
*/
@@ -12250,168 +13820,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (btf_type_is_scalar(t)) {
mark_reg_unknown(env, regs, BPF_REG_0);
+ if (meta.btf == btf_vmlinux && (meta.func_id == special_kfunc_list[KF_bpf_res_spin_lock] ||
+ meta.func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]))
+ __mark_reg_const_zero(env, &regs[BPF_REG_0]);
mark_btf_func_reg_size(env, BPF_REG_0, t->size);
} else if (btf_type_is_ptr(t)) {
ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id);
-
- if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
- if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
- meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
- struct btf_struct_meta *struct_meta;
- struct btf *ret_btf;
- u32 ret_btf_id;
-
- if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
- return -ENOMEM;
-
- if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
- verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
- return -EINVAL;
- }
-
- ret_btf = env->prog->aux->btf;
- ret_btf_id = meta.arg_constant.value;
-
- /* This may be NULL due to user not supplying a BTF */
- if (!ret_btf) {
- verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n");
- return -EINVAL;
- }
-
- ret_t = btf_type_by_id(ret_btf, ret_btf_id);
- if (!ret_t || !__btf_type_is_struct(ret_t)) {
- verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n");
- return -EINVAL;
- }
-
- if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
- if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) {
- verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n",
- ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE);
- return -EINVAL;
- }
-
- if (!bpf_global_percpu_ma_set) {
- mutex_lock(&bpf_percpu_ma_lock);
- if (!bpf_global_percpu_ma_set) {
- /* Charge memory allocated with bpf_global_percpu_ma to
- * root memcg. The obj_cgroup for root memcg is NULL.
- */
- err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL);
- if (!err)
- bpf_global_percpu_ma_set = true;
- }
- mutex_unlock(&bpf_percpu_ma_lock);
- if (err)
- return err;
- }
-
- mutex_lock(&bpf_percpu_ma_lock);
- err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size);
- mutex_unlock(&bpf_percpu_ma_lock);
- if (err)
- return err;
- }
-
- struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id);
- if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
- if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) {
- verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n");
- return -EINVAL;
- }
-
- if (struct_meta) {
- verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n");
- return -EINVAL;
- }
- }
-
- mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
- regs[BPF_REG_0].btf = ret_btf;
- regs[BPF_REG_0].btf_id = ret_btf_id;
- if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl])
- regs[BPF_REG_0].type |= MEM_PERCPU;
-
- insn_aux->obj_new_size = ret_t->size;
- insn_aux->kptr_struct_meta = struct_meta;
- } else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
- mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
- regs[BPF_REG_0].btf = meta.arg_btf;
- regs[BPF_REG_0].btf_id = meta.arg_btf_id;
-
- insn_aux->kptr_struct_meta =
- btf_find_struct_meta(meta.arg_btf,
- meta.arg_btf_id);
- } else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] ||
- meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) {
- struct btf_field *field = meta.arg_list_head.field;
-
- mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
- } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
- meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) {
- struct btf_field *field = meta.arg_rbtree_root.field;
-
- mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
- } else if (meta.func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
- mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED;
- regs[BPF_REG_0].btf = desc_btf;
- regs[BPF_REG_0].btf_id = meta.ret_btf_id;
- } else if (meta.func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
- ret_t = btf_type_by_id(desc_btf, meta.arg_constant.value);
- if (!ret_t || !btf_type_is_struct(ret_t)) {
- verbose(env,
- "kfunc bpf_rdonly_cast type ID argument must be of a struct\n");
- return -EINVAL;
- }
-
- mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
- regs[BPF_REG_0].btf = desc_btf;
- regs[BPF_REG_0].btf_id = meta.arg_constant.value;
- } else if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice] ||
- meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) {
- enum bpf_type_flag type_flag = get_dynptr_type_flag(meta.initialized_dynptr.type);
-
- mark_reg_known_zero(env, regs, BPF_REG_0);
-
- if (!meta.arg_constant.found) {
- verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n");
- return -EFAULT;
- }
-
- regs[BPF_REG_0].mem_size = meta.arg_constant.value;
-
- /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */
- regs[BPF_REG_0].type = PTR_TO_MEM | type_flag;
-
- if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice]) {
- regs[BPF_REG_0].type |= MEM_RDONLY;
- } else {
- /* this will set env->seen_direct_write to true */
- if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) {
- verbose(env, "the prog does not allow writes to packet data\n");
- return -EINVAL;
- }
- }
-
- if (!meta.initialized_dynptr.id) {
- verbose(env, "verifier internal error: no dynptr id\n");
- return -EFAULT;
- }
- regs[BPF_REG_0].dynptr_id = meta.initialized_dynptr.id;
-
- /* we don't need to set BPF_REG_0's ref obj id
- * because packet slices are not refcounted (see
- * dynptr_type_refcounted)
- */
- } else {
- verbose(env, "kernel function %s unhandled dynamic return type\n",
- meta.func_name);
- return -EFAULT;
- }
+ err = check_special_kfunc(env, &meta, regs, insn_aux, ptr_type, desc_btf);
+ if (err) {
+ if (err < 0)
+ return err;
} else if (btf_type_is_void(ptr_type)) {
/* kfunc returning 'void *' is equivalent to returning scalar */
mark_reg_unknown(env, regs, BPF_REG_0);
@@ -12450,6 +13868,20 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
regs[BPF_REG_0].btf = desc_btf;
regs[BPF_REG_0].type = PTR_TO_BTF_ID;
regs[BPF_REG_0].btf_id = ptr_type_id;
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache])
+ regs[BPF_REG_0].type |= PTR_UNTRUSTED;
+
+ if (is_iter_next_kfunc(&meta)) {
+ struct bpf_reg_state *cur_iter;
+
+ cur_iter = get_iter_from_state(env->cur_state, &meta);
+
+ if (cur_iter->type & MEM_RCU) /* KF_RCU_PROTECTED */
+ regs[BPF_REG_0].type |= MEM_RCU;
+ else
+ regs[BPF_REG_0].type |= PTR_TRUSTED;
+ }
}
if (is_kfunc_ret_null(&meta)) {
@@ -12459,21 +13891,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
if (is_kfunc_acquire(&meta)) {
- int id = acquire_reference_state(env, insn_idx);
+ int id = acquire_reference(env, insn_idx);
if (id < 0)
return id;
if (is_kfunc_ret_null(&meta))
regs[BPF_REG_0].id = id;
regs[BPF_REG_0].ref_obj_id = id;
- } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) {
+ } else if (is_rbtree_node_type(ptr_type) || is_list_node_type(ptr_type)) {
ref_set_non_owning(env, &regs[BPF_REG_0]);
}
if (reg_may_point_to_spin_lock(&regs[BPF_REG_0]) && !regs[BPF_REG_0].id)
regs[BPF_REG_0].id = ++env->id_gen;
} else if (btf_type_is_void(t)) {
- if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
+ if (meta.btf == btf_vmlinux) {
if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) {
insn_aux->kptr_struct_meta =
@@ -12505,46 +13937,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return 0;
}
-static bool signed_add_overflows(s64 a, s64 b)
-{
- /* Do the add in u64, where overflow is well-defined */
- s64 res = (s64)((u64)a + (u64)b);
-
- if (b < 0)
- return res > a;
- return res < a;
-}
-
-static bool signed_add32_overflows(s32 a, s32 b)
-{
- /* Do the add in u32, where overflow is well-defined */
- s32 res = (s32)((u32)a + (u32)b);
-
- if (b < 0)
- return res > a;
- return res < a;
-}
-
-static bool signed_sub_overflows(s64 a, s64 b)
-{
- /* Do the sub in u64, where overflow is well-defined */
- s64 res = (s64)((u64)a - (u64)b);
-
- if (b < 0)
- return res < a;
- return res > a;
-}
-
-static bool signed_sub32_overflows(s32 a, s32 b)
-{
- /* Do the sub in u32, where overflow is well-defined */
- s32 res = (s32)((u32)a - (u32)b);
-
- if (b < 0)
- return res < a;
- return res > a;
-}
-
static bool check_reg_sane_offset(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
enum bpf_reg_type type)
@@ -13026,21 +14418,15 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
* added into the variable offset, and we copy the fixed offset
* from ptr_reg.
*/
- if (signed_add_overflows(smin_ptr, smin_val) ||
- signed_add_overflows(smax_ptr, smax_val)) {
+ if (check_add_overflow(smin_ptr, smin_val, &dst_reg->smin_value) ||
+ check_add_overflow(smax_ptr, smax_val, &dst_reg->smax_value)) {
dst_reg->smin_value = S64_MIN;
dst_reg->smax_value = S64_MAX;
- } else {
- dst_reg->smin_value = smin_ptr + smin_val;
- dst_reg->smax_value = smax_ptr + smax_val;
}
- if (umin_ptr + umin_val < umin_ptr ||
- umax_ptr + umax_val < umax_ptr) {
+ if (check_add_overflow(umin_ptr, umin_val, &dst_reg->umin_value) ||
+ check_add_overflow(umax_ptr, umax_val, &dst_reg->umax_value)) {
dst_reg->umin_value = 0;
dst_reg->umax_value = U64_MAX;
- } else {
- dst_reg->umin_value = umin_ptr + umin_val;
- dst_reg->umax_value = umax_ptr + umax_val;
}
dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
dst_reg->off = ptr_reg->off;
@@ -13083,14 +14469,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
/* A new variable offset is created. If the subtrahend is known
* nonnegative, then any reg->range we had before is still good.
*/
- if (signed_sub_overflows(smin_ptr, smax_val) ||
- signed_sub_overflows(smax_ptr, smin_val)) {
+ if (check_sub_overflow(smin_ptr, smax_val, &dst_reg->smin_value) ||
+ check_sub_overflow(smax_ptr, smin_val, &dst_reg->smax_value)) {
/* Overflow possible, we know nothing */
dst_reg->smin_value = S64_MIN;
dst_reg->smax_value = S64_MAX;
- } else {
- dst_reg->smin_value = smin_ptr - smax_val;
- dst_reg->smax_value = smax_ptr - smin_val;
}
if (umin_ptr < umax_val) {
/* Overflow possible, we know nothing */
@@ -13143,71 +14526,56 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
static void scalar32_min_max_add(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s32 smin_val = src_reg->s32_min_value;
- s32 smax_val = src_reg->s32_max_value;
- u32 umin_val = src_reg->u32_min_value;
- u32 umax_val = src_reg->u32_max_value;
+ s32 *dst_smin = &dst_reg->s32_min_value;
+ s32 *dst_smax = &dst_reg->s32_max_value;
+ u32 *dst_umin = &dst_reg->u32_min_value;
+ u32 *dst_umax = &dst_reg->u32_max_value;
- if (signed_add32_overflows(dst_reg->s32_min_value, smin_val) ||
- signed_add32_overflows(dst_reg->s32_max_value, smax_val)) {
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
- } else {
- dst_reg->s32_min_value += smin_val;
- dst_reg->s32_max_value += smax_val;
+ if (check_add_overflow(*dst_smin, src_reg->s32_min_value, dst_smin) ||
+ check_add_overflow(*dst_smax, src_reg->s32_max_value, dst_smax)) {
+ *dst_smin = S32_MIN;
+ *dst_smax = S32_MAX;
}
- if (dst_reg->u32_min_value + umin_val < umin_val ||
- dst_reg->u32_max_value + umax_val < umax_val) {
- dst_reg->u32_min_value = 0;
- dst_reg->u32_max_value = U32_MAX;
- } else {
- dst_reg->u32_min_value += umin_val;
- dst_reg->u32_max_value += umax_val;
+ if (check_add_overflow(*dst_umin, src_reg->u32_min_value, dst_umin) ||
+ check_add_overflow(*dst_umax, src_reg->u32_max_value, dst_umax)) {
+ *dst_umin = 0;
+ *dst_umax = U32_MAX;
}
}
static void scalar_min_max_add(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s64 smin_val = src_reg->smin_value;
- s64 smax_val = src_reg->smax_value;
- u64 umin_val = src_reg->umin_value;
- u64 umax_val = src_reg->umax_value;
+ s64 *dst_smin = &dst_reg->smin_value;
+ s64 *dst_smax = &dst_reg->smax_value;
+ u64 *dst_umin = &dst_reg->umin_value;
+ u64 *dst_umax = &dst_reg->umax_value;
- if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
- signed_add_overflows(dst_reg->smax_value, smax_val)) {
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
- } else {
- dst_reg->smin_value += smin_val;
- dst_reg->smax_value += smax_val;
+ if (check_add_overflow(*dst_smin, src_reg->smin_value, dst_smin) ||
+ check_add_overflow(*dst_smax, src_reg->smax_value, dst_smax)) {
+ *dst_smin = S64_MIN;
+ *dst_smax = S64_MAX;
}
- if (dst_reg->umin_value + umin_val < umin_val ||
- dst_reg->umax_value + umax_val < umax_val) {
- dst_reg->umin_value = 0;
- dst_reg->umax_value = U64_MAX;
- } else {
- dst_reg->umin_value += umin_val;
- dst_reg->umax_value += umax_val;
+ if (check_add_overflow(*dst_umin, src_reg->umin_value, dst_umin) ||
+ check_add_overflow(*dst_umax, src_reg->umax_value, dst_umax)) {
+ *dst_umin = 0;
+ *dst_umax = U64_MAX;
}
}
static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s32 smin_val = src_reg->s32_min_value;
- s32 smax_val = src_reg->s32_max_value;
+ s32 *dst_smin = &dst_reg->s32_min_value;
+ s32 *dst_smax = &dst_reg->s32_max_value;
u32 umin_val = src_reg->u32_min_value;
u32 umax_val = src_reg->u32_max_value;
- if (signed_sub32_overflows(dst_reg->s32_min_value, smax_val) ||
- signed_sub32_overflows(dst_reg->s32_max_value, smin_val)) {
+ if (check_sub_overflow(*dst_smin, src_reg->s32_max_value, dst_smin) ||
+ check_sub_overflow(*dst_smax, src_reg->s32_min_value, dst_smax)) {
/* Overflow possible, we know nothing */
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
- } else {
- dst_reg->s32_min_value -= smax_val;
- dst_reg->s32_max_value -= smin_val;
+ *dst_smin = S32_MIN;
+ *dst_smax = S32_MAX;
}
if (dst_reg->u32_min_value < umax_val) {
/* Overflow possible, we know nothing */
@@ -13223,19 +14591,16 @@ static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg,
static void scalar_min_max_sub(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s64 smin_val = src_reg->smin_value;
- s64 smax_val = src_reg->smax_value;
+ s64 *dst_smin = &dst_reg->smin_value;
+ s64 *dst_smax = &dst_reg->smax_value;
u64 umin_val = src_reg->umin_value;
u64 umax_val = src_reg->umax_value;
- if (signed_sub_overflows(dst_reg->smin_value, smax_val) ||
- signed_sub_overflows(dst_reg->smax_value, smin_val)) {
+ if (check_sub_overflow(*dst_smin, src_reg->smax_value, dst_smin) ||
+ check_sub_overflow(*dst_smax, src_reg->smin_value, dst_smax)) {
/* Overflow possible, we know nothing */
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
- } else {
- dst_reg->smin_value -= smax_val;
- dst_reg->smax_value -= smin_val;
+ *dst_smin = S64_MIN;
+ *dst_smax = S64_MAX;
}
if (dst_reg->umin_value < umax_val) {
/* Overflow possible, we know nothing */
@@ -13251,64 +14616,56 @@ static void scalar_min_max_sub(struct bpf_reg_state *dst_reg,
static void scalar32_min_max_mul(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s32 smin_val = src_reg->s32_min_value;
- u32 umin_val = src_reg->u32_min_value;
- u32 umax_val = src_reg->u32_max_value;
+ s32 *dst_smin = &dst_reg->s32_min_value;
+ s32 *dst_smax = &dst_reg->s32_max_value;
+ u32 *dst_umin = &dst_reg->u32_min_value;
+ u32 *dst_umax = &dst_reg->u32_max_value;
+ s32 tmp_prod[4];
- if (smin_val < 0 || dst_reg->s32_min_value < 0) {
- /* Ain't nobody got time to multiply that sign */
- __mark_reg32_unbounded(dst_reg);
- return;
- }
- /* Both values are positive, so we can work with unsigned and
- * copy the result to signed (unless it exceeds S32_MAX).
- */
- if (umax_val > U16_MAX || dst_reg->u32_max_value > U16_MAX) {
- /* Potential overflow, we know nothing */
- __mark_reg32_unbounded(dst_reg);
- return;
+ if (check_mul_overflow(*dst_umax, src_reg->u32_max_value, dst_umax) ||
+ check_mul_overflow(*dst_umin, src_reg->u32_min_value, dst_umin)) {
+ /* Overflow possible, we know nothing */
+ *dst_umin = 0;
+ *dst_umax = U32_MAX;
}
- dst_reg->u32_min_value *= umin_val;
- dst_reg->u32_max_value *= umax_val;
- if (dst_reg->u32_max_value > S32_MAX) {
+ if (check_mul_overflow(*dst_smin, src_reg->s32_min_value, &tmp_prod[0]) ||
+ check_mul_overflow(*dst_smin, src_reg->s32_max_value, &tmp_prod[1]) ||
+ check_mul_overflow(*dst_smax, src_reg->s32_min_value, &tmp_prod[2]) ||
+ check_mul_overflow(*dst_smax, src_reg->s32_max_value, &tmp_prod[3])) {
/* Overflow possible, we know nothing */
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
+ *dst_smin = S32_MIN;
+ *dst_smax = S32_MAX;
} else {
- dst_reg->s32_min_value = dst_reg->u32_min_value;
- dst_reg->s32_max_value = dst_reg->u32_max_value;
+ *dst_smin = min_array(tmp_prod, 4);
+ *dst_smax = max_array(tmp_prod, 4);
}
}
static void scalar_min_max_mul(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s64 smin_val = src_reg->smin_value;
- u64 umin_val = src_reg->umin_value;
- u64 umax_val = src_reg->umax_value;
+ s64 *dst_smin = &dst_reg->smin_value;
+ s64 *dst_smax = &dst_reg->smax_value;
+ u64 *dst_umin = &dst_reg->umin_value;
+ u64 *dst_umax = &dst_reg->umax_value;
+ s64 tmp_prod[4];
- if (smin_val < 0 || dst_reg->smin_value < 0) {
- /* Ain't nobody got time to multiply that sign */
- __mark_reg64_unbounded(dst_reg);
- return;
- }
- /* Both values are positive, so we can work with unsigned and
- * copy the result to signed (unless it exceeds S64_MAX).
- */
- if (umax_val > U32_MAX || dst_reg->umax_value > U32_MAX) {
- /* Potential overflow, we know nothing */
- __mark_reg64_unbounded(dst_reg);
- return;
+ if (check_mul_overflow(*dst_umax, src_reg->umax_value, dst_umax) ||
+ check_mul_overflow(*dst_umin, src_reg->umin_value, dst_umin)) {
+ /* Overflow possible, we know nothing */
+ *dst_umin = 0;
+ *dst_umax = U64_MAX;
}
- dst_reg->umin_value *= umin_val;
- dst_reg->umax_value *= umax_val;
- if (dst_reg->umax_value > S64_MAX) {
+ if (check_mul_overflow(*dst_smin, src_reg->smin_value, &tmp_prod[0]) ||
+ check_mul_overflow(*dst_smin, src_reg->smax_value, &tmp_prod[1]) ||
+ check_mul_overflow(*dst_smax, src_reg->smin_value, &tmp_prod[2]) ||
+ check_mul_overflow(*dst_smax, src_reg->smax_value, &tmp_prod[3])) {
/* Overflow possible, we know nothing */
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
+ *dst_smin = S64_MIN;
+ *dst_smax = S64_MAX;
} else {
- dst_reg->smin_value = dst_reg->umin_value;
- dst_reg->smax_value = dst_reg->umax_value;
+ *dst_smin = min_array(tmp_prod, 4);
+ *dst_smax = max_array(tmp_prod, 4);
}
}
@@ -13318,7 +14675,6 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
bool src_known = tnum_subreg_is_const(src_reg->var_off);
bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
struct tnum var32_off = tnum_subreg(dst_reg->var_off);
- s32 smin_val = src_reg->s32_min_value;
u32 umax_val = src_reg->u32_max_value;
if (src_known && dst_known) {
@@ -13331,18 +14687,16 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
*/
dst_reg->u32_min_value = var32_off.value;
dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val);
- if (dst_reg->s32_min_value < 0 || smin_val < 0) {
- /* Lose signed bounds when ANDing negative numbers,
- * ain't nobody got time for that.
- */
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
- } else {
- /* ANDing two positives gives a positive, so safe to
- * cast result into s64.
- */
+
+ /* Safe to set s32 bounds by casting u32 result into s32 when u32
+ * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
+ */
+ if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
dst_reg->s32_min_value = dst_reg->u32_min_value;
dst_reg->s32_max_value = dst_reg->u32_max_value;
+ } else {
+ dst_reg->s32_min_value = S32_MIN;
+ dst_reg->s32_max_value = S32_MAX;
}
}
@@ -13351,7 +14705,6 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
{
bool src_known = tnum_is_const(src_reg->var_off);
bool dst_known = tnum_is_const(dst_reg->var_off);
- s64 smin_val = src_reg->smin_value;
u64 umax_val = src_reg->umax_value;
if (src_known && dst_known) {
@@ -13364,18 +14717,16 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
*/
dst_reg->umin_value = dst_reg->var_off.value;
dst_reg->umax_value = min(dst_reg->umax_value, umax_val);
- if (dst_reg->smin_value < 0 || smin_val < 0) {
- /* Lose signed bounds when ANDing negative numbers,
- * ain't nobody got time for that.
- */
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
- } else {
- /* ANDing two positives gives a positive, so safe to
- * cast result into s64.
- */
+
+ /* Safe to set s64 bounds by casting u64 result into s64 when u64
+ * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
+ */
+ if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
dst_reg->smin_value = dst_reg->umin_value;
dst_reg->smax_value = dst_reg->umax_value;
+ } else {
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
}
/* We may learn something more from the var_off */
__update_reg_bounds(dst_reg);
@@ -13387,7 +14738,6 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
bool src_known = tnum_subreg_is_const(src_reg->var_off);
bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
struct tnum var32_off = tnum_subreg(dst_reg->var_off);
- s32 smin_val = src_reg->s32_min_value;
u32 umin_val = src_reg->u32_min_value;
if (src_known && dst_known) {
@@ -13400,18 +14750,16 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
*/
dst_reg->u32_min_value = max(dst_reg->u32_min_value, umin_val);
dst_reg->u32_max_value = var32_off.value | var32_off.mask;
- if (dst_reg->s32_min_value < 0 || smin_val < 0) {
- /* Lose signed bounds when ORing negative numbers,
- * ain't nobody got time for that.
- */
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
- } else {
- /* ORing two positives gives a positive, so safe to
- * cast result into s64.
- */
+
+ /* Safe to set s32 bounds by casting u32 result into s32 when u32
+ * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
+ */
+ if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
dst_reg->s32_min_value = dst_reg->u32_min_value;
dst_reg->s32_max_value = dst_reg->u32_max_value;
+ } else {
+ dst_reg->s32_min_value = S32_MIN;
+ dst_reg->s32_max_value = S32_MAX;
}
}
@@ -13420,7 +14768,6 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg,
{
bool src_known = tnum_is_const(src_reg->var_off);
bool dst_known = tnum_is_const(dst_reg->var_off);
- s64 smin_val = src_reg->smin_value;
u64 umin_val = src_reg->umin_value;
if (src_known && dst_known) {
@@ -13433,18 +14780,16 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg,
*/
dst_reg->umin_value = max(dst_reg->umin_value, umin_val);
dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;
- if (dst_reg->smin_value < 0 || smin_val < 0) {
- /* Lose signed bounds when ORing negative numbers,
- * ain't nobody got time for that.
- */
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
- } else {
- /* ORing two positives gives a positive, so safe to
- * cast result into s64.
- */
+
+ /* Safe to set s64 bounds by casting u64 result into s64 when u64
+ * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
+ */
+ if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
dst_reg->smin_value = dst_reg->umin_value;
dst_reg->smax_value = dst_reg->umax_value;
+ } else {
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
}
/* We may learn something more from the var_off */
__update_reg_bounds(dst_reg);
@@ -13456,7 +14801,6 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg,
bool src_known = tnum_subreg_is_const(src_reg->var_off);
bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
struct tnum var32_off = tnum_subreg(dst_reg->var_off);
- s32 smin_val = src_reg->s32_min_value;
if (src_known && dst_known) {
__mark_reg32_known(dst_reg, var32_off.value);
@@ -13467,10 +14811,10 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg,
dst_reg->u32_min_value = var32_off.value;
dst_reg->u32_max_value = var32_off.value | var32_off.mask;
- if (dst_reg->s32_min_value >= 0 && smin_val >= 0) {
- /* XORing two positive sign numbers gives a positive,
- * so safe to cast u32 result into s32.
- */
+ /* Safe to set s32 bounds by casting u32 result into s32 when u32
+ * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
+ */
+ if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
dst_reg->s32_min_value = dst_reg->u32_min_value;
dst_reg->s32_max_value = dst_reg->u32_max_value;
} else {
@@ -13484,7 +14828,6 @@ static void scalar_min_max_xor(struct bpf_reg_state *dst_reg,
{
bool src_known = tnum_is_const(src_reg->var_off);
bool dst_known = tnum_is_const(dst_reg->var_off);
- s64 smin_val = src_reg->smin_value;
if (src_known && dst_known) {
/* dst_reg->var_off.value has been updated earlier */
@@ -13496,10 +14839,10 @@ static void scalar_min_max_xor(struct bpf_reg_state *dst_reg,
dst_reg->umin_value = dst_reg->var_off.value;
dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;
- if (dst_reg->smin_value >= 0 && smin_val >= 0) {
- /* XORing two positive sign numbers gives a positive,
- * so safe to cast u64 result into s64.
- */
+ /* Safe to set s64 bounds by casting u64 result into s64 when u64
+ * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
+ */
+ if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
dst_reg->smin_value = dst_reg->umin_value;
dst_reg->smax_value = dst_reg->umax_value;
} else {
@@ -13707,6 +15050,46 @@ static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg,
__update_reg_bounds(dst_reg);
}
+static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn,
+ const struct bpf_reg_state *src_reg)
+{
+ bool src_is_const = false;
+ u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
+
+ if (insn_bitness == 32) {
+ if (tnum_subreg_is_const(src_reg->var_off)
+ && src_reg->s32_min_value == src_reg->s32_max_value
+ && src_reg->u32_min_value == src_reg->u32_max_value)
+ src_is_const = true;
+ } else {
+ if (tnum_is_const(src_reg->var_off)
+ && src_reg->smin_value == src_reg->smax_value
+ && src_reg->umin_value == src_reg->umax_value)
+ src_is_const = true;
+ }
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_ADD:
+ case BPF_SUB:
+ case BPF_AND:
+ case BPF_XOR:
+ case BPF_OR:
+ case BPF_MUL:
+ return true;
+
+ /* Shift operators range is only computable if shift dimension operand
+ * is a constant. Shifts greater than 31 or 63 are undefined. This
+ * includes shifts by a negative number.
+ */
+ case BPF_LSH:
+ case BPF_RSH:
+ case BPF_ARSH:
+ return (src_is_const && src_reg->umax_value < insn_bitness);
+ default:
+ return false;
+ }
+}
+
/* WARNING: This function does calculations on 64-bit values, but the actual
* execution may occur on 32-bit values. Therefore, things like bitshifts
* need extra checks in the 32-bit case.
@@ -13716,53 +15099,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
struct bpf_reg_state *dst_reg,
struct bpf_reg_state src_reg)
{
- struct bpf_reg_state *regs = cur_regs(env);
u8 opcode = BPF_OP(insn->code);
- bool src_known;
- s64 smin_val, smax_val;
- u64 umin_val, umax_val;
- s32 s32_min_val, s32_max_val;
- u32 u32_min_val, u32_max_val;
- u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64);
int ret;
- smin_val = src_reg.smin_value;
- smax_val = src_reg.smax_value;
- umin_val = src_reg.umin_value;
- umax_val = src_reg.umax_value;
-
- s32_min_val = src_reg.s32_min_value;
- s32_max_val = src_reg.s32_max_value;
- u32_min_val = src_reg.u32_min_value;
- u32_max_val = src_reg.u32_max_value;
-
- if (alu32) {
- src_known = tnum_subreg_is_const(src_reg.var_off);
- if ((src_known &&
- (s32_min_val != s32_max_val || u32_min_val != u32_max_val)) ||
- s32_min_val > s32_max_val || u32_min_val > u32_max_val) {
- /* Taint dst register if offset had invalid bounds
- * derived from e.g. dead branches.
- */
- __mark_reg_unknown(env, dst_reg);
- return 0;
- }
- } else {
- src_known = tnum_is_const(src_reg.var_off);
- if ((src_known &&
- (smin_val != smax_val || umin_val != umax_val)) ||
- smin_val > smax_val || umin_val > umax_val) {
- /* Taint dst register if offset had invalid bounds
- * derived from e.g. dead branches.
- */
- __mark_reg_unknown(env, dst_reg);
- return 0;
- }
- }
-
- if (!src_known &&
- opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) {
+ if (!is_safe_to_compute_dst_reg_range(insn, &src_reg)) {
__mark_reg_unknown(env, dst_reg);
return 0;
}
@@ -13819,46 +15160,24 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
scalar_min_max_xor(dst_reg, &src_reg);
break;
case BPF_LSH:
- if (umax_val >= insn_bitness) {
- /* Shifts greater than 31 or 63 are undefined.
- * This includes shifts by a negative number.
- */
- mark_reg_unknown(env, regs, insn->dst_reg);
- break;
- }
if (alu32)
scalar32_min_max_lsh(dst_reg, &src_reg);
else
scalar_min_max_lsh(dst_reg, &src_reg);
break;
case BPF_RSH:
- if (umax_val >= insn_bitness) {
- /* Shifts greater than 31 or 63 are undefined.
- * This includes shifts by a negative number.
- */
- mark_reg_unknown(env, regs, insn->dst_reg);
- break;
- }
if (alu32)
scalar32_min_max_rsh(dst_reg, &src_reg);
else
scalar_min_max_rsh(dst_reg, &src_reg);
break;
case BPF_ARSH:
- if (umax_val >= insn_bitness) {
- /* Shifts greater than 31 or 63 are undefined.
- * This includes shifts by a negative number.
- */
- mark_reg_unknown(env, regs, insn->dst_reg);
- break;
- }
if (alu32)
scalar32_min_max_arsh(dst_reg, &src_reg);
else
scalar_min_max_arsh(dst_reg, &src_reg);
break;
default:
- mark_reg_unknown(env, regs, insn->dst_reg);
break;
}
@@ -13879,6 +15198,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg;
struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
+ bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64);
u8 opcode = BPF_OP(insn->code);
int err;
@@ -13901,11 +15221,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
if (dst_reg->type != SCALAR_VALUE)
ptr_reg = dst_reg;
- else
- /* Make sure ID is cleared otherwise dst_reg min/max could be
- * incorrectly propagated into other registers by find_equal_scalars()
- */
- dst_reg->id = 0;
+
if (BPF_SRC(insn->code) == BPF_X) {
src_reg = &regs[insn->src_reg];
if (src_reg->type != SCALAR_VALUE) {
@@ -13960,16 +15276,53 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
/* Got here implies adding two SCALAR_VALUEs */
if (WARN_ON_ONCE(ptr_reg)) {
- print_verifier_state(env, state, true);
+ print_verifier_state(env, vstate, vstate->curframe, true);
verbose(env, "verifier internal error: unexpected ptr_reg\n");
return -EINVAL;
}
if (WARN_ON(!src_reg)) {
- print_verifier_state(env, state, true);
+ print_verifier_state(env, vstate, vstate->curframe, true);
verbose(env, "verifier internal error: no src_reg\n");
return -EINVAL;
}
- return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
+ err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
+ if (err)
+ return err;
+ /*
+ * Compilers can generate the code
+ * r1 = r2
+ * r1 += 0x1
+ * if r2 < 1000 goto ...
+ * use r1 in memory access
+ * So for 64-bit alu remember constant delta between r2 and r1 and
+ * update r1 after 'if' condition.
+ */
+ if (env->bpf_capable &&
+ BPF_OP(insn->code) == BPF_ADD && !alu32 &&
+ dst_reg->id && is_reg_const(src_reg, false)) {
+ u64 val = reg_const_value(src_reg, false);
+
+ if ((dst_reg->id & BPF_ADD_CONST) ||
+ /* prevent overflow in sync_linked_regs() later */
+ val > (u32)S32_MAX) {
+ /*
+ * If the register already went through rX += val
+ * we cannot accumulate another val into rx->off.
+ */
+ dst_reg->off = 0;
+ dst_reg->id = 0;
+ } else {
+ dst_reg->id |= BPF_ADD_CONST;
+ dst_reg->off = val;
+ }
+ } else {
+ /*
+ * Make sure ID is cleared otherwise dst_reg min/max could be
+ * incorrectly propagated into other registers by sync_linked_regs()
+ */
+ dst_reg->id = 0;
+ }
+ return 0;
}
/* check validity of 32-bit and 64-bit arithmetic operations */
@@ -14115,7 +15468,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
copy_register_state(dst_reg, src_reg);
/* Make sure ID is cleared if src_reg is not in u32
* range otherwise dst_reg min/max could be incorrectly
- * propagated into src_reg by find_equal_scalars()
+ * propagated into src_reg by sync_linked_regs()
*/
if (!is_src_reg_u32)
dst_reg->id = 0;
@@ -14564,7 +15917,19 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state
struct tnum t;
u64 val;
-again:
+ /* In case of GE/GT/SGE/JST, reuse LE/LT/SLE/SLT logic from below */
+ switch (opcode) {
+ case BPF_JGE:
+ case BPF_JGT:
+ case BPF_JSGE:
+ case BPF_JSGT:
+ opcode = flip_opcode(opcode);
+ swap(reg1, reg2);
+ break;
+ default:
+ break;
+ }
+
switch (opcode) {
case BPF_JEQ:
if (is_jmp32) {
@@ -14707,14 +16072,6 @@ again:
reg2->smin_value = max(reg1->smin_value + 1, reg2->smin_value);
}
break;
- case BPF_JGE:
- case BPF_JGT:
- case BPF_JSGE:
- case BPF_JSGT:
- /* just reuse LE/LT logic above */
- opcode = flip_opcode(opcode);
- swap(reg1, reg2);
- goto again;
default:
return;
}
@@ -14722,7 +16079,7 @@ again:
/* Adjusts the register min/max values in the case that the dst_reg and
* src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K
- * check, in which case we havea fake SCALAR_VALUE representing insn->imm).
+ * check, in which case we have a fake SCALAR_VALUE representing insn->imm).
* Technically we can do similar adjustments for pointers to the same object,
* but we don't support that right now.
*/
@@ -14822,7 +16179,7 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
* No one could have freed the reference state before
* doing the NULL check.
*/
- WARN_ON_ONCE(release_reference_state(state, id));
+ WARN_ON_ONCE(release_reference_nomark(vstate, id));
bpf_for_each_reg_in_vstate(vstate, state, reg, ({
mark_ptr_or_null_reg(state, reg, id, is_null);
@@ -14934,16 +16291,97 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
return true;
}
-static void find_equal_scalars(struct bpf_verifier_state *vstate,
- struct bpf_reg_state *known_reg)
+static void __collect_linked_regs(struct linked_regs *reg_set, struct bpf_reg_state *reg,
+ u32 id, u32 frameno, u32 spi_or_reg, bool is_reg)
{
- struct bpf_func_state *state;
+ struct linked_reg *e;
+
+ if (reg->type != SCALAR_VALUE || (reg->id & ~BPF_ADD_CONST) != id)
+ return;
+
+ e = linked_regs_push(reg_set);
+ if (e) {
+ e->frameno = frameno;
+ e->is_reg = is_reg;
+ e->regno = spi_or_reg;
+ } else {
+ reg->id = 0;
+ }
+}
+
+/* For all R being scalar registers or spilled scalar registers
+ * in verifier state, save R in linked_regs if R->id == id.
+ * If there are too many Rs sharing same id, reset id for leftover Rs.
+ */
+static void collect_linked_regs(struct bpf_verifier_state *vstate, u32 id,
+ struct linked_regs *linked_regs)
+{
+ struct bpf_func_state *func;
struct bpf_reg_state *reg;
+ int i, j;
+
+ id = id & ~BPF_ADD_CONST;
+ for (i = vstate->curframe; i >= 0; i--) {
+ func = vstate->frame[i];
+ for (j = 0; j < BPF_REG_FP; j++) {
+ reg = &func->regs[j];
+ __collect_linked_regs(linked_regs, reg, id, i, j, true);
+ }
+ for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
+ if (!is_spilled_reg(&func->stack[j]))
+ continue;
+ reg = &func->stack[j].spilled_ptr;
+ __collect_linked_regs(linked_regs, reg, id, i, j, false);
+ }
+ }
+}
+
+/* For all R in linked_regs, copy known_reg range into R
+ * if R->id == known_reg->id.
+ */
+static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_state *known_reg,
+ struct linked_regs *linked_regs)
+{
+ struct bpf_reg_state fake_reg;
+ struct bpf_reg_state *reg;
+ struct linked_reg *e;
+ int i;
+
+ for (i = 0; i < linked_regs->cnt; ++i) {
+ e = &linked_regs->entries[i];
+ reg = e->is_reg ? &vstate->frame[e->frameno]->regs[e->regno]
+ : &vstate->frame[e->frameno]->stack[e->spi].spilled_ptr;
+ if (reg->type != SCALAR_VALUE || reg == known_reg)
+ continue;
+ if ((reg->id & ~BPF_ADD_CONST) != (known_reg->id & ~BPF_ADD_CONST))
+ continue;
+ if ((!(reg->id & BPF_ADD_CONST) && !(known_reg->id & BPF_ADD_CONST)) ||
+ reg->off == known_reg->off) {
+ s32 saved_subreg_def = reg->subreg_def;
- bpf_for_each_reg_in_vstate(vstate, state, reg, ({
- if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
copy_register_state(reg, known_reg);
- }));
+ reg->subreg_def = saved_subreg_def;
+ } else {
+ s32 saved_subreg_def = reg->subreg_def;
+ s32 saved_off = reg->off;
+
+ fake_reg.type = SCALAR_VALUE;
+ __mark_reg_known(&fake_reg, (s32)reg->off - (s32)known_reg->off);
+
+ /* reg = known_reg; reg += delta */
+ copy_register_state(reg, known_reg);
+ /*
+ * Must preserve off, id and add_const flag,
+ * otherwise another sync_linked_regs() will be incorrect.
+ */
+ reg->off = saved_off;
+ reg->subreg_def = saved_subreg_def;
+
+ scalar32_min_max_add(reg, &fake_reg);
+ scalar_min_max_add(reg, &fake_reg);
+ reg->var_off = tnum_add(reg->var_off, fake_reg.var_off);
+ }
+ }
}
static int check_cond_jmp_op(struct bpf_verifier_env *env,
@@ -14954,8 +16392,9 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;
struct bpf_reg_state *eq_branch_regs;
- struct bpf_reg_state fake_reg = {};
+ struct linked_regs linked_regs = {};
u8 opcode = BPF_OP(insn->code);
+ int insn_flags = 0;
bool is_jmp32;
int pred = -1;
int err;
@@ -14972,9 +16411,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
if (insn->code != (BPF_JMP | BPF_JCOND) ||
insn->src_reg != BPF_MAY_GOTO ||
- insn->dst_reg || insn->imm || insn->off == 0) {
- verbose(env, "invalid may_goto off %d imm %d\n",
- insn->off, insn->imm);
+ insn->dst_reg || insn->imm) {
+ verbose(env, "invalid may_goto imm %d\n", insn->imm);
return -EINVAL;
}
prev_st = find_prev_entry(env, cur_st->parent, idx);
@@ -15015,14 +16453,29 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
insn->src_reg);
return -EACCES;
}
+
+ if (src_reg->type == PTR_TO_STACK)
+ insn_flags |= INSN_F_SRC_REG_STACK;
+ if (dst_reg->type == PTR_TO_STACK)
+ insn_flags |= INSN_F_DST_REG_STACK;
} else {
if (insn->src_reg != BPF_REG_0) {
verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
return -EINVAL;
}
- src_reg = &fake_reg;
+ src_reg = &env->fake_reg[0];
+ memset(src_reg, 0, sizeof(*src_reg));
src_reg->type = SCALAR_VALUE;
__mark_reg_known(src_reg, insn->imm);
+
+ if (dst_reg->type == PTR_TO_STACK)
+ insn_flags |= INSN_F_DST_REG_STACK;
+ }
+
+ if (insn_flags) {
+ err = push_insn_history(env, this_branch, insn_flags, 0);
+ if (err)
+ return err;
}
is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
@@ -15050,7 +16503,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
*insn_idx))
return -EFAULT;
if (env->log.level & BPF_LOG_LEVEL)
- print_insn_state(env, this_branch->frame[this_branch->curframe]);
+ print_insn_state(env, this_branch, this_branch->curframe);
*insn_idx += insn->off;
return 0;
} else if (pred == 0) {
@@ -15064,10 +16517,25 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
*insn_idx))
return -EFAULT;
if (env->log.level & BPF_LOG_LEVEL)
- print_insn_state(env, this_branch->frame[this_branch->curframe]);
+ print_insn_state(env, this_branch, this_branch->curframe);
return 0;
}
+ /* Push scalar registers sharing same ID to jump history,
+ * do this before creating 'other_branch', so that both
+ * 'this_branch' and 'other_branch' share this history
+ * if parent state is created.
+ */
+ if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id)
+ collect_linked_regs(this_branch, src_reg->id, &linked_regs);
+ if (dst_reg->type == SCALAR_VALUE && dst_reg->id)
+ collect_linked_regs(this_branch, dst_reg->id, &linked_regs);
+ if (linked_regs.cnt > 1) {
+ err = push_insn_history(env, this_branch, 0, linked_regs_pack(&linked_regs));
+ if (err)
+ return err;
+ }
+
other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx,
false);
if (!other_branch)
@@ -15080,10 +16548,16 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
&other_branch_regs[insn->src_reg],
dst_reg, src_reg, opcode, is_jmp32);
} else /* BPF_SRC(insn->code) == BPF_K */ {
+ /* reg_set_min_max() can mangle the fake_reg. Make a copy
+ * so that these are two different memory locations. The
+ * src_reg is not used beyond here in context of K.
+ */
+ memcpy(&env->fake_reg[1], &env->fake_reg[0],
+ sizeof(env->fake_reg[0]));
err = reg_set_min_max(env,
&other_branch_regs[insn->dst_reg],
- src_reg /* fake one */,
- dst_reg, src_reg /* same fake one */,
+ &env->fake_reg[0],
+ dst_reg, &env->fake_reg[1],
opcode, is_jmp32);
}
if (err)
@@ -15092,13 +16566,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
if (BPF_SRC(insn->code) == BPF_X &&
src_reg->type == SCALAR_VALUE && src_reg->id &&
!WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) {
- find_equal_scalars(this_branch, src_reg);
- find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]);
+ sync_linked_regs(this_branch, src_reg, &linked_regs);
+ sync_linked_regs(other_branch, &other_branch_regs[insn->src_reg], &linked_regs);
}
if (dst_reg->type == SCALAR_VALUE && dst_reg->id &&
!WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) {
- find_equal_scalars(this_branch, dst_reg);
- find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]);
+ sync_linked_regs(this_branch, dst_reg, &linked_regs);
+ sync_linked_regs(other_branch, &other_branch_regs[insn->dst_reg], &linked_regs);
}
/* if one pointer register is compared to another pointer
@@ -15160,7 +16634,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return -EACCES;
}
if (env->log.level & BPF_LOG_LEVEL)
- print_insn_state(env, this_branch->frame[this_branch->curframe]);
+ print_insn_state(env, this_branch, this_branch->curframe);
return 0;
}
@@ -15321,21 +16795,9 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
* gen_ld_abs() may terminate the program at runtime, leading to
* reference leak.
*/
- err = check_reference_leak(env, false);
- if (err) {
- verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n");
+ err = check_resource_leak(env, false, true, "BPF_LD_[ABS|IND]");
+ if (err)
return err;
- }
-
- if (env->cur_state->active_lock.ptr) {
- verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n");
- return -EINVAL;
- }
-
- if (env->cur_state->active_rcu_lock) {
- verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_rcu_read_lock-ed region\n");
- return -EINVAL;
- }
if (regs[ctx_reg].type != PTR_TO_CTX) {
verbose(env,
@@ -15375,12 +16837,14 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
const char *exit_ctx = "At program exit";
struct tnum enforce_attach_type_range = tnum_unknown;
const struct bpf_prog *prog = env->prog;
- struct bpf_reg_state *reg;
+ struct bpf_reg_state *reg = reg_state(env, regno);
struct bpf_retval_range range = retval_range(0, 1);
enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
int err;
struct bpf_func_state *frame = env->cur_state->frame[0];
const bool is_subprog = frame->subprogno;
+ bool return_32bit = false;
+ const struct btf_type *reg_type, *ret_type = NULL;
/* LSM and struct_ops func-ptr's return type could be "void" */
if (!is_subprog || frame->in_exception_callback_fn) {
@@ -15389,10 +16853,26 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
if (prog->expected_attach_type == BPF_LSM_CGROUP)
/* See below, can be 0 or 0-1 depending on hook. */
break;
- fallthrough;
+ if (!prog->aux->attach_func_proto->type)
+ return 0;
+ break;
case BPF_PROG_TYPE_STRUCT_OPS:
if (!prog->aux->attach_func_proto->type)
return 0;
+
+ if (frame->in_exception_callback_fn)
+ break;
+
+ /* Allow a struct_ops program to return a referenced kptr if it
+ * matches the operator's return type and is in its unmodified
+ * form. A scalar zero (i.e., a null pointer) is also allowed.
+ */
+ reg_type = reg->btf ? btf_type_by_id(reg->btf, reg->btf_id) : NULL;
+ ret_type = btf_type_resolve_ptr(prog->aux->attach_btf,
+ prog->aux->attach_func_proto->type,
+ NULL);
+ if (ret_type && ret_type == reg_type && reg->ref_obj_id)
+ return __check_ptr_off_reg(env, reg, regno, false);
break;
default:
break;
@@ -15414,8 +16894,6 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
return -EACCES;
}
- reg = cur_regs(env) + regno;
-
if (frame->in_async_callback_fn) {
/* enforce return zero from async callbacks like timer */
exit_ctx = "At async callback return";
@@ -15480,18 +16958,30 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
return -ENOTSUPP;
}
break;
+ case BPF_PROG_TYPE_KPROBE:
+ switch (env->prog->expected_attach_type) {
+ case BPF_TRACE_KPROBE_SESSION:
+ case BPF_TRACE_UPROBE_SESSION:
+ range = retval_range(0, 1);
+ break;
+ default:
+ return 0;
+ }
+ break;
case BPF_PROG_TYPE_SK_LOOKUP:
range = retval_range(SK_DROP, SK_PASS);
break;
case BPF_PROG_TYPE_LSM:
if (env->prog->expected_attach_type != BPF_LSM_CGROUP) {
- /* Regular BPF_PROG_TYPE_LSM programs can return
- * any value.
- */
- return 0;
- }
- if (!env->prog->aux->attach_func_proto->type) {
+ /* no range found, any return value is allowed */
+ if (!get_func_retval_range(env->prog, &range))
+ return 0;
+ /* no restricted range, any return value is allowed */
+ if (range.minval == S32_MIN && range.maxval == S32_MAX)
+ return 0;
+ return_32bit = true;
+ } else if (!env->prog->aux->attach_func_proto->type) {
/* Make sure programs that attach to void
* hooks don't try to modify return value.
*/
@@ -15502,6 +16992,11 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
case BPF_PROG_TYPE_NETFILTER:
range = retval_range(NF_DROP, NF_ACCEPT);
break;
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ if (!ret_type)
+ return 0;
+ range = retval_range(0, 0);
+ break;
case BPF_PROG_TYPE_EXT:
/* freplace program can return anything as its return value
* depends on the to-be-replaced kernel func or bpf program.
@@ -15521,7 +17016,7 @@ enforce_retval:
if (err)
return err;
- if (!retval_range_within(range, reg)) {
+ if (!retval_range_within(range, reg, return_32bit)) {
verbose_invalid_scalar(env, reg, range, exit_ctx, reg_name);
if (!is_subprog &&
prog->expected_attach_type == BPF_LSM_CGROUP &&
@@ -15537,6 +17032,38 @@ enforce_retval:
return 0;
}
+static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off)
+{
+ struct bpf_subprog_info *subprog;
+
+ subprog = find_containing_subprog(env, off);
+ subprog->changes_pkt_data = true;
+}
+
+static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off)
+{
+ struct bpf_subprog_info *subprog;
+
+ subprog = find_containing_subprog(env, off);
+ subprog->might_sleep = true;
+}
+
+/* 't' is an index of a call-site.
+ * 'w' is a callee entry point.
+ * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED.
+ * Rely on DFS traversal order and absence of recursive calls to guarantee that
+ * callee's change_pkt_data marks would be correct at that moment.
+ */
+static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w)
+{
+ struct bpf_subprog_info *caller, *callee;
+
+ caller = find_containing_subprog(env, t);
+ callee = find_containing_subprog(env, w);
+ caller->changes_pkt_data |= callee->changes_pkt_data;
+ caller->might_sleep |= callee->might_sleep;
+}
+
/* non-recursive DFS pseudo code
* 1 procedure DFS-iterative(G,v):
* 2 label v as discovered
@@ -15670,6 +17197,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
bool visit_callee)
{
int ret, insn_sz;
+ int w;
insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1;
ret = push_insn(t, t + insn_sz, FALLTHROUGH, env);
@@ -15681,12 +17209,272 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
mark_jmp_point(env, t + insn_sz);
if (visit_callee) {
+ w = t + insns[t].imm + 1;
mark_prune_point(env, t);
- ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env);
+ merge_callee_effects(env, t, w);
+ ret = push_insn(t, w, BRANCH, env);
}
return ret;
}
+/* Bitmask with 1s for all caller saved registers */
+#define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1)
+
+/* True if do_misc_fixups() replaces calls to helper number 'imm',
+ * replacement patch is presumed to follow bpf_fastcall contract
+ * (see mark_fastcall_pattern_for_call() below).
+ */
+static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm)
+{
+ switch (imm) {
+#ifdef CONFIG_X86_64
+ case BPF_FUNC_get_smp_processor_id:
+ return env->prog->jit_requested && bpf_jit_supports_percpu_insn();
+#endif
+ default:
+ return false;
+ }
+}
+
+struct call_summary {
+ u8 num_params;
+ bool is_void;
+ bool fastcall;
+};
+
+/* If @call is a kfunc or helper call, fills @cs and returns true,
+ * otherwise returns false.
+ */
+static bool get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call,
+ struct call_summary *cs)
+{
+ struct bpf_kfunc_call_arg_meta meta;
+ const struct bpf_func_proto *fn;
+ int i;
+
+ if (bpf_helper_call(call)) {
+
+ if (get_helper_proto(env, call->imm, &fn) < 0)
+ /* error would be reported later */
+ return false;
+ cs->fastcall = fn->allow_fastcall &&
+ (verifier_inlines_helper_call(env, call->imm) ||
+ bpf_jit_inlines_helper_call(call->imm));
+ cs->is_void = fn->ret_type == RET_VOID;
+ cs->num_params = 0;
+ for (i = 0; i < ARRAY_SIZE(fn->arg_type); ++i) {
+ if (fn->arg_type[i] == ARG_DONTCARE)
+ break;
+ cs->num_params++;
+ }
+ return true;
+ }
+
+ if (bpf_pseudo_kfunc_call(call)) {
+ int err;
+
+ err = fetch_kfunc_meta(env, call, &meta, NULL);
+ if (err < 0)
+ /* error would be reported later */
+ return false;
+ cs->num_params = btf_type_vlen(meta.func_proto);
+ cs->fastcall = meta.kfunc_flags & KF_FASTCALL;
+ cs->is_void = btf_type_is_void(btf_type_by_id(meta.btf, meta.func_proto->type));
+ return true;
+ }
+
+ return false;
+}
+
+/* LLVM define a bpf_fastcall function attribute.
+ * This attribute means that function scratches only some of
+ * the caller saved registers defined by ABI.
+ * For BPF the set of such registers could be defined as follows:
+ * - R0 is scratched only if function is non-void;
+ * - R1-R5 are scratched only if corresponding parameter type is defined
+ * in the function prototype.
+ *
+ * The contract between kernel and clang allows to simultaneously use
+ * such functions and maintain backwards compatibility with old
+ * kernels that don't understand bpf_fastcall calls:
+ *
+ * - for bpf_fastcall calls clang allocates registers as-if relevant r0-r5
+ * registers are not scratched by the call;
+ *
+ * - as a post-processing step, clang visits each bpf_fastcall call and adds
+ * spill/fill for every live r0-r5;
+ *
+ * - stack offsets used for the spill/fill are allocated as lowest
+ * stack offsets in whole function and are not used for any other
+ * purposes;
+ *
+ * - when kernel loads a program, it looks for such patterns
+ * (bpf_fastcall function surrounded by spills/fills) and checks if
+ * spill/fill stack offsets are used exclusively in fastcall patterns;
+ *
+ * - if so, and if verifier or current JIT inlines the call to the
+ * bpf_fastcall function (e.g. a helper call), kernel removes unnecessary
+ * spill/fill pairs;
+ *
+ * - when old kernel loads a program, presence of spill/fill pairs
+ * keeps BPF program valid, albeit slightly less efficient.
+ *
+ * For example:
+ *
+ * r1 = 1;
+ * r2 = 2;
+ * *(u64 *)(r10 - 8) = r1; r1 = 1;
+ * *(u64 *)(r10 - 16) = r2; r2 = 2;
+ * call %[to_be_inlined] --> call %[to_be_inlined]
+ * r2 = *(u64 *)(r10 - 16); r0 = r1;
+ * r1 = *(u64 *)(r10 - 8); r0 += r2;
+ * r0 = r1; exit;
+ * r0 += r2;
+ * exit;
+ *
+ * The purpose of mark_fastcall_pattern_for_call is to:
+ * - look for such patterns;
+ * - mark spill and fill instructions in env->insn_aux_data[*].fastcall_pattern;
+ * - mark set env->insn_aux_data[*].fastcall_spills_num for call instruction;
+ * - update env->subprog_info[*]->fastcall_stack_off to find an offset
+ * at which bpf_fastcall spill/fill stack slots start;
+ * - update env->subprog_info[*]->keep_fastcall_stack.
+ *
+ * The .fastcall_pattern and .fastcall_stack_off are used by
+ * check_fastcall_stack_contract() to check if every stack access to
+ * fastcall spill/fill stack slot originates from spill/fill
+ * instructions, members of fastcall patterns.
+ *
+ * If such condition holds true for a subprogram, fastcall patterns could
+ * be rewritten by remove_fastcall_spills_fills().
+ * Otherwise bpf_fastcall patterns are not changed in the subprogram
+ * (code, presumably, generated by an older clang version).
+ *
+ * For example, it is *not* safe to remove spill/fill below:
+ *
+ * r1 = 1;
+ * *(u64 *)(r10 - 8) = r1; r1 = 1;
+ * call %[to_be_inlined] --> call %[to_be_inlined]
+ * r1 = *(u64 *)(r10 - 8); r0 = *(u64 *)(r10 - 8); <---- wrong !!!
+ * r0 = *(u64 *)(r10 - 8); r0 += r1;
+ * r0 += r1; exit;
+ * exit;
+ */
+static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env,
+ struct bpf_subprog_info *subprog,
+ int insn_idx, s16 lowest_off)
+{
+ struct bpf_insn *insns = env->prog->insnsi, *stx, *ldx;
+ struct bpf_insn *call = &env->prog->insnsi[insn_idx];
+ u32 clobbered_regs_mask;
+ struct call_summary cs;
+ u32 expected_regs_mask;
+ s16 off;
+ int i;
+
+ if (!get_call_summary(env, call, &cs))
+ return;
+
+ /* A bitmask specifying which caller saved registers are clobbered
+ * by a call to a helper/kfunc *as if* this helper/kfunc follows
+ * bpf_fastcall contract:
+ * - includes R0 if function is non-void;
+ * - includes R1-R5 if corresponding parameter has is described
+ * in the function prototype.
+ */
+ clobbered_regs_mask = GENMASK(cs.num_params, cs.is_void ? 1 : 0);
+ /* e.g. if helper call clobbers r{0,1}, expect r{2,3,4,5} in the pattern */
+ expected_regs_mask = ~clobbered_regs_mask & ALL_CALLER_SAVED_REGS;
+
+ /* match pairs of form:
+ *
+ * *(u64 *)(r10 - Y) = rX (where Y % 8 == 0)
+ * ...
+ * call %[to_be_inlined]
+ * ...
+ * rX = *(u64 *)(r10 - Y)
+ */
+ for (i = 1, off = lowest_off; i <= ARRAY_SIZE(caller_saved); ++i, off += BPF_REG_SIZE) {
+ if (insn_idx - i < 0 || insn_idx + i >= env->prog->len)
+ break;
+ stx = &insns[insn_idx - i];
+ ldx = &insns[insn_idx + i];
+ /* must be a stack spill/fill pair */
+ if (stx->code != (BPF_STX | BPF_MEM | BPF_DW) ||
+ ldx->code != (BPF_LDX | BPF_MEM | BPF_DW) ||
+ stx->dst_reg != BPF_REG_10 ||
+ ldx->src_reg != BPF_REG_10)
+ break;
+ /* must be a spill/fill for the same reg */
+ if (stx->src_reg != ldx->dst_reg)
+ break;
+ /* must be one of the previously unseen registers */
+ if ((BIT(stx->src_reg) & expected_regs_mask) == 0)
+ break;
+ /* must be a spill/fill for the same expected offset,
+ * no need to check offset alignment, BPF_DW stack access
+ * is always 8-byte aligned.
+ */
+ if (stx->off != off || ldx->off != off)
+ break;
+ expected_regs_mask &= ~BIT(stx->src_reg);
+ env->insn_aux_data[insn_idx - i].fastcall_pattern = 1;
+ env->insn_aux_data[insn_idx + i].fastcall_pattern = 1;
+ }
+ if (i == 1)
+ return;
+
+ /* Conditionally set 'fastcall_spills_num' to allow forward
+ * compatibility when more helper functions are marked as
+ * bpf_fastcall at compile time than current kernel supports, e.g:
+ *
+ * 1: *(u64 *)(r10 - 8) = r1
+ * 2: call A ;; assume A is bpf_fastcall for current kernel
+ * 3: r1 = *(u64 *)(r10 - 8)
+ * 4: *(u64 *)(r10 - 8) = r1
+ * 5: call B ;; assume B is not bpf_fastcall for current kernel
+ * 6: r1 = *(u64 *)(r10 - 8)
+ *
+ * There is no need to block bpf_fastcall rewrite for such program.
+ * Set 'fastcall_pattern' for both calls to keep check_fastcall_stack_contract() happy,
+ * don't set 'fastcall_spills_num' for call B so that remove_fastcall_spills_fills()
+ * does not remove spill/fill pair {4,6}.
+ */
+ if (cs.fastcall)
+ env->insn_aux_data[insn_idx].fastcall_spills_num = i - 1;
+ else
+ subprog->keep_fastcall_stack = 1;
+ subprog->fastcall_stack_off = min(subprog->fastcall_stack_off, off);
+}
+
+static int mark_fastcall_patterns(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *subprog = env->subprog_info;
+ struct bpf_insn *insn;
+ s16 lowest_off;
+ int s, i;
+
+ for (s = 0; s < env->subprog_cnt; ++s, ++subprog) {
+ /* find lowest stack spill offset used in this subprog */
+ lowest_off = 0;
+ for (i = subprog->start; i < (subprog + 1)->start; ++i) {
+ insn = env->prog->insnsi + i;
+ if (insn->code != (BPF_STX | BPF_MEM | BPF_DW) ||
+ insn->dst_reg != BPF_REG_10)
+ continue;
+ lowest_off = min(lowest_off, insn->off);
+ }
+ /* use this offset to find fastcall patterns */
+ for (i = subprog->start; i < (subprog + 1)->start; ++i) {
+ insn = env->prog->insnsi + i;
+ if (insn->code != (BPF_JMP | BPF_CALL))
+ continue;
+ mark_fastcall_pattern_for_call(env, subprog, i, lowest_off);
+ }
+ }
+ return 0;
+}
+
/* Visits the instruction at index t and returns one of the following:
* < 0 - an error occurred
* DONE_EXPLORING - the instruction was fully explored
@@ -15734,7 +17522,20 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
mark_prune_point(env, t);
mark_jmp_point(env, t);
}
- if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ if (bpf_helper_call(insn)) {
+ const struct bpf_func_proto *fp;
+
+ ret = get_helper_proto(env, insn->imm, &fp);
+ /* If called in a non-sleepable context program will be
+ * rejected anyway, so we should end up with precise
+ * sleepable marks on subprogs, except for dead code
+ * elimination.
+ */
+ if (ret == 0 && fp->might_sleep)
+ mark_subprog_might_sleep(env, t);
+ if (bpf_helper_changes_pkt_data(insn->imm))
+ mark_subprog_changes_pkt_data(env, t);
+ } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
struct bpf_kfunc_call_arg_meta meta;
ret = fetch_kfunc_meta(env, insn, &meta, NULL);
@@ -15753,6 +17554,13 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
*/
mark_force_checkpoint(env, t);
}
+ /* Same as helpers, if called in a non-sleepable context
+ * program will be rejected anyway, so we should end up
+ * with precise sleepable marks on subprogs, except for
+ * dead code elimination.
+ */
+ if (ret == 0 && is_kfunc_sleepable(&meta))
+ mark_subprog_might_sleep(env, t);
}
return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
@@ -15795,9 +17603,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
static int check_cfg(struct bpf_verifier_env *env)
{
int insn_cnt = env->prog->len;
- int *insn_stack, *insn_state;
+ int *insn_stack, *insn_state, *insn_postorder;
int ex_insn_beg, i, ret = 0;
- bool ex_done = false;
insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
if (!insn_state)
@@ -15809,6 +17616,17 @@ static int check_cfg(struct bpf_verifier_env *env)
return -ENOMEM;
}
+ insn_postorder = env->cfg.insn_postorder = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+ if (!insn_postorder) {
+ kvfree(insn_state);
+ kvfree(insn_stack);
+ return -ENOMEM;
+ }
+
+ ex_insn_beg = env->exception_callback_subprog
+ ? env->subprog_info[env->exception_callback_subprog].start
+ : 0;
+
insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
insn_stack[0] = 0; /* 0 is the first instruction */
env->cfg.cur_stack = 1;
@@ -15822,6 +17640,7 @@ walk_cfg:
case DONE_EXPLORING:
insn_state[t] = EXPLORED;
env->cfg.cur_stack--;
+ insn_postorder[env->cfg.cur_postorder++] = t;
break;
case KEEP_EXPLORING:
break;
@@ -15840,13 +17659,10 @@ walk_cfg:
goto err_free;
}
- if (env->exception_callback_subprog && !ex_done) {
- ex_insn_beg = env->subprog_info[env->exception_callback_subprog].start;
-
+ if (ex_insn_beg && insn_state[ex_insn_beg] != EXPLORED) {
insn_state[ex_insn_beg] = DISCOVERED;
insn_stack[0] = ex_insn_beg;
env->cfg.cur_stack = 1;
- ex_done = true;
goto walk_cfg;
}
@@ -15868,6 +17684,8 @@ walk_cfg:
}
}
ret = 0; /* cfg looks good */
+ env->prog->aux->changes_pkt_data = env->subprog_info[0].changes_pkt_data;
+ env->prog->aux->might_sleep = env->subprog_info[0].might_sleep;
err_free:
kvfree(insn_state);
@@ -16484,18 +18302,22 @@ static void clean_verifier_state(struct bpf_verifier_env *env,
static void clean_live_states(struct bpf_verifier_env *env, int insn,
struct bpf_verifier_state *cur)
{
+ struct bpf_verifier_state *loop_entry;
struct bpf_verifier_state_list *sl;
+ struct list_head *pos, *head;
- sl = *explored_state(env, insn);
- while (sl) {
+ head = explored_state(env, insn);
+ list_for_each(pos, head) {
+ sl = container_of(pos, struct bpf_verifier_state_list, node);
if (sl->state.branches)
- goto next;
+ continue;
+ loop_entry = get_loop_entry(env, &sl->state);
+ if (!IS_ERR_OR_NULL(loop_entry) && loop_entry->branches)
+ continue;
if (sl->state.insn_idx != insn ||
!same_callsites(&sl->state, cur))
- goto next;
+ continue;
clean_verifier_state(env, &sl->state);
-next:
- sl = sl->next;
}
}
@@ -16566,6 +18388,10 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
}
if (!rold->precise && exact == NOT_EXACT)
return true;
+ if ((rold->id & BPF_ADD_CONST) != (rcur->id & BPF_ADD_CONST))
+ return false;
+ if ((rold->id & BPF_ADD_CONST) && (rold->off != rcur->off))
+ return false;
/* Why check_ids() for scalar registers?
*
* Consider the following BPF code:
@@ -16578,7 +18404,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
*
* First verification path is [1-6]:
* - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7;
- * - at (5) r6 would be marked <= X, find_equal_scalars() would also mark
+ * - at (5) r6 would be marked <= X, sync_linked_regs() would also mark
* r7 <= X, because r6 and r7 share same id.
* Next verification path is [1-4, 6].
*
@@ -16692,8 +18518,9 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
spi = i / BPF_REG_SIZE;
if (exact != NOT_EXACT &&
- old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
- cur->stack[spi].slot_type[i % BPF_REG_SIZE])
+ (i >= cur->allocated_stack ||
+ old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
+ cur->stack[spi].slot_type[i % BPF_REG_SIZE]))
return false;
if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)
@@ -16788,6 +18615,13 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
return false;
break;
+ case STACK_IRQ_FLAG:
+ old_reg = &old->stack[spi].spilled_ptr;
+ cur_reg = &cur->stack[spi].spilled_ptr;
+ if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap) ||
+ old_reg->irq.kfunc_class != cur_reg->irq.kfunc_class)
+ return false;
+ break;
case STACK_MISC:
case STACK_ZERO:
case STACK_INVALID:
@@ -16800,7 +18634,7 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
return true;
}
-static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
+static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *cur,
struct bpf_idmap *idmap)
{
int i;
@@ -16808,9 +18642,40 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
if (old->acquired_refs != cur->acquired_refs)
return false;
+ if (old->active_locks != cur->active_locks)
+ return false;
+
+ if (old->active_preempt_locks != cur->active_preempt_locks)
+ return false;
+
+ if (old->active_rcu_lock != cur->active_rcu_lock)
+ return false;
+
+ if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap))
+ return false;
+
+ if (!check_ids(old->active_lock_id, cur->active_lock_id, idmap) ||
+ old->active_lock_ptr != cur->active_lock_ptr)
+ return false;
+
for (i = 0; i < old->acquired_refs; i++) {
- if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap))
+ if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) ||
+ old->refs[i].type != cur->refs[i].type)
return false;
+ switch (old->refs[i].type) {
+ case REF_TYPE_PTR:
+ case REF_TYPE_IRQ:
+ break;
+ case REF_TYPE_LOCK:
+ case REF_TYPE_RES_LOCK:
+ case REF_TYPE_RES_LOCK_IRQ:
+ if (old->refs[i].ptr != cur->refs[i].ptr)
+ return false;
+ break;
+ default:
+ WARN_ONCE(1, "Unhandled enum type for reference state: %d\n", old->refs[i].type);
+ return false;
+ }
}
return true;
@@ -16843,24 +18708,23 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
* the current state will reach 'bpf_exit' instruction safely
*/
static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old,
- struct bpf_func_state *cur, enum exact_level exact)
+ struct bpf_func_state *cur, u32 insn_idx, enum exact_level exact)
{
- int i;
+ u16 live_regs = env->insn_aux_data[insn_idx].live_regs_before;
+ u16 i;
if (old->callback_depth > cur->callback_depth)
return false;
for (i = 0; i < MAX_BPF_REG; i++)
- if (!regsafe(env, &old->regs[i], &cur->regs[i],
+ if (((1 << i) & live_regs) &&
+ !regsafe(env, &old->regs[i], &cur->regs[i],
&env->idmap_scratch, exact))
return false;
if (!stacksafe(env, old, cur, &env->idmap_scratch, exact))
return false;
- if (!refsafe(old, cur, &env->idmap_scratch))
- return false;
-
return true;
}
@@ -16875,6 +18739,7 @@ static bool states_equal(struct bpf_verifier_env *env,
struct bpf_verifier_state *cur,
enum exact_level exact)
{
+ u32 insn_idx;
int i;
if (old->curframe != cur->curframe)
@@ -16888,29 +18753,22 @@ static bool states_equal(struct bpf_verifier_env *env,
if (old->speculative && !cur->speculative)
return false;
- if (old->active_lock.ptr != cur->active_lock.ptr)
- return false;
-
- /* Old and cur active_lock's have to be either both present
- * or both absent.
- */
- if (!!old->active_lock.id != !!cur->active_lock.id)
- return false;
-
- if (old->active_lock.id &&
- !check_ids(old->active_lock.id, cur->active_lock.id, &env->idmap_scratch))
+ if (old->in_sleepable != cur->in_sleepable)
return false;
- if (old->active_rcu_lock != cur->active_rcu_lock)
+ if (!refsafe(old, cur, &env->idmap_scratch))
return false;
/* for states to be equal callsites have to be the same
* and all frame states need to be equivalent
*/
for (i = 0; i <= old->curframe; i++) {
+ insn_idx = i == old->curframe
+ ? env->insn_idx
+ : old->frame[i + 1]->callsite;
if (old->frame[i]->callsite != cur->frame[i]->callsite)
return false;
- if (!func_states_equal(env, old->frame[i], cur->frame[i], exact))
+ if (!func_states_equal(env, old->frame[i], cur->frame[i], insn_idx, exact))
return false;
}
return true;
@@ -17163,12 +19021,15 @@ static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf
static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
struct bpf_verifier_state_list *new_sl;
- struct bpf_verifier_state_list *sl, **pprev;
+ struct bpf_verifier_state_list *sl;
struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry;
int i, j, n, err, states_cnt = 0;
- bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
- bool add_new_state = force_new_state;
- bool force_exact;
+ bool force_new_state, add_new_state, force_exact;
+ struct list_head *pos, *tmp, *head;
+
+ force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) ||
+ /* Avoid accumulating infinitely long jmp history */
+ cur->insn_hist_end - cur->insn_hist_start > 40;
/* bpf progs typically have pruning point every 4 instructions
* http://vger.kernel.org/bpfconf2019.html#session-1
@@ -17178,19 +19039,19 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* In tests that amounts to up to 50% reduction into total verifier
* memory consumption and 20% verifier time speedup.
*/
+ add_new_state = force_new_state;
if (env->jmps_processed - env->prev_jmps_processed >= 2 &&
env->insn_processed - env->prev_insn_processed >= 8)
add_new_state = true;
- pprev = explored_state(env, insn_idx);
- sl = *pprev;
-
clean_live_states(env, insn_idx, cur);
- while (sl) {
+ head = explored_state(env, insn_idx);
+ list_for_each_safe(pos, tmp, head) {
+ sl = container_of(pos, struct bpf_verifier_state_list, node);
states_cnt++;
if (sl->state.insn_idx != insn_idx)
- goto next;
+ continue;
if (sl->state.branches) {
struct bpf_func_state *frame = sl->state.frame[sl->state.curframe];
@@ -17264,18 +19125,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) {
- update_loop_entry(cur, &sl->state);
+ update_loop_entry(env, cur, &sl->state);
goto hit;
}
}
goto skip_inf_loop_check;
}
if (is_may_goto_insn_at(env, insn_idx)) {
- if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
- update_loop_entry(cur, &sl->state);
+ if (sl->state.may_goto_depth != cur->may_goto_depth &&
+ states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
+ update_loop_entry(env, cur, &sl->state);
goto hit;
}
- goto skip_inf_loop_check;
}
if (calls_callback(env, insn_idx)) {
if (states_equal(env, &sl->state, cur, RANGE_WITHIN))
@@ -17291,9 +19152,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
verbose_linfo(env, insn_idx, "; ");
verbose(env, "infinite loop detected at insn %d\n", insn_idx);
verbose(env, "cur state:");
- print_verifier_state(env, cur->frame[cur->curframe], true);
+ print_verifier_state(env, cur, cur->curframe, true);
verbose(env, "old state:");
- print_verifier_state(env, sl->state.frame[cur->curframe], true);
+ print_verifier_state(env, &sl->state, cur->curframe, true);
return -EINVAL;
}
/* if the verifier is processing a loop, avoid adding new state
@@ -17340,11 +19201,13 @@ skip_inf_loop_check:
*
* Additional details are in the comment before get_loop_entry().
*/
- loop_entry = get_loop_entry(&sl->state);
+ loop_entry = get_loop_entry(env, &sl->state);
+ if (IS_ERR(loop_entry))
+ return PTR_ERR(loop_entry);
force_exact = loop_entry && loop_entry->branches > 0;
if (states_equal(env, &sl->state, cur, force_exact ? RANGE_WITHIN : NOT_EXACT)) {
if (force_exact)
- update_loop_entry(cur, loop_entry);
+ update_loop_entry(env, cur, loop_entry);
hit:
sl->hit_cnt++;
/* reached equivalent register/stack state,
@@ -17360,12 +19223,12 @@ hit:
err = propagate_liveness(env, &sl->state, cur);
/* if previous state reached the exit with precision and
- * current state is equivalent to it (except precsion marks)
+ * current state is equivalent to it (except precision marks)
* the precision needs to be propagated back in
* the current state.
*/
if (is_jmp_point(env, env->insn_idx))
- err = err ? : push_jmp_history(env, cur, 0);
+ err = err ? : push_insn_history(env, cur, 0, 0);
err = err ? : propagate_precision(env, &sl->state);
if (err)
return err;
@@ -17393,31 +19256,13 @@ miss:
/* the state is unlikely to be useful. Remove it to
* speed up verification
*/
- *pprev = sl->next;
- if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE &&
- !sl->state.used_as_loop_entry) {
- u32 br = sl->state.branches;
-
- WARN_ONCE(br,
- "BUG live_done but branches_to_explore %d\n",
- br);
- free_verifier_state(&sl->state, false);
- kfree(sl);
- env->peak_states--;
- } else {
- /* cannot free this state, since parentage chain may
- * walk it later. Add it for free_list instead to
- * be freed at the end of verification
- */
- sl->next = env->free_list;
- env->free_list = sl;
- }
- sl = *pprev;
- continue;
+ sl->in_free_list = true;
+ list_del(&sl->node);
+ list_add(&sl->node, &env->free_list);
+ env->free_list_size++;
+ env->explored_states_size--;
+ maybe_free_verifier_state(env, sl);
}
-next:
- pprev = &sl->next;
- sl = *pprev;
}
if (env->max_states_per_insn < states_cnt)
@@ -17442,7 +19287,8 @@ next:
if (!new_sl)
return -ENOMEM;
env->total_states++;
- env->peak_states++;
+ env->explored_states_size++;
+ update_peak_states(env);
env->prev_jmps_processed = env->jmps_processed;
env->prev_insn_processed = env->insn_processed;
@@ -17464,10 +19310,10 @@ next:
cur->parent = new;
cur->first_insn_idx = insn_idx;
+ cur->insn_hist_start = cur->insn_hist_end;
cur->dfs_depth = new->dfs_depth + 1;
- clear_jmp_history(cur);
- new_sl->next = *explored_state(env, insn_idx);
- *explored_state(env, insn_idx) = new_sl;
+ list_add(&new_sl->node, head);
+
/* connect new state to parentage chain. Current frame needs all
* registers connected. Only r6 - r9 of the callers are alive (pushed
* to the stack implicitly by JITs) so in callers' frames connect just
@@ -17538,7 +19384,7 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)
}
static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type,
- bool allow_trust_missmatch)
+ bool allow_trust_mismatch)
{
enum bpf_reg_type *prev_type = &env->insn_aux_data[env->insn_idx].ptr_type;
@@ -17556,7 +19402,7 @@ static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type typ
* src_reg == stack|map in some other branch.
* Reject it.
*/
- if (allow_trust_missmatch &&
+ if (allow_trust_mismatch &&
base_type(type) == PTR_TO_BTF_ID &&
base_type(*prev_type) == PTR_TO_BTF_ID) {
/*
@@ -17633,7 +19479,7 @@ static int do_check(struct bpf_verifier_env *env)
}
if (is_jmp_point(env, env->insn_idx)) {
- err = push_jmp_history(env, state, 0);
+ err = push_insn_history(env, state, 0, 0);
if (err)
return err;
}
@@ -17649,24 +19495,18 @@ static int do_check(struct bpf_verifier_env *env)
env->prev_insn_idx, env->insn_idx,
env->cur_state->speculative ?
" (speculative execution)" : "");
- print_verifier_state(env, state->frame[state->curframe], true);
+ print_verifier_state(env, state, state->curframe, true);
do_print_state = false;
}
if (env->log.level & BPF_LOG_LEVEL) {
- const struct bpf_insn_cbs cbs = {
- .cb_call = disasm_kfunc_name,
- .cb_print = verbose,
- .private_data = env,
- };
-
if (verifier_state_scratched(env))
- print_insn_state(env, state->frame[state->curframe]);
+ print_insn_state(env, state, state->curframe);
verbose_linfo(env, env->insn_idx, "; ");
env->prev_log_pos = env->log.end_pos;
verbose(env, "%d: ", env->insn_idx);
- print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
+ verbose_insn(env, insn);
env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos;
env->prev_log_pos = env->log.end_pos;
}
@@ -17688,37 +19528,18 @@ static int do_check(struct bpf_verifier_env *env)
return err;
} else if (class == BPF_LDX) {
- enum bpf_reg_type src_reg_type;
-
- /* check for reserved fields is already done */
-
- /* check src operand */
- err = check_reg_arg(env, insn->src_reg, SRC_OP);
- if (err)
- return err;
-
- err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
- if (err)
- return err;
-
- src_reg_type = regs[insn->src_reg].type;
+ bool is_ldsx = BPF_MODE(insn->code) == BPF_MEMSX;
- /* check that memory (src_reg + off) is readable,
- * the state of dst_reg will be updated by this func
+ /* Check for reserved fields is already done in
+ * resolve_pseudo_ldimm64().
*/
- err = check_mem_access(env, env->insn_idx, insn->src_reg,
- insn->off, BPF_SIZE(insn->code),
- BPF_READ, insn->dst_reg, false,
- BPF_MODE(insn->code) == BPF_MEMSX);
- err = err ?: save_aux_ptr_type(env, src_reg_type, true);
- err = err ?: reg_bounds_sanity_check(env, &regs[insn->dst_reg], "ldx");
+ err = check_load_mem(env, insn, false, is_ldsx, true,
+ "ldx");
if (err)
return err;
} else if (class == BPF_STX) {
- enum bpf_reg_type dst_reg_type;
-
if (BPF_MODE(insn->code) == BPF_ATOMIC) {
- err = check_atomic(env, env->insn_idx, insn);
+ err = check_atomic(env, insn);
if (err)
return err;
env->insn_idx++;
@@ -17730,25 +19551,7 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
- /* check src1 operand */
- err = check_reg_arg(env, insn->src_reg, SRC_OP);
- if (err)
- return err;
- /* check src2 operand */
- err = check_reg_arg(env, insn->dst_reg, SRC_OP);
- if (err)
- return err;
-
- dst_reg_type = regs[insn->dst_reg].type;
-
- /* check that memory (dst_reg + off) is writeable */
- err = check_mem_access(env, env->insn_idx, insn->dst_reg,
- insn->off, BPF_SIZE(insn->code),
- BPF_WRITE, insn->src_reg, false, false);
- if (err)
- return err;
-
- err = save_aux_ptr_type(env, dst_reg_type, false);
+ err = check_store_reg(env, insn, false);
if (err)
return err;
} else if (class == BPF_ST) {
@@ -17793,10 +19596,10 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
- if (env->cur_state->active_lock.ptr) {
+ if (env->cur_state->active_locks) {
if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) ||
(insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
- (insn->off != 0 || !is_bpf_graph_api_kfunc(insn->imm)))) {
+ (insn->off != 0 || !kfunc_spin_allowed(insn->imm)))) {
verbose(env, "function calls are not allowed while holding a lock\n");
return -EINVAL;
}
@@ -17842,23 +19645,14 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
process_bpf_exit_full:
- if (env->cur_state->active_lock.ptr && !env->cur_state->curframe) {
- verbose(env, "bpf_spin_unlock is missing\n");
- return -EINVAL;
- }
-
- if (env->cur_state->active_rcu_lock && !env->cur_state->curframe) {
- verbose(env, "bpf_rcu_read_unlock is missing\n");
- return -EINVAL;
- }
-
/* We must do check_reference_leak here before
* prepare_func_exit to handle the case when
* state->curframe > 0, it may be a callback
* function, for which reference_state must
* match caller reference state when it exits.
*/
- err = check_reference_leak(env, exception_exit);
+ err = check_resource_leak(env, exception_exit, !env->cur_state->curframe,
+ "BPF_EXIT instruction in main prog");
if (err)
return err;
@@ -17897,6 +19691,9 @@ process_bpf_exit:
return err;
break;
} else {
+ if (verifier_bug_if(env->cur_state->loop_entry, env,
+ "broken loop detection"))
+ return -EFAULT;
do_print_state = true;
continue;
}
@@ -17965,50 +19762,68 @@ static int find_btf_percpu_datasec(struct btf *btf)
return -ENOENT;
}
+/*
+ * Add btf to the used_btfs array and return the index. (If the btf was
+ * already added, then just return the index.) Upon successful insertion
+ * increase btf refcnt, and, if present, also refcount the corresponding
+ * kernel module.
+ */
+static int __add_used_btf(struct bpf_verifier_env *env, struct btf *btf)
+{
+ struct btf_mod_pair *btf_mod;
+ int i;
+
+ /* check whether we recorded this BTF (and maybe module) already */
+ for (i = 0; i < env->used_btf_cnt; i++)
+ if (env->used_btfs[i].btf == btf)
+ return i;
+
+ if (env->used_btf_cnt >= MAX_USED_BTFS)
+ return -E2BIG;
+
+ btf_get(btf);
+
+ btf_mod = &env->used_btfs[env->used_btf_cnt];
+ btf_mod->btf = btf;
+ btf_mod->module = NULL;
+
+ /* if we reference variables from kernel module, bump its refcount */
+ if (btf_is_module(btf)) {
+ btf_mod->module = btf_try_get_module(btf);
+ if (!btf_mod->module) {
+ btf_put(btf);
+ return -ENXIO;
+ }
+ }
+
+ return env->used_btf_cnt++;
+}
+
/* replace pseudo btf_id with kernel symbol address */
-static int check_pseudo_btf_id(struct bpf_verifier_env *env,
- struct bpf_insn *insn,
- struct bpf_insn_aux_data *aux)
+static int __check_pseudo_btf_id(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct bpf_insn_aux_data *aux,
+ struct btf *btf)
{
const struct btf_var_secinfo *vsi;
const struct btf_type *datasec;
- struct btf_mod_pair *btf_mod;
const struct btf_type *t;
const char *sym_name;
bool percpu = false;
u32 type, id = insn->imm;
- struct btf *btf;
s32 datasec_id;
u64 addr;
- int i, btf_fd, err;
-
- btf_fd = insn[1].imm;
- if (btf_fd) {
- btf = btf_get_by_fd(btf_fd);
- if (IS_ERR(btf)) {
- verbose(env, "invalid module BTF object FD specified.\n");
- return -EINVAL;
- }
- } else {
- if (!btf_vmlinux) {
- verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n");
- return -EINVAL;
- }
- btf = btf_vmlinux;
- btf_get(btf);
- }
+ int i;
t = btf_type_by_id(btf, id);
if (!t) {
verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id);
- err = -ENOENT;
- goto err_put;
+ return -ENOENT;
}
if (!btf_type_is_var(t) && !btf_type_is_func(t)) {
verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR or KIND_FUNC\n", id);
- err = -EINVAL;
- goto err_put;
+ return -EINVAL;
}
sym_name = btf_name_by_offset(btf, t->name_off);
@@ -18016,8 +19831,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
if (!addr) {
verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n",
sym_name);
- err = -ENOENT;
- goto err_put;
+ return -ENOENT;
}
insn[0].imm = (u32)addr;
insn[1].imm = addr >> 32;
@@ -18025,7 +19839,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
if (btf_type_is_func(t)) {
aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
aux->btf_var.mem_size = 0;
- goto check_btf;
+ return 0;
}
datasec_id = find_btf_percpu_datasec(btf);
@@ -18056,8 +19870,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
tname = btf_name_by_offset(btf, t->name_off);
verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n",
tname, PTR_ERR(ret));
- err = -EINVAL;
- goto err_put;
+ return -EINVAL;
}
aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
aux->btf_var.mem_size = tsize;
@@ -18066,39 +19879,43 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
aux->btf_var.btf = btf;
aux->btf_var.btf_id = type;
}
-check_btf:
- /* check whether we recorded this BTF (and maybe module) already */
- for (i = 0; i < env->used_btf_cnt; i++) {
- if (env->used_btfs[i].btf == btf) {
- btf_put(btf);
- return 0;
- }
- }
- if (env->used_btf_cnt >= MAX_USED_BTFS) {
- err = -E2BIG;
- goto err_put;
- }
+ return 0;
+}
- btf_mod = &env->used_btfs[env->used_btf_cnt];
- btf_mod->btf = btf;
- btf_mod->module = NULL;
+static int check_pseudo_btf_id(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct bpf_insn_aux_data *aux)
+{
+ struct btf *btf;
+ int btf_fd;
+ int err;
- /* if we reference variables from kernel module, bump its refcount */
- if (btf_is_module(btf)) {
- btf_mod->module = btf_try_get_module(btf);
- if (!btf_mod->module) {
- err = -ENXIO;
- goto err_put;
+ btf_fd = insn[1].imm;
+ if (btf_fd) {
+ CLASS(fd, f)(btf_fd);
+
+ btf = __btf_get_by_fd(f);
+ if (IS_ERR(btf)) {
+ verbose(env, "invalid module BTF object FD specified.\n");
+ return -EINVAL;
+ }
+ } else {
+ if (!btf_vmlinux) {
+ verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n");
+ return -EINVAL;
}
+ btf = btf_vmlinux;
}
- env->used_btf_cnt++;
+ err = __check_pseudo_btf_id(env, insn, aux, btf);
+ if (err)
+ return err;
+ err = __add_used_btf(env, btf);
+ if (err < 0)
+ return err;
return 0;
-err_put:
- btf_put(btf);
- return err;
}
static bool is_tracing_prog_type(enum bpf_prog_type type)
@@ -18115,6 +19932,12 @@ static bool is_tracing_prog_type(enum bpf_prog_type type)
}
}
+static bool bpf_map_is_cgroup_storage(struct bpf_map *map)
+{
+ return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
+}
+
static int check_map_prog_compatibility(struct bpf_verifier_env *env,
struct bpf_map *map,
struct bpf_prog *prog)
@@ -18130,7 +19953,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
}
}
- if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
+ if (btf_record_has_field(map->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) {
if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) {
verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n");
return -EINVAL;
@@ -18149,6 +19972,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
}
}
+ if (btf_record_has_field(map->record, BPF_WORKQUEUE)) {
+ if (is_tracing_prog_type(prog_type)) {
+ verbose(env, "tracing progs cannot use bpf_wq yet\n");
+ return -EINVAL;
+ }
+ }
+
if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) &&
!bpf_offload_prog_map_match(prog, map)) {
verbose(env, "offload device mismatch between prog and map\n");
@@ -18186,13 +20016,89 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
return -EINVAL;
}
+ if (bpf_map_is_cgroup_storage(map) &&
+ bpf_cgroup_storage_assign(env->prog->aux, map)) {
+ verbose(env, "only one cgroup storage of each type is allowed\n");
+ return -EBUSY;
+ }
+
+ if (map->map_type == BPF_MAP_TYPE_ARENA) {
+ if (env->prog->aux->arena) {
+ verbose(env, "Only one arena per program\n");
+ return -EBUSY;
+ }
+ if (!env->allow_ptr_leaks || !env->bpf_capable) {
+ verbose(env, "CAP_BPF and CAP_PERFMON are required to use arena\n");
+ return -EPERM;
+ }
+ if (!env->prog->jit_requested) {
+ verbose(env, "JIT is required to use arena\n");
+ return -EOPNOTSUPP;
+ }
+ if (!bpf_jit_supports_arena()) {
+ verbose(env, "JIT doesn't support arena\n");
+ return -EOPNOTSUPP;
+ }
+ env->prog->aux->arena = (void *)map;
+ if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) {
+ verbose(env, "arena's user address must be set via map_extra or mmap()\n");
+ return -EINVAL;
+ }
+ }
+
return 0;
}
-static bool bpf_map_is_cgroup_storage(struct bpf_map *map)
+static int __add_used_map(struct bpf_verifier_env *env, struct bpf_map *map)
{
- return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
- map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
+ int i, err;
+
+ /* check whether we recorded this map already */
+ for (i = 0; i < env->used_map_cnt; i++)
+ if (env->used_maps[i] == map)
+ return i;
+
+ if (env->used_map_cnt >= MAX_USED_MAPS) {
+ verbose(env, "The total number of maps per program has reached the limit of %u\n",
+ MAX_USED_MAPS);
+ return -E2BIG;
+ }
+
+ err = check_map_prog_compatibility(env, map, env->prog);
+ if (err)
+ return err;
+
+ if (env->prog->sleepable)
+ atomic64_inc(&map->sleepable_refcnt);
+
+ /* hold the map. If the program is rejected by verifier,
+ * the map will be released by release_maps() or it
+ * will be used by the valid program until it's unloaded
+ * and all maps are released in bpf_free_used_maps()
+ */
+ bpf_map_inc(map);
+
+ env->used_maps[env->used_map_cnt++] = map;
+
+ return env->used_map_cnt - 1;
+}
+
+/* Add map behind fd to used maps list, if it's not already there, and return
+ * its index.
+ * Returns <0 on error, or >= 0 index, on success.
+ */
+static int add_used_map(struct bpf_verifier_env *env, int fd)
+{
+ struct bpf_map *map;
+ CLASS(fd, f)(fd);
+
+ map = __bpf_map_get(f);
+ if (IS_ERR(map)) {
+ verbose(env, "fd %d is not pointing to valid bpf_map\n", fd);
+ return PTR_ERR(map);
+ }
+
+ return __add_used_map(env, map);
}
/* find and rewrite pseudo imm in ld_imm64 instructions:
@@ -18206,7 +20112,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
{
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
- int i, j, err;
+ int i, err;
err = bpf_prog_calc_tag(env->prog);
if (err)
@@ -18223,7 +20129,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
struct bpf_insn_aux_data *aux;
struct bpf_map *map;
- struct fd f;
+ int map_idx;
u64 addr;
u32 fd;
@@ -18286,21 +20192,14 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
break;
}
- f = fdget(fd);
- map = __bpf_map_get(f);
- if (IS_ERR(map)) {
- verbose(env, "fd %d is not pointing to valid bpf_map\n",
- insn[0].imm);
- return PTR_ERR(map);
- }
-
- err = check_map_prog_compatibility(env, map, env->prog);
- if (err) {
- fdput(f);
- return err;
- }
+ map_idx = add_used_map(env, fd);
+ if (map_idx < 0)
+ return map_idx;
+ map = env->used_maps[map_idx];
aux = &env->insn_aux_data[i];
+ aux->map_index = map_idx;
+
if (insn[0].src_reg == BPF_PSEUDO_MAP_FD ||
insn[0].src_reg == BPF_PSEUDO_MAP_IDX) {
addr = (unsigned long)map;
@@ -18309,13 +20208,11 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
if (off >= BPF_MAX_VAR_OFF) {
verbose(env, "direct value offset of %u is not allowed\n", off);
- fdput(f);
return -EINVAL;
}
if (!map->ops->map_direct_value_addr) {
verbose(env, "no direct value access support for this map type\n");
- fdput(f);
return -EINVAL;
}
@@ -18323,7 +20220,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
if (err) {
verbose(env, "invalid access to map value pointer, value_size=%u off=%u\n",
map->value_size, off);
- fdput(f);
return err;
}
@@ -18334,68 +20230,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
insn[0].imm = (u32)addr;
insn[1].imm = addr >> 32;
- /* check whether we recorded this map already */
- for (j = 0; j < env->used_map_cnt; j++) {
- if (env->used_maps[j] == map) {
- aux->map_index = j;
- fdput(f);
- goto next_insn;
- }
- }
-
- if (env->used_map_cnt >= MAX_USED_MAPS) {
- fdput(f);
- return -E2BIG;
- }
-
- if (env->prog->sleepable)
- atomic64_inc(&map->sleepable_refcnt);
- /* hold the map. If the program is rejected by verifier,
- * the map will be released by release_maps() or it
- * will be used by the valid program until it's unloaded
- * and all maps are released in bpf_free_used_maps()
- */
- bpf_map_inc(map);
-
- aux->map_index = env->used_map_cnt;
- env->used_maps[env->used_map_cnt++] = map;
-
- if (bpf_map_is_cgroup_storage(map) &&
- bpf_cgroup_storage_assign(env->prog->aux, map)) {
- verbose(env, "only one cgroup storage of each type is allowed\n");
- fdput(f);
- return -EBUSY;
- }
- if (map->map_type == BPF_MAP_TYPE_ARENA) {
- if (env->prog->aux->arena) {
- verbose(env, "Only one arena per program\n");
- fdput(f);
- return -EBUSY;
- }
- if (!env->allow_ptr_leaks || !env->bpf_capable) {
- verbose(env, "CAP_BPF and CAP_PERFMON are required to use arena\n");
- fdput(f);
- return -EPERM;
- }
- if (!env->prog->jit_requested) {
- verbose(env, "JIT is required to use arena\n");
- fdput(f);
- return -EOPNOTSUPP;
- }
- if (!bpf_jit_supports_arena()) {
- verbose(env, "JIT doesn't support arena\n");
- fdput(f);
- return -EOPNOTSUPP;
- }
- env->prog->aux->arena = (void *)map;
- if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) {
- verbose(env, "arena's user address must be set via map_extra or mmap()\n");
- fdput(f);
- return -EINVAL;
- }
- }
-
- fdput(f);
next_insn:
insn++;
i++;
@@ -18426,8 +20260,7 @@ static void release_maps(struct bpf_verifier_env *env)
/* drop refcnt of maps used by the rejected program */
static void release_btfs(struct bpf_verifier_env *env)
{
- __bpf_free_used_btfs(env->prog->aux, env->used_btfs,
- env->used_btf_cnt);
+ __bpf_free_used_btfs(env->used_btfs, env->used_btf_cnt);
}
/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
@@ -18538,6 +20371,44 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
return new_prog;
}
+/*
+ * For all jmp insns in a given 'prog' that point to 'tgt_idx' insn adjust the
+ * jump offset by 'delta'.
+ */
+static int adjust_jmp_off(struct bpf_prog *prog, u32 tgt_idx, u32 delta)
+{
+ struct bpf_insn *insn = prog->insnsi;
+ u32 insn_cnt = prog->len, i;
+ s32 imm;
+ s16 off;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ u8 code = insn->code;
+
+ if (tgt_idx <= i && i < tgt_idx + delta)
+ continue;
+
+ if ((BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) ||
+ BPF_OP(code) == BPF_CALL || BPF_OP(code) == BPF_EXIT)
+ continue;
+
+ if (insn->code == (BPF_JMP32 | BPF_JA)) {
+ if (i + 1 + insn->imm != tgt_idx)
+ continue;
+ if (check_add_overflow(insn->imm, delta, &imm))
+ return -ERANGE;
+ insn->imm = imm;
+ } else {
+ if (i + 1 + insn->off != tgt_idx)
+ continue;
+ if (check_add_overflow(insn->off, delta, &off))
+ return -ERANGE;
+ insn->off = off;
+ }
+ }
+ return 0;
+}
+
static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
u32 off, u32 cnt)
{
@@ -18776,22 +20647,29 @@ static int opt_remove_dead_code(struct bpf_verifier_env *env)
return 0;
}
+static const struct bpf_insn NOP = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
+static const struct bpf_insn MAY_GOTO_0 = BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, 0, 0);
+
static int opt_remove_nops(struct bpf_verifier_env *env)
{
- const struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
+ bool is_may_goto_0, is_ja;
int i, err;
for (i = 0; i < insn_cnt; i++) {
- if (memcmp(&insn[i], &ja, sizeof(ja)))
+ is_may_goto_0 = !memcmp(&insn[i], &MAY_GOTO_0, sizeof(MAY_GOTO_0));
+ is_ja = !memcmp(&insn[i], &NOP, sizeof(NOP));
+
+ if (!is_may_goto_0 && !is_ja)
continue;
err = verifier_remove_insns(env, i, 1);
if (err)
return err;
insn_cnt--;
- i--;
+ /* Go back one insn to catch may_goto +1; may_goto +0 sequence */
+ i -= (is_may_goto_0 && i > 0) ? 2 : 1;
}
return 0;
@@ -18873,10 +20751,9 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
if (bpf_pseudo_kfunc_call(&insn))
continue;
- if (WARN_ON(load_reg == -1)) {
- verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n");
+ if (verifier_bug_if(load_reg == -1, env,
+ "zext_dst is set, but no reg is defined"))
return -EFAULT;
- }
zext_patch[0] = insn;
zext_patch[1].dst_reg = load_reg;
@@ -18903,14 +20780,43 @@ apply_patch_buffer:
*/
static int convert_ctx_accesses(struct bpf_verifier_env *env)
{
+ struct bpf_subprog_info *subprogs = env->subprog_info;
const struct bpf_verifier_ops *ops = env->ops;
- int i, cnt, size, ctx_field_size, delta = 0;
+ int i, cnt, size, ctx_field_size, ret, delta = 0, epilogue_cnt = 0;
const int insn_cnt = env->prog->len;
- struct bpf_insn insn_buf[16], *insn;
+ struct bpf_insn *epilogue_buf = env->epilogue_buf;
+ struct bpf_insn *insn_buf = env->insn_buf;
+ struct bpf_insn *insn;
u32 target_size, size_default, off;
struct bpf_prog *new_prog;
enum bpf_access_type type;
bool is_narrower_load;
+ int epilogue_idx = 0;
+
+ if (ops->gen_epilogue) {
+ epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog,
+ -(subprogs[0].stack_depth + 8));
+ if (epilogue_cnt >= INSN_BUF_SIZE) {
+ verbose(env, "bpf verifier is misconfigured\n");
+ return -EINVAL;
+ } else if (epilogue_cnt) {
+ /* Save the ARG_PTR_TO_CTX for the epilogue to use */
+ cnt = 0;
+ subprogs[0].stack_depth += 8;
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1,
+ -subprogs[0].stack_depth);
+ insn_buf[cnt++] = env->prog->insnsi[0];
+ new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+ env->prog = new_prog;
+ delta += cnt - 1;
+
+ ret = add_kfunc_in_insns(env, epilogue_buf, epilogue_cnt - 1);
+ if (ret < 0)
+ return ret;
+ }
+ }
if (ops->gen_prologue || env->seen_direct_write) {
if (!ops->gen_prologue) {
@@ -18919,7 +20825,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
}
cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
env->prog);
- if (cnt >= ARRAY_SIZE(insn_buf)) {
+ if (cnt >= INSN_BUF_SIZE) {
verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
} else if (cnt) {
@@ -18929,9 +20835,16 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
env->prog = new_prog;
delta += cnt - 1;
+
+ ret = add_kfunc_in_insns(env, insn_buf, cnt - 1);
+ if (ret < 0)
+ return ret;
}
}
+ if (delta)
+ WARN_ON(adjust_jmp_off(env->prog, 0, delta));
+
if (bpf_prog_is_offloaded(env->prog->aux))
return 0;
@@ -18958,6 +20871,33 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
insn->code == (BPF_ST | BPF_MEM | BPF_W) ||
insn->code == (BPF_ST | BPF_MEM | BPF_DW)) {
type = BPF_WRITE;
+ } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_B) ||
+ insn->code == (BPF_STX | BPF_ATOMIC | BPF_H) ||
+ insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) ||
+ insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) &&
+ env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) {
+ insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code);
+ env->prog->aux->num_exentries++;
+ continue;
+ } else if (insn->code == (BPF_JMP | BPF_EXIT) &&
+ epilogue_cnt &&
+ i + delta < subprogs[1].start) {
+ /* Generate epilogue for the main prog */
+ if (epilogue_idx) {
+ /* jump back to the earlier generated epilogue */
+ insn_buf[0] = BPF_JMP32_A(epilogue_idx - i - delta - 1);
+ cnt = 1;
+ } else {
+ memcpy(insn_buf, epilogue_buf,
+ epilogue_cnt * sizeof(*epilogue_buf));
+ cnt = epilogue_cnt;
+ /* epilogue_idx cannot be 0. It must have at
+ * least one ctx ptr saving insn before the
+ * epilogue.
+ */
+ epilogue_idx = i + delta;
+ }
+ goto patch_insn_buf;
} else {
continue;
}
@@ -19060,7 +21000,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
target_size = 0;
cnt = convert_ctx_access(type, insn, insn_buf, env->prog,
&target_size);
- if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
+ if (cnt == 0 || cnt >= INSN_BUF_SIZE ||
(ctx_field_size && !target_size)) {
verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
@@ -19069,7 +21009,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
if (is_narrower_load && size < target_size) {
u8 shift = bpf_ctx_narrow_access_offset(
off, size, size_default) * 8;
- if (shift && cnt + 1 >= ARRAY_SIZE(insn_buf)) {
+ if (shift && cnt + 1 >= INSN_BUF_SIZE) {
verbose(env, "bpf verifier narrow ctx load misconfigured\n");
return -EINVAL;
}
@@ -19094,6 +21034,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
insn->dst_reg, insn->dst_reg,
size * 8, 0);
+patch_insn_buf:
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
@@ -19129,11 +21070,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
* propagated in any case.
*/
subprog = find_subprog(env, i + insn->imm + 1);
- if (subprog < 0) {
- WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
- i + insn->imm + 1);
+ if (verifier_bug_if(subprog < 0, env, "No program to jit at insn %d",
+ i + insn->imm + 1))
return -EFAULT;
- }
/* temporarily remember subprog id inside insn instead of
* aux_data, since next loop will split up all insns into funcs
*/
@@ -19144,12 +21083,19 @@ static int jit_subprogs(struct bpf_verifier_env *env)
env->insn_aux_data[i].call_imm = insn->imm;
/* point imm to __bpf_call_base+1 from JITs point of view */
insn->imm = 1;
- if (bpf_pseudo_func(insn))
+ if (bpf_pseudo_func(insn)) {
+#if defined(MODULES_VADDR)
+ u64 addr = MODULES_VADDR;
+#else
+ u64 addr = VMALLOC_START;
+#endif
/* jit (e.g. x86_64) may emit fewer instructions
* if it learns a u32 imm is the same as a u64 imm.
- * Force a non zero here.
+ * Set close enough to possible prog address.
*/
- insn[1].imm = 1;
+ insn[0].imm = (u32)addr;
+ insn[1].imm = addr >> 32;
+ }
}
err = bpf_prog_alloc_jited_linfo(prog);
@@ -19181,6 +21127,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
if (bpf_prog_calc_tag(func[i]))
goto out_free;
func[i]->is_func = 1;
+ func[i]->sleepable = prog->sleepable;
func[i]->aux->func_idx = i;
/* Below members will be freed only at prog->aux */
func[i]->aux->btf = prog->aux->btf;
@@ -19200,6 +21147,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->name[0] = 'F';
func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
+ if (env->subprog_info[i].priv_stack_mode == PRIV_STACK_ADAPTIVE)
+ func[i]->aux->jits_use_priv_stack = true;
+
func[i]->jit_requested = 1;
func[i]->blinding_requested = prog->blinding_requested;
func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
@@ -19221,10 +21171,15 @@ static int jit_subprogs(struct bpf_verifier_env *env)
BPF_CLASS(insn->code) == BPF_ST) &&
BPF_MODE(insn->code) == BPF_PROBE_MEM32)
num_exentries++;
+ if (BPF_CLASS(insn->code) == BPF_STX &&
+ BPF_MODE(insn->code) == BPF_PROBE_ATOMIC)
+ num_exentries++;
}
func[i]->aux->num_exentries = num_exentries;
func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb;
+ func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data;
+ func[i]->aux->might_sleep = env->subprog_info[i].might_sleep;
if (!i)
func[i]->aux->exception_boundary = env->seen_exception;
func[i] = bpf_int_jit_compile(func[i]);
@@ -19285,10 +21240,14 @@ static int jit_subprogs(struct bpf_verifier_env *env)
* bpf_prog_load will add the kallsyms for the main program.
*/
for (i = 1; i < env->subprog_cnt; i++) {
- bpf_prog_lock_ro(func[i]);
- bpf_prog_kallsyms_add(func[i]);
+ err = bpf_prog_lock_ro(func[i]);
+ if (err)
+ goto out_free;
}
+ for (i = 1; i < env->subprog_cnt; i++)
+ bpf_prog_kallsyms_add(func[i]);
+
/* Last step: make now unused interpreter insns from main
* prog consistent for later dump requests, so they can
* later look the same as if they were interpreted only.
@@ -19437,6 +21396,14 @@ static void specialize_kfunc(struct bpf_verifier_env *env,
*/
env->seen_direct_write = seen_direct_write;
}
+
+ if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr] &&
+ bpf_lsm_has_d_inode_locked(prog))
+ *addr = (unsigned long)bpf_set_dentry_xattr_locked;
+
+ if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr] &&
+ bpf_lsm_has_d_inode_locked(prog))
+ *addr = (unsigned long)bpf_remove_dentry_xattr_locked;
}
static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux,
@@ -19549,6 +21516,17 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
*cnt = 1;
}
+
+ if (env->insn_aux_data[insn_idx].arg_prog) {
+ u32 regno = env->insn_aux_data[insn_idx].arg_prog;
+ struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(regno, (long)env->prog->aux) };
+ int idx = *cnt;
+
+ insn_buf[idx++] = ld_addrs[0];
+ insn_buf[idx++] = ld_addrs[1];
+ insn_buf[idx++] = *insn;
+ *cnt = idx;
+ }
return 0;
}
@@ -19592,7 +21570,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
const int insn_cnt = prog->len;
const struct bpf_map_ops *ops;
struct bpf_insn_aux_data *aux;
- struct bpf_insn insn_buf[16];
+ struct bpf_insn *insn_buf = env->insn_buf;
struct bpf_prog *new_prog;
struct bpf_map *map_ptr;
int i, ret, cnt, delta = 0, cur_subprog = 0;
@@ -19635,13 +21613,46 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
/* Convert BPF_CLASS(insn->code) == BPF_ALU64 to 32-bit ALU */
insn->code = BPF_ALU | BPF_OP(insn->code) | BPF_SRC(insn->code);
- /* Make divide-by-zero exceptions impossible. */
+ /* Make sdiv/smod divide-by-minus-one exceptions impossible. */
+ if ((insn->code == (BPF_ALU64 | BPF_MOD | BPF_K) ||
+ insn->code == (BPF_ALU64 | BPF_DIV | BPF_K) ||
+ insn->code == (BPF_ALU | BPF_MOD | BPF_K) ||
+ insn->code == (BPF_ALU | BPF_DIV | BPF_K)) &&
+ insn->off == 1 && insn->imm == -1) {
+ bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+ bool isdiv = BPF_OP(insn->code) == BPF_DIV;
+ struct bpf_insn *patchlet;
+ struct bpf_insn chk_and_sdiv[] = {
+ BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_NEG | BPF_K, insn->dst_reg,
+ 0, 0, 0),
+ };
+ struct bpf_insn chk_and_smod[] = {
+ BPF_MOV32_IMM(insn->dst_reg, 0),
+ };
+
+ patchlet = isdiv ? chk_and_sdiv : chk_and_smod;
+ cnt = isdiv ? ARRAY_SIZE(chk_and_sdiv) : ARRAY_SIZE(chk_and_smod);
+
+ new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Make divide-by-zero and divide-by-minus-one exceptions impossible. */
if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
bool isdiv = BPF_OP(insn->code) == BPF_DIV;
+ bool is_sdiv = isdiv && insn->off == 1;
+ bool is_smod = !isdiv && insn->off == 1;
struct bpf_insn *patchlet;
struct bpf_insn chk_and_div[] = {
/* [R,W]x div 0 -> 0 */
@@ -19661,10 +21672,62 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
BPF_JMP_IMM(BPF_JA, 0, 0, 1),
BPF_MOV32_REG(insn->dst_reg, insn->dst_reg),
};
+ struct bpf_insn chk_and_sdiv[] = {
+ /* [R,W]x sdiv 0 -> 0
+ * LLONG_MIN sdiv -1 -> LLONG_MIN
+ * INT_MIN sdiv -1 -> INT_MIN
+ */
+ BPF_MOV64_REG(BPF_REG_AX, insn->src_reg),
+ BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_ADD | BPF_K, BPF_REG_AX,
+ 0, 0, 1),
+ BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JGT | BPF_K, BPF_REG_AX,
+ 0, 4, 1),
+ BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JEQ | BPF_K, BPF_REG_AX,
+ 0, 1, 0),
+ BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_MOV | BPF_K, insn->dst_reg,
+ 0, 0, 0),
+ /* BPF_NEG(LLONG_MIN) == -LLONG_MIN == LLONG_MIN */
+ BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_NEG | BPF_K, insn->dst_reg,
+ 0, 0, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ *insn,
+ };
+ struct bpf_insn chk_and_smod[] = {
+ /* [R,W]x mod 0 -> [R,W]x */
+ /* [R,W]x mod -1 -> 0 */
+ BPF_MOV64_REG(BPF_REG_AX, insn->src_reg),
+ BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_ADD | BPF_K, BPF_REG_AX,
+ 0, 0, 1),
+ BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JGT | BPF_K, BPF_REG_AX,
+ 0, 3, 1),
+ BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JEQ | BPF_K, BPF_REG_AX,
+ 0, 3 + (is64 ? 0 : 1), 1),
+ BPF_MOV32_IMM(insn->dst_reg, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ *insn,
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_MOV32_REG(insn->dst_reg, insn->dst_reg),
+ };
- patchlet = isdiv ? chk_and_div : chk_and_mod;
- cnt = isdiv ? ARRAY_SIZE(chk_and_div) :
- ARRAY_SIZE(chk_and_mod) - (is64 ? 2 : 0);
+ if (is_sdiv) {
+ patchlet = chk_and_sdiv;
+ cnt = ARRAY_SIZE(chk_and_sdiv);
+ } else if (is_smod) {
+ patchlet = chk_and_smod;
+ cnt = ARRAY_SIZE(chk_and_smod) - (is64 ? 2 : 0);
+ } else {
+ patchlet = isdiv ? chk_and_div : chk_and_mod;
+ cnt = isdiv ? ARRAY_SIZE(chk_and_div) :
+ ARRAY_SIZE(chk_and_mod) - (is64 ? 2 : 0);
+ }
new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt);
if (!new_prog)
@@ -19676,12 +21739,42 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
goto next_insn;
}
+ /* Make it impossible to de-reference a userspace address */
+ if (BPF_CLASS(insn->code) == BPF_LDX &&
+ (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) {
+ struct bpf_insn *patch = &insn_buf[0];
+ u64 uaddress_limit = bpf_arch_uaddress_limit();
+
+ if (!uaddress_limit)
+ goto next_insn;
+
+ *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
+ if (insn->off)
+ *patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off);
+ *patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32);
+ *patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2);
+ *patch++ = *insn;
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = BPF_MOV64_IMM(insn->dst_reg, 0);
+
+ cnt = patch - insn_buf;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
/* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */
if (BPF_CLASS(insn->code) == BPF_LD &&
(BPF_MODE(insn->code) == BPF_ABS ||
BPF_MODE(insn->code) == BPF_IND)) {
cnt = env->ops->gen_ld_abs(insn, insn_buf);
- if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+ if (cnt == 0 || cnt >= INSN_BUF_SIZE) {
verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
}
@@ -19749,12 +21842,58 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
goto next_insn;
}
- if (is_may_goto_insn(insn)) {
+ if (is_may_goto_insn(insn) && bpf_jit_supports_timed_may_goto()) {
+ int stack_off_cnt = -stack_depth - 16;
+
+ /*
+ * Two 8 byte slots, depth-16 stores the count, and
+ * depth-8 stores the start timestamp of the loop.
+ *
+ * The starting value of count is BPF_MAX_TIMED_LOOPS
+ * (0xffff). Every iteration loads it and subs it by 1,
+ * until the value becomes 0 in AX (thus, 1 in stack),
+ * after which we call arch_bpf_timed_may_goto, which
+ * either sets AX to 0xffff to keep looping, or to 0
+ * upon timeout. AX is then stored into the stack. In
+ * the next iteration, we either see 0 and break out, or
+ * continue iterating until the next time value is 0
+ * after subtraction, rinse and repeat.
+ */
+ stack_depth_extra = 16;
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off_cnt);
+ if (insn->off >= 0)
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 5);
+ else
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
+ insn_buf[3] = BPF_JMP_IMM(BPF_JNE, BPF_REG_AX, 0, 2);
+ /*
+ * AX is used as an argument to pass in stack_off_cnt
+ * (to add to r10/fp), and also as the return value of
+ * the call to arch_bpf_timed_may_goto.
+ */
+ insn_buf[4] = BPF_MOV64_IMM(BPF_REG_AX, stack_off_cnt);
+ insn_buf[5] = BPF_EMIT_CALL(arch_bpf_timed_may_goto);
+ insn_buf[6] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off_cnt);
+ cnt = 7;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ } else if (is_may_goto_insn(insn)) {
int stack_off = -stack_depth - 8;
stack_depth_extra = 8;
insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off);
- insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2);
+ if (insn->off >= 0)
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2);
+ else
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1);
insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
insn_buf[3] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off);
cnt = 4;
@@ -19790,6 +21929,10 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
goto next_insn;
}
+ /* Skip inlining the helper call if the JIT does it. */
+ if (bpf_jit_inlines_helper_call(insn->imm))
+ goto next_insn;
+
if (insn->imm == BPF_FUNC_get_route_realm)
prog->dst_needed = 1;
if (insn->imm == BPF_FUNC_get_prandom_u32)
@@ -19823,7 +21966,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
!bpf_map_ptr_unpriv(aux)) {
struct bpf_jit_poke_descriptor desc = {
.reason = BPF_POKE_REASON_TAIL_CALL,
- .tail_call.map = BPF_MAP_PTR(aux->map_ptr_state),
+ .tail_call.map = aux->map_ptr_state.map_ptr,
.tail_call.key = bpf_map_key_immediate(aux),
.insn_idx = i + delta,
};
@@ -19852,7 +21995,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
return -EINVAL;
}
- map_ptr = BPF_MAP_PTR(aux->map_ptr_state);
+ map_ptr = aux->map_ptr_state.map_ptr;
insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
map_ptr->max_entries, 2);
insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
@@ -19960,14 +22103,14 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
if (bpf_map_ptr_poisoned(aux))
goto patch_call_imm;
- map_ptr = BPF_MAP_PTR(aux->map_ptr_state);
+ map_ptr = aux->map_ptr_state.map_ptr;
ops = map_ptr->ops;
if (insn->imm == BPF_FUNC_map_lookup_elem &&
ops->map_gen_lookup) {
cnt = ops->map_gen_lookup(map_ptr, insn_buf);
if (cnt == -EOPNOTSUPP)
goto patch_map_ops_generic;
- if (cnt <= 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+ if (cnt <= 0 || cnt >= INSN_BUF_SIZE) {
verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
}
@@ -20066,6 +22209,34 @@ patch_map_ops_generic:
goto next_insn;
}
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
+ /* Implement bpf_get_smp_processor_id() inline. */
+ if (insn->imm == BPF_FUNC_get_smp_processor_id &&
+ verifier_inlines_helper_call(env, insn->imm)) {
+ /* BPF_FUNC_get_smp_processor_id inlining is an
+ * optimization, so if cpu_number is ever
+ * changed in some incompatible and hard to support
+ * way, it's fine to back out this inlining logic
+ */
+#ifdef CONFIG_SMP
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)&cpu_number);
+ insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
+ insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0);
+ cnt = 3;
+#else
+ insn_buf[0] = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0);
+ cnt = 1;
+#endif
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+#endif
/* Implement bpf_get_func_arg inline. */
if (prog_type == BPF_PROG_TYPE_TRACING &&
insn->imm == BPF_FUNC_get_func_arg) {
@@ -20149,6 +22320,62 @@ patch_map_ops_generic:
goto next_insn;
}
+ /* Implement bpf_get_branch_snapshot inline. */
+ if (IS_ENABLED(CONFIG_PERF_EVENTS) &&
+ prog->jit_requested && BITS_PER_LONG == 64 &&
+ insn->imm == BPF_FUNC_get_branch_snapshot) {
+ /* We are dealing with the following func protos:
+ * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags);
+ * int perf_snapshot_branch_stack(struct perf_branch_entry *entries, u32 cnt);
+ */
+ const u32 br_entry_size = sizeof(struct perf_branch_entry);
+
+ /* struct perf_branch_entry is part of UAPI and is
+ * used as an array element, so extremely unlikely to
+ * ever grow or shrink
+ */
+ BUILD_BUG_ON(br_entry_size != 24);
+
+ /* if (unlikely(flags)) return -EINVAL */
+ insn_buf[0] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7);
+
+ /* Transform size (bytes) into number of entries (cnt = size / 24).
+ * But to avoid expensive division instruction, we implement
+ * divide-by-3 through multiplication, followed by further
+ * division by 8 through 3-bit right shift.
+ * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr.,
+ * p. 227, chapter "Unsigned Division by 3" for details and proofs.
+ *
+ * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab.
+ */
+ insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, 0xaaaaaaab);
+ insn_buf[2] = BPF_ALU64_REG(BPF_MUL, BPF_REG_2, BPF_REG_0);
+ insn_buf[3] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36);
+
+ /* call perf_snapshot_branch_stack implementation */
+ insn_buf[4] = BPF_EMIT_CALL(static_call_query(perf_snapshot_branch_stack));
+ /* if (entry_cnt == 0) return -ENOENT */
+ insn_buf[5] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4);
+ /* return entry_cnt * sizeof(struct perf_branch_entry) */
+ insn_buf[6] = BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, br_entry_size);
+ insn_buf[7] = BPF_JMP_A(3);
+ /* return -EINVAL; */
+ insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
+ insn_buf[9] = BPF_JMP_A(1);
+ /* return -ENOENT; */
+ insn_buf[10] = BPF_MOV64_IMM(BPF_REG_0, -ENOENT);
+ cnt = 11;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
/* Implement bpf_kptr_xchg inline */
if (prog->jit_requested && BITS_PER_LONG == 64 &&
insn->imm == BPF_FUNC_kptr_xchg &&
@@ -20182,6 +22409,13 @@ next_insn:
if (subprogs[cur_subprog + 1].start == i + delta + 1) {
subprogs[cur_subprog].stack_depth += stack_depth_extra;
subprogs[cur_subprog].stack_extra = stack_depth_extra;
+
+ stack_depth = subprogs[cur_subprog].stack_depth;
+ if (stack_depth > MAX_BPF_STACK && !prog->jit_requested) {
+ verbose(env, "stack size %d(extra %d) is too large\n",
+ stack_depth, stack_depth_extra);
+ return -EINVAL;
+ }
cur_subprog++;
stack_depth = subprogs[cur_subprog].stack_depth;
stack_depth_extra = 0;
@@ -20192,26 +22426,43 @@ next_insn:
env->prog->aux->stack_depth = subprogs[0].stack_depth;
for (i = 0; i < env->subprog_cnt; i++) {
+ int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1;
int subprog_start = subprogs[i].start;
int stack_slots = subprogs[i].stack_extra / 8;
+ int slots = delta, cnt = 0;
if (!stack_slots)
continue;
- if (stack_slots > 1) {
- verbose(env, "verifier bug: stack_slots supports may_goto only\n");
+ /* We need two slots in case timed may_goto is supported. */
+ if (stack_slots > slots) {
+ verifier_bug(env, "stack_slots supports may_goto only");
return -EFAULT;
}
- /* Add ST insn to subprog prologue to init extra stack */
- insn_buf[0] = BPF_ST_MEM(BPF_DW, BPF_REG_FP,
- -subprogs[i].stack_depth, BPF_MAX_LOOPS);
+ stack_depth = subprogs[i].stack_depth;
+ if (bpf_jit_supports_timed_may_goto()) {
+ insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth,
+ BPF_MAX_TIMED_LOOPS);
+ insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth + 8, 0);
+ } else {
+ /* Add ST insn to subprog prologue to init extra stack */
+ insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth,
+ BPF_MAX_LOOPS);
+ }
/* Copy first actual insn to preserve it */
- insn_buf[1] = env->prog->insnsi[subprog_start];
+ insn_buf[cnt++] = env->prog->insnsi[subprog_start];
- new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, 2);
+ new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
env->prog = prog = new_prog;
+ /*
+ * If may_goto is a first insn of a prog there could be a jmp
+ * insn that points to it, hence adjust all such jmps to point
+ * to insn after BPF_ST that inits may_goto count.
+ * Adjustment will succeed because bpf_patch_insn_data() didn't fail.
+ */
+ WARN_ON(adjust_jmp_off(env->prog, subprog_start, delta));
}
/* Since poke tab is now finalized, publish aux to tracker. */
@@ -20240,7 +22491,7 @@ static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
int position,
s32 stack_base,
u32 callback_subprogno,
- u32 *cnt)
+ u32 *total_cnt)
{
s32 r6_offset = stack_base + 0 * BPF_REG_SIZE;
s32 r7_offset = stack_base + 1 * BPF_REG_SIZE;
@@ -20249,55 +22500,56 @@ static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
int reg_loop_cnt = BPF_REG_7;
int reg_loop_ctx = BPF_REG_8;
+ struct bpf_insn *insn_buf = env->insn_buf;
struct bpf_prog *new_prog;
u32 callback_start;
u32 call_insn_offset;
s32 callback_offset;
+ u32 cnt = 0;
/* This represents an inlined version of bpf_iter.c:bpf_loop,
* be careful to modify this code in sync.
*/
- struct bpf_insn insn_buf[] = {
- /* Return error and jump to the end of the patch if
- * expected number of iterations is too big.
- */
- BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2),
- BPF_MOV32_IMM(BPF_REG_0, -E2BIG),
- BPF_JMP_IMM(BPF_JA, 0, 0, 16),
- /* spill R6, R7, R8 to use these as loop vars */
- BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset),
- BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset),
- BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset),
- /* initialize loop vars */
- BPF_MOV64_REG(reg_loop_max, BPF_REG_1),
- BPF_MOV32_IMM(reg_loop_cnt, 0),
- BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3),
- /* loop header,
- * if reg_loop_cnt >= reg_loop_max skip the loop body
- */
- BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5),
- /* callback call,
- * correct callback offset would be set after patching
- */
- BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt),
- BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx),
- BPF_CALL_REL(0),
- /* increment loop counter */
- BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1),
- /* jump to loop header if callback returned 0 */
- BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6),
- /* return value of bpf_loop,
- * set R0 to the number of iterations
- */
- BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt),
- /* restore original values of R6, R7, R8 */
- BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset),
- BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset),
- BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset),
- };
- *cnt = ARRAY_SIZE(insn_buf);
- new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt);
+ /* Return error and jump to the end of the patch if
+ * expected number of iterations is too big.
+ */
+ insn_buf[cnt++] = BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2);
+ insn_buf[cnt++] = BPF_MOV32_IMM(BPF_REG_0, -E2BIG);
+ insn_buf[cnt++] = BPF_JMP_IMM(BPF_JA, 0, 0, 16);
+ /* spill R6, R7, R8 to use these as loop vars */
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset);
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset);
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset);
+ /* initialize loop vars */
+ insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_max, BPF_REG_1);
+ insn_buf[cnt++] = BPF_MOV32_IMM(reg_loop_cnt, 0);
+ insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3);
+ /* loop header,
+ * if reg_loop_cnt >= reg_loop_max skip the loop body
+ */
+ insn_buf[cnt++] = BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5);
+ /* callback call,
+ * correct callback offset would be set after patching
+ */
+ insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt);
+ insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx);
+ insn_buf[cnt++] = BPF_CALL_REL(0);
+ /* increment loop counter */
+ insn_buf[cnt++] = BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1);
+ /* jump to loop header if callback returned 0 */
+ insn_buf[cnt++] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6);
+ /* return value of bpf_loop,
+ * set R0 to the number of iterations
+ */
+ insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt);
+ /* restore original values of R6, R7, R8 */
+ insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset);
+ insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset);
+ insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset);
+
+ *total_cnt = cnt;
+ new_prog = bpf_patch_insn_data(env, position, insn_buf, cnt);
if (!new_prog)
return new_prog;
@@ -20372,33 +22624,65 @@ static int optimize_bpf_loop(struct bpf_verifier_env *env)
return 0;
}
+/* Remove unnecessary spill/fill pairs, members of fastcall pattern,
+ * adjust subprograms stack depth when possible.
+ */
+static int remove_fastcall_spills_fills(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *subprog = env->subprog_info;
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ u32 spills_num;
+ bool modified = false;
+ int i, j;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (aux[i].fastcall_spills_num > 0) {
+ spills_num = aux[i].fastcall_spills_num;
+ /* NOPs would be removed by opt_remove_nops() */
+ for (j = 1; j <= spills_num; ++j) {
+ *(insn - j) = NOP;
+ *(insn + j) = NOP;
+ }
+ modified = true;
+ }
+ if ((subprog + 1)->start == i + 1) {
+ if (modified && !subprog->keep_fastcall_stack)
+ subprog->stack_depth = -subprog->fastcall_stack_off;
+ subprog++;
+ modified = false;
+ }
+ }
+
+ return 0;
+}
+
static void free_states(struct bpf_verifier_env *env)
{
- struct bpf_verifier_state_list *sl, *sln;
+ struct bpf_verifier_state_list *sl;
+ struct list_head *head, *pos, *tmp;
int i;
- sl = env->free_list;
- while (sl) {
- sln = sl->next;
+ list_for_each_safe(pos, tmp, &env->free_list) {
+ sl = container_of(pos, struct bpf_verifier_state_list, node);
free_verifier_state(&sl->state, false);
kfree(sl);
- sl = sln;
}
- env->free_list = NULL;
+ INIT_LIST_HEAD(&env->free_list);
if (!env->explored_states)
return;
for (i = 0; i < state_htab_size(env); i++) {
- sl = env->explored_states[i];
+ head = &env->explored_states[i];
- while (sl) {
- sln = sl->next;
+ list_for_each_safe(pos, tmp, head) {
+ sl = container_of(pos, struct bpf_verifier_state_list, node);
free_verifier_state(&sl->state, false);
kfree(sl);
- sl = sln;
}
- env->explored_states[i] = NULL;
+ INIT_LIST_HEAD(&env->explored_states[i]);
}
}
@@ -20406,6 +22690,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
{
bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
struct bpf_subprog_info *sub = subprog_info(env, subprog);
+ struct bpf_prog_aux *aux = env->prog->aux;
struct bpf_verifier_state *state;
struct bpf_reg_state *regs;
int ret, i;
@@ -20513,6 +22798,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
mark_reg_known_zero(env, regs, BPF_REG_1);
}
+ /* Acquire references for struct_ops program arguments tagged with "__ref" */
+ if (!subprog && env->prog->type == BPF_PROG_TYPE_STRUCT_OPS) {
+ for (i = 0; i < aux->ctx_arg_info_size; i++)
+ aux->ctx_arg_info[i].ref_obj_id = aux->ctx_arg_info[i].refcounted ?
+ acquire_reference(env, 0) : 0;
+ }
+
ret = do_check(env);
out:
/* check for NULL is necessary, since cur_state can be freed inside
@@ -20635,6 +22927,15 @@ static void print_verification_stats(struct bpf_verifier_env *env)
env->peak_states, env->longest_mark_read_walk);
}
+int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog,
+ const struct bpf_ctx_arg_aux *info, u32 cnt)
+{
+ prog->aux->ctx_arg_info = kmemdup_array(info, cnt, sizeof(*info), GFP_KERNEL);
+ prog->aux->ctx_arg_info_size = cnt;
+
+ return prog->aux->ctx_arg_info ? 0 : -ENOMEM;
+}
+
static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
{
const struct btf_type *t, *func_proto;
@@ -20642,9 +22943,11 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
const struct bpf_struct_ops *st_ops;
const struct btf_member *member;
struct bpf_prog *prog = env->prog;
- u32 btf_id, member_idx;
+ bool has_refcounted_arg = false;
+ u32 btf_id, member_idx, member_off;
struct btf *btf;
const char *mname;
+ int i, err;
if (!prog->gpl_compatible) {
verbose(env, "struct ops programs must have a GPL compatible license\n");
@@ -20692,8 +22995,16 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
return -EINVAL;
}
+ member_off = __btf_member_bit_offset(t, member) / 8;
+ err = bpf_struct_ops_supported(st_ops, member_off);
+ if (err) {
+ verbose(env, "attach to unsupported member %s of struct %s\n",
+ mname, st_ops->name);
+ return err;
+ }
+
if (st_ops->check_member) {
- int err = st_ops->check_member(t, member, prog);
+ err = st_ops->check_member(t, member, prog);
if (err) {
verbose(env, "attach to unsupported member %s of struct %s\n",
@@ -20702,17 +23013,37 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
}
}
- /* btf_ctx_access() used this to provide argument type info */
- prog->aux->ctx_arg_info =
- st_ops_desc->arg_info[member_idx].info;
- prog->aux->ctx_arg_info_size =
- st_ops_desc->arg_info[member_idx].cnt;
+ if (prog->aux->priv_stack_requested && !bpf_jit_supports_private_stack()) {
+ verbose(env, "Private stack not supported by jit\n");
+ return -EACCES;
+ }
+
+ for (i = 0; i < st_ops_desc->arg_info[member_idx].cnt; i++) {
+ if (st_ops_desc->arg_info[member_idx].info->refcounted) {
+ has_refcounted_arg = true;
+ break;
+ }
+ }
+
+ /* Tail call is not allowed for programs with refcounted arguments since we
+ * cannot guarantee that valid refcounted kptrs will be passed to the callee.
+ */
+ for (i = 0; i < env->subprog_cnt; i++) {
+ if (has_refcounted_arg && env->subprog_info[i].has_tail_call) {
+ verbose(env, "program with __ref argument cannot tail call\n");
+ return -EINVAL;
+ }
+ }
+
+ prog->aux->st_ops = st_ops;
+ prog->aux->attach_st_ops_member_off = member_off;
prog->aux->attach_func_proto = func_proto;
prog->aux->attach_func_name = mname;
env->ops = st_ops->verifier_ops;
- return 0;
+ return bpf_prog_ctx_arg_info_init(prog, st_ops_desc->arg_info[member_idx].info,
+ st_ops_desc->arg_info[member_idx].cnt);
}
#define SECURITY_PREFIX "security_"
@@ -20733,8 +23064,12 @@ BTF_SET_START(btf_non_sleepable_error_inject)
* Assume non-sleepable from bpf safety point of view.
*/
BTF_ID(func, __filemap_add_folio)
+#ifdef CONFIG_FAIL_PAGE_ALLOC
BTF_ID(func, should_fail_alloc_page)
+#endif
+#ifdef CONFIG_FAILSLAB
BTF_ID(func, should_failslab)
+#endif
BTF_SET_END(btf_non_sleepable_error_inject)
static int check_non_sleepable_error_inject(u32 btf_id)
@@ -20750,11 +23085,13 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
{
bool prog_extension = prog->type == BPF_PROG_TYPE_EXT;
bool prog_tracing = prog->type == BPF_PROG_TYPE_TRACING;
+ char trace_symbol[KSYM_SYMBOL_LEN];
const char prefix[] = "btf_trace_";
+ struct bpf_raw_event_map *btp;
int ret = 0, subprog = -1, i;
const struct btf_type *t;
bool conservative = true;
- const char *tname;
+ const char *tname, *fname;
struct btf *btf;
long addr = 0;
struct module *mod = NULL;
@@ -20781,6 +23118,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
}
if (tgt_prog) {
struct bpf_prog_aux *aux = tgt_prog->aux;
+ bool tgt_changes_pkt_data;
+ bool tgt_might_sleep;
if (bpf_prog_is_dev_bound(prog->aux) &&
!bpf_prog_dev_bound_match(prog, tgt_prog)) {
@@ -20815,6 +23154,23 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
"Extension programs should be JITed\n");
return -EINVAL;
}
+ tgt_changes_pkt_data = aux->func
+ ? aux->func[subprog]->aux->changes_pkt_data
+ : aux->changes_pkt_data;
+ if (prog->aux->changes_pkt_data && !tgt_changes_pkt_data) {
+ bpf_log(log,
+ "Extension program changes packet data, while original does not\n");
+ return -EINVAL;
+ }
+
+ tgt_might_sleep = aux->func
+ ? aux->func[subprog]->aux->might_sleep
+ : aux->might_sleep;
+ if (prog->aux->might_sleep && !tgt_might_sleep) {
+ bpf_log(log,
+ "Extension program may sleep, while original does not\n");
+ return -EINVAL;
+ }
}
if (!tgt_prog->jited) {
bpf_log(log, "Can attach to only JITed progs\n");
@@ -20885,10 +23241,34 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
return -EINVAL;
}
tname += sizeof(prefix) - 1;
- t = btf_type_by_id(btf, t->type);
- if (!btf_type_is_ptr(t))
- /* should never happen in valid vmlinux build */
+
+ /* The func_proto of "btf_trace_##tname" is generated from typedef without argument
+ * names. Thus using bpf_raw_event_map to get argument names.
+ */
+ btp = bpf_get_raw_tracepoint(tname);
+ if (!btp)
return -EINVAL;
+ fname = kallsyms_lookup((unsigned long)btp->bpf_func, NULL, NULL, NULL,
+ trace_symbol);
+ bpf_put_raw_tracepoint(btp);
+
+ if (fname)
+ ret = btf_find_by_name_kind(btf, fname, BTF_KIND_FUNC);
+
+ if (!fname || ret < 0) {
+ bpf_log(log, "Cannot find btf of tracepoint template, fall back to %s%s.\n",
+ prefix, tname);
+ t = btf_type_by_id(btf, t->type);
+ if (!btf_type_is_ptr(t))
+ /* should never happen in valid vmlinux build */
+ return -EINVAL;
+ } else {
+ t = btf_type_by_id(btf, ret);
+ if (!btf_type_is_func(t))
+ /* should never happen in valid vmlinux build */
+ return -EINVAL;
+ }
+
t = btf_type_by_id(btf, t->type);
if (!btf_type_is_func_proto(t))
/* should never happen in valid vmlinux build */
@@ -21047,6 +23427,33 @@ BTF_ID(func, __rcu_read_unlock)
#endif
BTF_SET_END(btf_id_deny)
+/* fexit and fmod_ret can't be used to attach to __noreturn functions.
+ * Currently, we must manually list all __noreturn functions here. Once a more
+ * robust solution is implemented, this workaround can be removed.
+ */
+BTF_SET_START(noreturn_deny)
+#ifdef CONFIG_IA32_EMULATION
+BTF_ID(func, __ia32_sys_exit)
+BTF_ID(func, __ia32_sys_exit_group)
+#endif
+#ifdef CONFIG_KUNIT
+BTF_ID(func, __kunit_abort)
+BTF_ID(func, kunit_try_catch_throw)
+#endif
+#ifdef CONFIG_MODULES
+BTF_ID(func, __module_put_and_kthread_exit)
+#endif
+#ifdef CONFIG_X86_64
+BTF_ID(func, __x64_sys_exit)
+BTF_ID(func, __x64_sys_exit_group)
+#endif
+BTF_ID(func, do_exit)
+BTF_ID(func, do_group_exit)
+BTF_ID(func, kthread_complete_and_exit)
+BTF_ID(func, kthread_exit)
+BTF_ID(func, make_task_dead)
+BTF_SET_END(noreturn_deny)
+
static bool can_be_sleepable(struct bpf_prog *prog)
{
if (prog->type == BPF_PROG_TYPE_TRACING) {
@@ -21123,9 +23530,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
prog->aux->attach_btf_trace = true;
return 0;
} else if (prog->expected_attach_type == BPF_TRACE_ITER) {
- if (!bpf_iter_prog_supported(prog))
- return -EINVAL;
- return 0;
+ return bpf_iter_prog_supported(prog);
}
if (prog->type == BPF_PROG_TYPE_LSM) {
@@ -21135,6 +23540,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
} else if (prog->type == BPF_PROG_TYPE_TRACING &&
btf_id_set_contains(&btf_id_deny, btf_id)) {
return -EINVAL;
+ } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT ||
+ prog->expected_attach_type == BPF_MODIFY_RETURN) &&
+ btf_id_set_contains(&noreturn_deny, btf_id)) {
+ verbose(env, "Attaching fexit/fmod_ret to __noreturn functions is rejected.\n");
+ return -EINVAL;
}
key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id);
@@ -21160,6 +23570,369 @@ struct btf *bpf_get_btf_vmlinux(void)
return btf_vmlinux;
}
+/*
+ * The add_fd_from_fd_array() is executed only if fd_array_cnt is non-zero. In
+ * this case expect that every file descriptor in the array is either a map or
+ * a BTF. Everything else is considered to be trash.
+ */
+static int add_fd_from_fd_array(struct bpf_verifier_env *env, int fd)
+{
+ struct bpf_map *map;
+ struct btf *btf;
+ CLASS(fd, f)(fd);
+ int err;
+
+ map = __bpf_map_get(f);
+ if (!IS_ERR(map)) {
+ err = __add_used_map(env, map);
+ if (err < 0)
+ return err;
+ return 0;
+ }
+
+ btf = __btf_get_by_fd(f);
+ if (!IS_ERR(btf)) {
+ err = __add_used_btf(env, btf);
+ if (err < 0)
+ return err;
+ return 0;
+ }
+
+ verbose(env, "fd %d is not pointing to valid bpf_map or btf\n", fd);
+ return PTR_ERR(map);
+}
+
+static int process_fd_array(struct bpf_verifier_env *env, union bpf_attr *attr, bpfptr_t uattr)
+{
+ size_t size = sizeof(int);
+ int ret;
+ int fd;
+ u32 i;
+
+ env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel);
+
+ /*
+ * The only difference between old (no fd_array_cnt is given) and new
+ * APIs is that in the latter case the fd_array is expected to be
+ * continuous and is scanned for map fds right away
+ */
+ if (!attr->fd_array_cnt)
+ return 0;
+
+ /* Check for integer overflow */
+ if (attr->fd_array_cnt >= (U32_MAX / size)) {
+ verbose(env, "fd_array_cnt is too big (%u)\n", attr->fd_array_cnt);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < attr->fd_array_cnt; i++) {
+ if (copy_from_bpfptr_offset(&fd, env->fd_array, i * size, size))
+ return -EFAULT;
+
+ ret = add_fd_from_fd_array(env, fd);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static bool can_fallthrough(struct bpf_insn *insn)
+{
+ u8 class = BPF_CLASS(insn->code);
+ u8 opcode = BPF_OP(insn->code);
+
+ if (class != BPF_JMP && class != BPF_JMP32)
+ return true;
+
+ if (opcode == BPF_EXIT || opcode == BPF_JA)
+ return false;
+
+ return true;
+}
+
+static bool can_jump(struct bpf_insn *insn)
+{
+ u8 class = BPF_CLASS(insn->code);
+ u8 opcode = BPF_OP(insn->code);
+
+ if (class != BPF_JMP && class != BPF_JMP32)
+ return false;
+
+ switch (opcode) {
+ case BPF_JA:
+ case BPF_JEQ:
+ case BPF_JNE:
+ case BPF_JLT:
+ case BPF_JLE:
+ case BPF_JGT:
+ case BPF_JGE:
+ case BPF_JSGT:
+ case BPF_JSGE:
+ case BPF_JSLT:
+ case BPF_JSLE:
+ case BPF_JCOND:
+ return true;
+ }
+
+ return false;
+}
+
+static int insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2])
+{
+ struct bpf_insn *insn = &prog->insnsi[idx];
+ int i = 0, insn_sz;
+ u32 dst;
+
+ insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
+ if (can_fallthrough(insn) && idx + 1 < prog->len)
+ succ[i++] = idx + insn_sz;
+
+ if (can_jump(insn)) {
+ dst = idx + jmp_offset(insn) + 1;
+ if (i == 0 || succ[0] != dst)
+ succ[i++] = dst;
+ }
+
+ return i;
+}
+
+/* Each field is a register bitmask */
+struct insn_live_regs {
+ u16 use; /* registers read by instruction */
+ u16 def; /* registers written by instruction */
+ u16 in; /* registers that may be alive before instruction */
+ u16 out; /* registers that may be alive after instruction */
+};
+
+/* Bitmask with 1s for all caller saved registers */
+#define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1)
+
+/* Compute info->{use,def} fields for the instruction */
+static void compute_insn_live_regs(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct insn_live_regs *info)
+{
+ struct call_summary cs;
+ u8 class = BPF_CLASS(insn->code);
+ u8 code = BPF_OP(insn->code);
+ u8 mode = BPF_MODE(insn->code);
+ u16 src = BIT(insn->src_reg);
+ u16 dst = BIT(insn->dst_reg);
+ u16 r0 = BIT(0);
+ u16 def = 0;
+ u16 use = 0xffff;
+
+ switch (class) {
+ case BPF_LD:
+ switch (mode) {
+ case BPF_IMM:
+ if (BPF_SIZE(insn->code) == BPF_DW) {
+ def = dst;
+ use = 0;
+ }
+ break;
+ case BPF_LD | BPF_ABS:
+ case BPF_LD | BPF_IND:
+ /* stick with defaults */
+ break;
+ }
+ break;
+ case BPF_LDX:
+ switch (mode) {
+ case BPF_MEM:
+ case BPF_MEMSX:
+ def = dst;
+ use = src;
+ break;
+ }
+ break;
+ case BPF_ST:
+ switch (mode) {
+ case BPF_MEM:
+ def = 0;
+ use = dst;
+ break;
+ }
+ break;
+ case BPF_STX:
+ switch (mode) {
+ case BPF_MEM:
+ def = 0;
+ use = dst | src;
+ break;
+ case BPF_ATOMIC:
+ switch (insn->imm) {
+ case BPF_CMPXCHG:
+ use = r0 | dst | src;
+ def = r0;
+ break;
+ case BPF_LOAD_ACQ:
+ def = dst;
+ use = src;
+ break;
+ case BPF_STORE_REL:
+ def = 0;
+ use = dst | src;
+ break;
+ default:
+ use = dst | src;
+ if (insn->imm & BPF_FETCH)
+ def = src;
+ else
+ def = 0;
+ }
+ break;
+ }
+ break;
+ case BPF_ALU:
+ case BPF_ALU64:
+ switch (code) {
+ case BPF_END:
+ use = dst;
+ def = dst;
+ break;
+ case BPF_MOV:
+ def = dst;
+ if (BPF_SRC(insn->code) == BPF_K)
+ use = 0;
+ else
+ use = src;
+ break;
+ default:
+ def = dst;
+ if (BPF_SRC(insn->code) == BPF_K)
+ use = dst;
+ else
+ use = dst | src;
+ }
+ break;
+ case BPF_JMP:
+ case BPF_JMP32:
+ switch (code) {
+ case BPF_JA:
+ case BPF_JCOND:
+ def = 0;
+ use = 0;
+ break;
+ case BPF_EXIT:
+ def = 0;
+ use = r0;
+ break;
+ case BPF_CALL:
+ def = ALL_CALLER_SAVED_REGS;
+ use = def & ~BIT(BPF_REG_0);
+ if (get_call_summary(env, insn, &cs))
+ use = GENMASK(cs.num_params, 1);
+ break;
+ default:
+ def = 0;
+ if (BPF_SRC(insn->code) == BPF_K)
+ use = dst;
+ else
+ use = dst | src;
+ }
+ break;
+ }
+
+ info->def = def;
+ info->use = use;
+}
+
+/* Compute may-live registers after each instruction in the program.
+ * The register is live after the instruction I if it is read by some
+ * instruction S following I during program execution and is not
+ * overwritten between I and S.
+ *
+ * Store result in env->insn_aux_data[i].live_regs.
+ */
+static int compute_live_registers(struct bpf_verifier_env *env)
+{
+ struct bpf_insn_aux_data *insn_aux = env->insn_aux_data;
+ struct bpf_insn *insns = env->prog->insnsi;
+ struct insn_live_regs *state;
+ int insn_cnt = env->prog->len;
+ int err = 0, i, j;
+ bool changed;
+
+ /* Use the following algorithm:
+ * - define the following:
+ * - I.use : a set of all registers read by instruction I;
+ * - I.def : a set of all registers written by instruction I;
+ * - I.in : a set of all registers that may be alive before I execution;
+ * - I.out : a set of all registers that may be alive after I execution;
+ * - insn_successors(I): a set of instructions S that might immediately
+ * follow I for some program execution;
+ * - associate separate empty sets 'I.in' and 'I.out' with each instruction;
+ * - visit each instruction in a postorder and update
+ * state[i].in, state[i].out as follows:
+ *
+ * state[i].out = U [state[s].in for S in insn_successors(i)]
+ * state[i].in = (state[i].out / state[i].def) U state[i].use
+ *
+ * (where U stands for set union, / stands for set difference)
+ * - repeat the computation while {in,out} fields changes for
+ * any instruction.
+ */
+ state = kvcalloc(insn_cnt, sizeof(*state), GFP_KERNEL);
+ if (!state) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < insn_cnt; ++i)
+ compute_insn_live_regs(env, &insns[i], &state[i]);
+
+ changed = true;
+ while (changed) {
+ changed = false;
+ for (i = 0; i < env->cfg.cur_postorder; ++i) {
+ int insn_idx = env->cfg.insn_postorder[i];
+ struct insn_live_regs *live = &state[insn_idx];
+ int succ_num;
+ u32 succ[2];
+ u16 new_out = 0;
+ u16 new_in = 0;
+
+ succ_num = insn_successors(env->prog, insn_idx, succ);
+ for (int s = 0; s < succ_num; ++s)
+ new_out |= state[succ[s]].in;
+ new_in = (new_out & ~live->def) | live->use;
+ if (new_out != live->out || new_in != live->in) {
+ live->in = new_in;
+ live->out = new_out;
+ changed = true;
+ }
+ }
+ }
+
+ for (i = 0; i < insn_cnt; ++i)
+ insn_aux[i].live_regs_before = state[i].in;
+
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "Live regs before insn:\n");
+ for (i = 0; i < insn_cnt; ++i) {
+ verbose(env, "%3d: ", i);
+ for (j = BPF_REG_0; j < BPF_REG_10; ++j)
+ if (insn_aux[i].live_regs_before & BIT(j))
+ verbose(env, "%d", j);
+ else
+ verbose(env, ".");
+ verbose(env, " ");
+ verbose_insn(env, &insns[i]);
+ if (bpf_is_ldimm64(&insns[i]))
+ i++;
+ }
+ }
+
+out:
+ kvfree(state);
+ kvfree(env->cfg.insn_postorder);
+ env->cfg.insn_postorder = NULL;
+ env->cfg.cur_postorder = 0;
+ return err;
+}
+
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
{
u64 start_time = ktime_get_ns();
@@ -21175,7 +23948,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
/* 'struct bpf_verifier_env' can be global, but since it's not small,
* allocate/free it every time bpf_check() is called
*/
- env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
+ env = kvzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
if (!env)
return -ENOMEM;
@@ -21191,7 +23964,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
env->insn_aux_data[i].orig_idx = i;
env->prog = *prog;
env->ops = bpf_verifier_ops[env->prog->type];
- env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel);
env->allow_ptr_leaks = bpf_allow_ptr_leaks(env->prog->aux->token);
env->allow_uninit_stack = bpf_allow_uninit_stack(env->prog->aux->token);
@@ -21214,6 +23986,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (ret)
goto err_unlock;
+ ret = process_fd_array(env, attr, uattr);
+ if (ret)
+ goto skip_full_check;
+
mark_verifier_state_clean(env);
if (IS_ERR(btf_vmlinux)) {
@@ -21234,12 +24010,16 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS;
env->explored_states = kvcalloc(state_htab_size(env),
- sizeof(struct bpf_verifier_state_list *),
+ sizeof(struct list_head),
GFP_USER);
ret = -ENOMEM;
if (!env->explored_states)
goto skip_full_check;
+ for (i = 0; i < state_htab_size(env); i++)
+ INIT_LIST_HEAD(&env->explored_states[i]);
+ INIT_LIST_HEAD(&env->free_list);
+
ret = check_btf_info_early(env, attr, uattr);
if (ret < 0)
goto skip_full_check;
@@ -21256,10 +24036,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (ret < 0)
goto skip_full_check;
- ret = check_attach_btf_id(env);
- if (ret)
- goto skip_full_check;
-
ret = resolve_pseudo_ldimm64(env);
if (ret < 0)
goto skip_full_check;
@@ -21274,6 +24050,18 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (ret < 0)
goto skip_full_check;
+ ret = check_attach_btf_id(env);
+ if (ret)
+ goto skip_full_check;
+
+ ret = compute_live_registers(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = mark_fastcall_patterns(env);
+ if (ret < 0)
+ goto skip_full_check;
+
ret = do_check_main(env);
ret = ret ?: do_check_subprogs(env);
@@ -21283,6 +24071,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
skip_full_check:
kvfree(env->explored_states);
+ /* might decrease stack depth, keep it before passes that
+ * allocate additional slots.
+ */
+ if (ret == 0)
+ ret = remove_fastcall_spills_fills(env);
+
if (ret == 0)
ret = check_max_stack_depth(env);
@@ -21400,7 +24194,9 @@ err_unlock:
if (!is_priv)
mutex_unlock(&bpf_verifier_lock);
vfree(env->insn_aux_data);
+ kvfree(env->insn_hist);
err_free_env:
- kfree(env);
+ kvfree(env->cfg.insn_postorder);
+ kvfree(env);
return ret;
}