summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/linux/bpf.h1
-rw-r--r--include/linux/bpf_mem_alloc.h1
-rw-r--r--include/linux/percpu.h1
-rw-r--r--kernel/bpf/helpers.c24
-rw-r--r--kernel/bpf/memalloc.c38
-rw-r--r--kernel/bpf/syscall.c6
-rw-r--r--mm/percpu.c35
-rw-r--r--tools/testing/selftests/bpf/prog_tests/test_bpf_ma.c20
-rw-r--r--tools/testing/selftests/bpf/progs/test_bpf_ma.c180
9 files changed, 270 insertions, 36 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b4b40b45962b..b4825d3cdb29 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2058,6 +2058,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec);
bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b);
void bpf_obj_free_timer(const struct btf_record *rec, void *obj);
void bpf_obj_free_fields(const struct btf_record *rec, void *obj);
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu);
struct bpf_map *bpf_map_get(u32 ufd);
struct bpf_map *bpf_map_get_with_uref(u32 ufd);
diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index d644bbb298af..bb1223b21308 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -11,6 +11,7 @@ struct bpf_mem_caches;
struct bpf_mem_alloc {
struct bpf_mem_caches __percpu *caches;
struct bpf_mem_cache __percpu *cache;
+ bool percpu;
struct work_struct work;
};
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 68fac2e7cbe6..8c677f185901 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -132,6 +132,7 @@ extern void __init setup_per_cpu_areas(void);
extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __alloc_size(1);
extern void __percpu *__alloc_percpu(size_t size, size_t align) __alloc_size(1);
extern void free_percpu(void __percpu *__pdata);
+extern size_t pcpu_alloc_size(void __percpu *__pdata);
DEFINE_FREE(free_percpu, void __percpu *, free_percpu(_T))
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index da058aead20c..e46ac288a108 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1811,8 +1811,6 @@ bpf_base_func_proto(enum bpf_func_id func_id)
}
}
-void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
-
void bpf_list_head_free(const struct btf_field *field, void *list_head,
struct bpf_spin_lock *spin_lock)
{
@@ -1844,7 +1842,7 @@ unlock:
* bpf_list_head which needs to be freed.
*/
migrate_disable();
- __bpf_obj_drop_impl(obj, field->graph_root.value_rec);
+ __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
migrate_enable();
}
}
@@ -1883,7 +1881,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
migrate_disable();
- __bpf_obj_drop_impl(obj, field->graph_root.value_rec);
+ __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
migrate_enable();
}
}
@@ -1915,8 +1913,10 @@ __bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign)
}
/* Must be called under migrate_disable(), as required by bpf_mem_free */
-void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu)
{
+ struct bpf_mem_alloc *ma;
+
if (rec && rec->refcount_off >= 0 &&
!refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) {
/* Object is refcounted and refcount_dec didn't result in 0
@@ -1928,10 +1928,14 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
if (rec)
bpf_obj_free_fields(rec, p);
+ if (percpu)
+ ma = &bpf_global_percpu_ma;
+ else
+ ma = &bpf_global_ma;
if (rec && rec->refcount_off >= 0)
- bpf_mem_free_rcu(&bpf_global_ma, p);
+ bpf_mem_free_rcu(ma, p);
else
- bpf_mem_free(&bpf_global_ma, p);
+ bpf_mem_free(ma, p);
}
__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
@@ -1939,7 +1943,7 @@ __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
struct btf_struct_meta *meta = meta__ign;
void *p = p__alloc;
- __bpf_obj_drop_impl(p, meta ? meta->record : NULL);
+ __bpf_obj_drop_impl(p, meta ? meta->record : NULL, false);
}
__bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign)
@@ -1983,7 +1987,7 @@ static int __bpf_list_add(struct bpf_list_node_kern *node,
*/
if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
/* Only called from BPF prog, no need to migrate_disable */
- __bpf_obj_drop_impl((void *)n - off, rec);
+ __bpf_obj_drop_impl((void *)n - off, rec, false);
return -EINVAL;
}
@@ -2082,7 +2086,7 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root,
*/
if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
/* Only called from BPF prog, no need to migrate_disable */
- __bpf_obj_drop_impl((void *)n - off, rec);
+ __bpf_obj_drop_impl((void *)n - off, rec, false);
return -EINVAL;
}
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 39ea316c55e7..5308e386380a 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -491,21 +491,17 @@ static int check_obj_size(struct bpf_mem_cache *c, unsigned int idx)
struct llist_node *first;
unsigned int obj_size;
- /* For per-cpu allocator, the size of free objects in free list doesn't
- * match with unit_size and now there is no way to get the size of
- * per-cpu pointer saved in free object, so just skip the checking.
- */
- if (c->percpu_size)
- return 0;
-
first = c->free_llist.first;
if (!first)
return 0;
- obj_size = ksize(first);
+ if (c->percpu_size)
+ obj_size = pcpu_alloc_size(((void **)first)[1]);
+ else
+ obj_size = ksize(first);
if (obj_size != c->unit_size) {
- WARN_ONCE(1, "bpf_mem_cache[%u]: unexpected object size %u, expect %u\n",
- idx, obj_size, c->unit_size);
+ WARN_ONCE(1, "bpf_mem_cache[%u]: percpu %d, unexpected object size %u, expect %u\n",
+ idx, c->percpu_size, obj_size, c->unit_size);
return -EINVAL;
}
return 0;
@@ -529,6 +525,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
/* room for llist_node and per-cpu pointer */
if (percpu)
percpu_size = LLIST_NODE_SZ + sizeof(void *);
+ ma->percpu = percpu;
if (size) {
pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
@@ -878,6 +875,17 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
return !ret ? NULL : ret + LLIST_NODE_SZ;
}
+static notrace int bpf_mem_free_idx(void *ptr, bool percpu)
+{
+ size_t size;
+
+ if (percpu)
+ size = pcpu_alloc_size(*((void **)ptr));
+ else
+ size = ksize(ptr - LLIST_NODE_SZ);
+ return bpf_mem_cache_idx(size);
+}
+
void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
{
int idx;
@@ -885,7 +893,7 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
if (!ptr)
return;
- idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
+ idx = bpf_mem_free_idx(ptr, ma->percpu);
if (idx < 0)
return;
@@ -899,7 +907,7 @@ void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
if (!ptr)
return;
- idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
+ idx = bpf_mem_free_idx(ptr, ma->percpu);
if (idx < 0)
return;
@@ -973,6 +981,12 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
return !ret ? NULL : ret + LLIST_NODE_SZ;
}
+/* The alignment of dynamic per-cpu area is 8, so c->unit_size and the
+ * actual size of dynamic per-cpu area will always be matched and there is
+ * no need to adjust size_index for per-cpu allocation. However for the
+ * simplicity of the implementation, use an unified size_index for both
+ * kmalloc and per-cpu allocation.
+ */
static __init int bpf_mem_cache_adjust_size(void)
{
unsigned int size;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 341f8cb4405c..a9b2cb500bf7 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -626,8 +626,6 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
bpf_timer_cancel_and_free(obj + rec->timer_off);
}
-extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
-
void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
{
const struct btf_field *fields;
@@ -662,8 +660,8 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
field->kptr.btf_id);
migrate_disable();
__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
- pointee_struct_meta->record :
- NULL);
+ pointee_struct_meta->record : NULL,
+ fields[i].type == BPF_KPTR_PERCPU);
migrate_enable();
} else {
field->kptr.dtor(xchgd_field);
diff --git a/mm/percpu.c b/mm/percpu.c
index a7665de8485f..60ed078e4cd0 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -2245,6 +2245,37 @@ static void pcpu_balance_workfn(struct work_struct *work)
}
/**
+ * pcpu_alloc_size - the size of the dynamic percpu area
+ * @ptr: pointer to the dynamic percpu area
+ *
+ * Returns the size of the @ptr allocation. This is undefined for statically
+ * defined percpu variables as there is no corresponding chunk->bound_map.
+ *
+ * RETURNS:
+ * The size of the dynamic percpu area.
+ *
+ * CONTEXT:
+ * Can be called from atomic context.
+ */
+size_t pcpu_alloc_size(void __percpu *ptr)
+{
+ struct pcpu_chunk *chunk;
+ unsigned long bit_off, end;
+ void *addr;
+
+ if (!ptr)
+ return 0;
+
+ addr = __pcpu_ptr_to_addr(ptr);
+ /* No pcpu_lock here: ptr has not been freed, so chunk is still alive */
+ chunk = pcpu_chunk_addr_search(addr);
+ bit_off = (addr - chunk->base_addr) / PCPU_MIN_ALLOC_SIZE;
+ end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
+ bit_off + 1);
+ return (end - bit_off) * PCPU_MIN_ALLOC_SIZE;
+}
+
+/**
* free_percpu - free percpu area
* @ptr: pointer to area to free
*
@@ -2267,12 +2298,10 @@ void free_percpu(void __percpu *ptr)
kmemleak_free_percpu(ptr);
addr = __pcpu_ptr_to_addr(ptr);
-
- spin_lock_irqsave(&pcpu_lock, flags);
-
chunk = pcpu_chunk_addr_search(addr);
off = addr - chunk->base_addr;
+ spin_lock_irqsave(&pcpu_lock, flags);
size = pcpu_free_area(chunk, off);
pcpu_memcg_free_hook(chunk, off, size);
diff --git a/tools/testing/selftests/bpf/prog_tests/test_bpf_ma.c b/tools/testing/selftests/bpf/prog_tests/test_bpf_ma.c
index 0cca4e8ae38e..d3491a84b3b9 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_bpf_ma.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_bpf_ma.c
@@ -9,9 +9,10 @@
#include "test_bpf_ma.skel.h"
-void test_test_bpf_ma(void)
+static void do_bpf_ma_test(const char *name)
{
struct test_bpf_ma *skel;
+ struct bpf_program *prog;
struct btf *btf;
int i, err;
@@ -34,6 +35,11 @@ void test_test_bpf_ma(void)
skel->rodata->data_btf_ids[i] = id;
}
+ prog = bpf_object__find_program_by_name(skel->obj, name);
+ if (!ASSERT_OK_PTR(prog, "invalid prog name"))
+ goto out;
+ bpf_program__set_autoload(prog, true);
+
err = test_bpf_ma__load(skel);
if (!ASSERT_OK(err, "load"))
goto out;
@@ -48,3 +54,15 @@ void test_test_bpf_ma(void)
out:
test_bpf_ma__destroy(skel);
}
+
+void test_test_bpf_ma(void)
+{
+ if (test__start_subtest("batch_alloc_free"))
+ do_bpf_ma_test("test_batch_alloc_free");
+ if (test__start_subtest("free_through_map_free"))
+ do_bpf_ma_test("test_free_through_map_free");
+ if (test__start_subtest("batch_percpu_alloc_free"))
+ do_bpf_ma_test("test_batch_percpu_alloc_free");
+ if (test__start_subtest("percpu_free_through_map_free"))
+ do_bpf_ma_test("test_percpu_free_through_map_free");
+}
diff --git a/tools/testing/selftests/bpf/progs/test_bpf_ma.c b/tools/testing/selftests/bpf/progs/test_bpf_ma.c
index ecde41ae0fc8..b685a4aba6bd 100644
--- a/tools/testing/selftests/bpf/progs/test_bpf_ma.c
+++ b/tools/testing/selftests/bpf/progs/test_bpf_ma.c
@@ -37,10 +37,20 @@ int pid = 0;
__type(key, int); \
__type(value, struct map_value_##_size); \
__uint(max_entries, 128); \
- } array_##_size SEC(".maps");
+ } array_##_size SEC(".maps")
-static __always_inline void batch_alloc_free(struct bpf_map *map, unsigned int batch,
- unsigned int idx)
+#define DEFINE_ARRAY_WITH_PERCPU_KPTR(_size) \
+ struct map_value_percpu_##_size { \
+ struct bin_data_##_size __percpu_kptr * data; \
+ }; \
+ struct { \
+ __uint(type, BPF_MAP_TYPE_ARRAY); \
+ __type(key, int); \
+ __type(value, struct map_value_percpu_##_size); \
+ __uint(max_entries, 128); \
+ } array_percpu_##_size SEC(".maps")
+
+static __always_inline void batch_alloc(struct bpf_map *map, unsigned int batch, unsigned int idx)
{
struct generic_map_value *value;
unsigned int i, key;
@@ -65,6 +75,14 @@ static __always_inline void batch_alloc_free(struct bpf_map *map, unsigned int b
return;
}
}
+}
+
+static __always_inline void batch_free(struct bpf_map *map, unsigned int batch, unsigned int idx)
+{
+ struct generic_map_value *value;
+ unsigned int i, key;
+ void *old;
+
for (i = 0; i < batch; i++) {
key = i;
value = bpf_map_lookup_elem(map, &key);
@@ -81,8 +99,72 @@ static __always_inline void batch_alloc_free(struct bpf_map *map, unsigned int b
}
}
+static __always_inline void batch_percpu_alloc(struct bpf_map *map, unsigned int batch,
+ unsigned int idx)
+{
+ struct generic_map_value *value;
+ unsigned int i, key;
+ void *old, *new;
+
+ for (i = 0; i < batch; i++) {
+ key = i;
+ value = bpf_map_lookup_elem(map, &key);
+ if (!value) {
+ err = 1;
+ return;
+ }
+ /* per-cpu allocator may not be able to refill in time */
+ new = bpf_percpu_obj_new_impl(data_btf_ids[idx], NULL);
+ if (!new)
+ continue;
+
+ old = bpf_kptr_xchg(&value->data, new);
+ if (old) {
+ bpf_percpu_obj_drop(old);
+ err = 2;
+ return;
+ }
+ }
+}
+
+static __always_inline void batch_percpu_free(struct bpf_map *map, unsigned int batch,
+ unsigned int idx)
+{
+ struct generic_map_value *value;
+ unsigned int i, key;
+ void *old;
+
+ for (i = 0; i < batch; i++) {
+ key = i;
+ value = bpf_map_lookup_elem(map, &key);
+ if (!value) {
+ err = 3;
+ return;
+ }
+ old = bpf_kptr_xchg(&value->data, NULL);
+ if (!old)
+ continue;
+ bpf_percpu_obj_drop(old);
+ }
+}
+
+#define CALL_BATCH_ALLOC(size, batch, idx) \
+ batch_alloc((struct bpf_map *)(&array_##size), batch, idx)
+
#define CALL_BATCH_ALLOC_FREE(size, batch, idx) \
- batch_alloc_free((struct bpf_map *)(&array_##size), batch, idx)
+ do { \
+ batch_alloc((struct bpf_map *)(&array_##size), batch, idx); \
+ batch_free((struct bpf_map *)(&array_##size), batch, idx); \
+ } while (0)
+
+#define CALL_BATCH_PERCPU_ALLOC(size, batch, idx) \
+ batch_percpu_alloc((struct bpf_map *)(&array_percpu_##size), batch, idx)
+
+#define CALL_BATCH_PERCPU_ALLOC_FREE(size, batch, idx) \
+ do { \
+ batch_percpu_alloc((struct bpf_map *)(&array_percpu_##size), batch, idx); \
+ batch_percpu_free((struct bpf_map *)(&array_percpu_##size), batch, idx); \
+ } while (0)
DEFINE_ARRAY_WITH_KPTR(8);
DEFINE_ARRAY_WITH_KPTR(16);
@@ -97,8 +179,21 @@ DEFINE_ARRAY_WITH_KPTR(1024);
DEFINE_ARRAY_WITH_KPTR(2048);
DEFINE_ARRAY_WITH_KPTR(4096);
-SEC("fentry/" SYS_PREFIX "sys_nanosleep")
-int test_bpf_mem_alloc_free(void *ctx)
+/* per-cpu kptr doesn't support bin_data_8 which is a zero-sized array */
+DEFINE_ARRAY_WITH_PERCPU_KPTR(16);
+DEFINE_ARRAY_WITH_PERCPU_KPTR(32);
+DEFINE_ARRAY_WITH_PERCPU_KPTR(64);
+DEFINE_ARRAY_WITH_PERCPU_KPTR(96);
+DEFINE_ARRAY_WITH_PERCPU_KPTR(128);
+DEFINE_ARRAY_WITH_PERCPU_KPTR(192);
+DEFINE_ARRAY_WITH_PERCPU_KPTR(256);
+DEFINE_ARRAY_WITH_PERCPU_KPTR(512);
+DEFINE_ARRAY_WITH_PERCPU_KPTR(1024);
+DEFINE_ARRAY_WITH_PERCPU_KPTR(2048);
+DEFINE_ARRAY_WITH_PERCPU_KPTR(4096);
+
+SEC("?fentry/" SYS_PREFIX "sys_nanosleep")
+int test_batch_alloc_free(void *ctx)
{
if ((u32)bpf_get_current_pid_tgid() != pid)
return 0;
@@ -121,3 +216,76 @@ int test_bpf_mem_alloc_free(void *ctx)
return 0;
}
+
+SEC("?fentry/" SYS_PREFIX "sys_nanosleep")
+int test_free_through_map_free(void *ctx)
+{
+ if ((u32)bpf_get_current_pid_tgid() != pid)
+ return 0;
+
+ /* Alloc 128 8-bytes objects in batch to trigger refilling,
+ * then free these objects through map free.
+ */
+ CALL_BATCH_ALLOC(8, 128, 0);
+ CALL_BATCH_ALLOC(16, 128, 1);
+ CALL_BATCH_ALLOC(32, 128, 2);
+ CALL_BATCH_ALLOC(64, 128, 3);
+ CALL_BATCH_ALLOC(96, 128, 4);
+ CALL_BATCH_ALLOC(128, 128, 5);
+ CALL_BATCH_ALLOC(192, 128, 6);
+ CALL_BATCH_ALLOC(256, 128, 7);
+ CALL_BATCH_ALLOC(512, 64, 8);
+ CALL_BATCH_ALLOC(1024, 32, 9);
+ CALL_BATCH_ALLOC(2048, 16, 10);
+ CALL_BATCH_ALLOC(4096, 8, 11);
+
+ return 0;
+}
+
+SEC("?fentry/" SYS_PREFIX "sys_nanosleep")
+int test_batch_percpu_alloc_free(void *ctx)
+{
+ if ((u32)bpf_get_current_pid_tgid() != pid)
+ return 0;
+
+ /* Alloc 128 16-bytes per-cpu objects in batch to trigger refilling,
+ * then free 128 16-bytes per-cpu objects in batch to trigger freeing.
+ */
+ CALL_BATCH_PERCPU_ALLOC_FREE(16, 128, 1);
+ CALL_BATCH_PERCPU_ALLOC_FREE(32, 128, 2);
+ CALL_BATCH_PERCPU_ALLOC_FREE(64, 128, 3);
+ CALL_BATCH_PERCPU_ALLOC_FREE(96, 128, 4);
+ CALL_BATCH_PERCPU_ALLOC_FREE(128, 128, 5);
+ CALL_BATCH_PERCPU_ALLOC_FREE(192, 128, 6);
+ CALL_BATCH_PERCPU_ALLOC_FREE(256, 128, 7);
+ CALL_BATCH_PERCPU_ALLOC_FREE(512, 64, 8);
+ CALL_BATCH_PERCPU_ALLOC_FREE(1024, 32, 9);
+ CALL_BATCH_PERCPU_ALLOC_FREE(2048, 16, 10);
+ CALL_BATCH_PERCPU_ALLOC_FREE(4096, 8, 11);
+
+ return 0;
+}
+
+SEC("?fentry/" SYS_PREFIX "sys_nanosleep")
+int test_percpu_free_through_map_free(void *ctx)
+{
+ if ((u32)bpf_get_current_pid_tgid() != pid)
+ return 0;
+
+ /* Alloc 128 16-bytes per-cpu objects in batch to trigger refilling,
+ * then free these object through map free.
+ */
+ CALL_BATCH_PERCPU_ALLOC(16, 128, 1);
+ CALL_BATCH_PERCPU_ALLOC(32, 128, 2);
+ CALL_BATCH_PERCPU_ALLOC(64, 128, 3);
+ CALL_BATCH_PERCPU_ALLOC(96, 128, 4);
+ CALL_BATCH_PERCPU_ALLOC(128, 128, 5);
+ CALL_BATCH_PERCPU_ALLOC(192, 128, 6);
+ CALL_BATCH_PERCPU_ALLOC(256, 128, 7);
+ CALL_BATCH_PERCPU_ALLOC(512, 64, 8);
+ CALL_BATCH_PERCPU_ALLOC(1024, 32, 9);
+ CALL_BATCH_PERCPU_ALLOC(2048, 16, 10);
+ CALL_BATCH_PERCPU_ALLOC(4096, 8, 11);
+
+ return 0;
+}