From 52e7c083b417417bb4352ebf3a6409988b6af238 Mon Sep 17 00:00:00 2001
From: Quentin Monnet
Date: Wed, 26 Feb 2020 17:13:53 +0000
Subject: mailmap: Update email address

My Netronome address is no longer active. I am no maintainer, but get_maintainer.pl sometimes returns my name for a small number of files (BPF-related). Add an entry to .mailmap for good measure.

Signed-off-by: Quentin Monnet
Signed-off-by: Daniel Borkmann
Link: https://lore.kernel.org/bpf/20200226171353.18982-1-quentin@isovalent.com
Signed-off-by: Alexei Starovoitov
---
 .mailmap | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.mailmap b/.mailmap
index ffb8f28290c7..a0dfce8de1ba 100644
--- a/.mailmap
+++ b/.mailmap
@@ -225,6 +225,7 @@ Pratyush Anand
 Praveen BP
 Punit Agrawal
 Qais Yousef
+Quentin Monnet
 Quentin Perret
 Rafael J. Wysocki
 Rajesh Shah
-- cgit

From 1bc7896e9ef44fd77858b3ef0b8a6840be3a4494 Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Wed, 4 Mar 2020 11:11:04 -0800
Subject: bpf: Fix deadlock with rq_lock in bpf_send_signal()

When experimenting with the bpf_send_signal() helper in our production environment (5.2 based), we experienced a deadlock in NMI mode:

 #5 [ffffc9002219f770] queued_spin_lock_slowpath at ffffffff8110be24
 #6 [ffffc9002219f770] _raw_spin_lock_irqsave at ffffffff81a43012
 #7 [ffffc9002219f780] try_to_wake_up at ffffffff810e7ecd
 #8 [ffffc9002219f7e0] signal_wake_up_state at ffffffff810c7b55
 #9 [ffffc9002219f7f0] __send_signal at ffffffff810c8602
 #10 [ffffc9002219f830] do_send_sig_info at ffffffff810ca31a
 #11 [ffffc9002219f868] bpf_send_signal at ffffffff8119d227
 #12 [ffffc9002219f988] bpf_overflow_handler at ffffffff811d4140
 #13 [ffffc9002219f9e0] __perf_event_overflow at ffffffff811d68cf
 #14 [ffffc9002219fa10] perf_swevent_overflow at ffffffff811d6a09
 #15 [ffffc9002219fa38] ___perf_sw_event at ffffffff811e0f47
 #16 [ffffc9002219fc30] __schedule at ffffffff81a3e04d
 #17 [ffffc9002219fc90] schedule at ffffffff81a3e219
 #18 [ffffc9002219fca0] futex_wait_queue_me at ffffffff8113d1b9
 #19 [ffffc9002219fcd8] futex_wait at ffffffff8113e529
 #20 [ffffc9002219fdf0] do_futex at ffffffff8113ffbc
 #21 [ffffc9002219fec0] __x64_sys_futex at ffffffff81140d1c
 #22 [ffffc9002219ff38] do_syscall_64 at ffffffff81002602
 #23 [ffffc9002219ff50] entry_SYSCALL_64_after_hwframe at ffffffff81c00068

The above call stack is very similar to the issue addressed by commit eac9153f2b58 ("bpf/stackmap: Fix deadlock with rq_lock in bpf_get_stack()") from Song Liu. The only difference is that the bpf_send_signal() helper is involved instead of the bpf_get_stack() helper. The above deadlock is triggered with a perf_sw_event. Similar to commit eac9153f2b58, the almost identical reproducer below uses the tracepoint sched/sched_switch so the issue can be caught easily.
/* stress_test.c */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <fcntl.h>
#include <pthread.h>
#include <sys/mman.h>

#define THREAD_COUNT 1000

char *filename;

void *worker(void *p)
{
	void *ptr;
	int fd;
	char *pptr;

	fd = open(filename, O_RDONLY);
	if (fd < 0)
		return NULL;
	while (1) {
		struct timespec ts = {0, 1000 + rand() % 2000};

		ptr = mmap(NULL, 4096 * 64, PROT_READ, MAP_PRIVATE, fd, 0);
		usleep(1);
		if (ptr == MAP_FAILED) {
			printf("failed to mmap\n");
			break;
		}
		munmap(ptr, 4096 * 64);
		usleep(1);
		pptr = malloc(1);
		usleep(1);
		pptr[0] = 1;
		usleep(1);
		free(pptr);
		usleep(1);
		nanosleep(&ts, NULL);
	}
	close(fd);
	return NULL;
}

int main(int argc, char *argv[])
{
	void *ptr;
	int i;
	pthread_t threads[THREAD_COUNT];

	if (argc < 2)
		return 0;

	filename = argv[1];

	for (i = 0; i < THREAD_COUNT; i++) {
		if (pthread_create(threads + i, NULL, worker, NULL)) {
			fprintf(stderr, "Error creating thread\n");
			return 0;
		}
	}

	for (i = 0; i < THREAD_COUNT; i++)
		pthread_join(threads[i], NULL);
	return 0;
}

and the following steps:

1. run `stress_test /bin/ls` in one window
2. hack bcc trace.py with the following change:

--- a/tools/trace.py
+++ b/tools/trace.py
@@ -513,6 +513,7 @@ BPF_PERF_OUTPUT(%s);
         __data.tgid = __tgid;
         __data.pid = __pid;
         bpf_get_current_comm(&__data.comm, sizeof(__data.comm));
+        bpf_send_signal(10);
         %s
         %s
         %s.perf_submit(%s, &__data, sizeof(__data));

3. in a different window run ./trace.py -p $(pidof stress_test) t:sched:sched_switch

The deadlock can be reproduced in our production system.

Similar to Song's fix, the fix here is to delay sending the signal if irqs are disabled, to avoid deadlocks involving the rq_lock. With this change, the above stress test no longer causes a deadlock in our production system.

I also implemented a scaled-down version of the reproducer in the selftests (a subsequent commit). With the latest bpf-next, lockdep complains about the following potential deadlock.

[ 32.832450] -> #1 (&p->pi_lock){-.-.}:
[ 32.833100]    _raw_spin_lock_irqsave+0x44/0x80
[ 32.833696]    task_rq_lock+0x2c/0xa0
[ 32.834182]    task_sched_runtime+0x59/0xd0
[ 32.834721]    thread_group_cputime+0x250/0x270
[ 32.835304]    thread_group_cputime_adjusted+0x2e/0x70
[ 32.835959]    do_task_stat+0x8a7/0xb80
[ 32.836461]    proc_single_show+0x51/0xb0
             ...
[ 32.839512] -> #0 (&(&sighand->siglock)->rlock){....}:
[ 32.840275]    __lock_acquire+0x1358/0x1a20
[ 32.840826]    lock_acquire+0xc7/0x1d0
[ 32.841309]    _raw_spin_lock_irqsave+0x44/0x80
[ 32.841916]    __lock_task_sighand+0x79/0x160
[ 32.842465]    do_send_sig_info+0x35/0x90
[ 32.842977]    bpf_send_signal+0xa/0x10
[ 32.843464]    bpf_prog_bc13ed9e4d3163e3_send_signal_tp_sched+0x465/0x1000
[ 32.844301]    trace_call_bpf+0x115/0x270
[ 32.844809]    perf_trace_run_bpf_submit+0x4a/0xc0
[ 32.845411]    perf_trace_sched_switch+0x10f/0x180
[ 32.846014]    __schedule+0x45d/0x880
[ 32.846483]    schedule+0x5f/0xd0
             ...
[ 32.853148] Chain exists of:
[ 32.853148]   &(&sighand->siglock)->rlock --> &p->pi_lock --> &rq->lock
[ 32.853148]
[ 32.854451]  Possible unsafe locking scenario:
[ 32.854451]
[ 32.855173]        CPU0                    CPU1
[ 32.855745]        ----                    ----
[ 32.856278]   lock(&rq->lock);
[ 32.856671]                                lock(&p->pi_lock);
[ 32.857332]                                lock(&rq->lock);
[ 32.857999]   lock(&(&sighand->siglock)->rlock);

The deadlock happens on CPU0 when it tries to acquire &sighand->siglock, which is held by CPU1, while CPU1 tries to grab &rq->lock and cannot get it. This is not exactly the call stack from our production environment, but the symptom is similar: both locks are acquired with spin_lock_irqsave(), and both chains involve the rq_lock. The fix of delaying signal delivery while irqs are disabled also resolves this issue.
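[Editor's note] To illustrate the shape of the fix (not part of the original patch text): when irqs are disabled, the signal is not sent directly but handed off to an irq_work, so that do_send_sig_info() never runs while a lock such as the rq_lock may be held. The sketch below is simplified from kernel/trace/bpf_trace.c; the struct and helper names follow that file, but checks and error handling are abbreviated.

	struct send_signal_irq_work {
		struct irq_work irq_work;
		struct task_struct *task;
		u32 sig;
		enum pid_type type;
	};

	static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);

	/* Runs later from a safe context, outside the scheduler's locks. */
	static void do_bpf_send_signal(struct irq_work *entry)
	{
		struct send_signal_irq_work *work;

		work = container_of(entry, struct send_signal_irq_work, irq_work);
		group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type);
	}

	static int bpf_send_signal_common(u32 sig, enum pid_type type)
	{
		struct send_signal_irq_work *work;

		if (irqs_disabled()) {
			/* Do an early check on signal validity. Otherwise,
			 * the error is lost in deferred irq_work.
			 */
			if (unlikely(!valid_signal(sig)))
				return -EINVAL;

			/* A lock like rq_lock may be held here; defer delivery. */
			work = this_cpu_ptr(&send_signal_work);
			work->task = current;
			work->sig = sig;
			work->type = type;
			irq_work_queue(&work->irq_work);
			return 0;
		}

		return group_send_sig_info(sig, SEND_SIG_PRIV, current, type);
	}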
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Cc: Song Liu Link: https://lore.kernel.org/bpf/20200304191104.2796501-1-yhs@fb.com --- kernel/trace/bpf_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 19e793aa441a..68250d433bd7 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -732,7 +732,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type) if (unlikely(!nmi_uaccess_okay())) return -EPERM; - if (in_nmi()) { + if (irqs_disabled()) { /* Do an early check on signal validity. Otherwise, * the error is lost in deferred irq_work. */ -- cgit From c4ef2f3256e3bc008f98121cad39ee5467db07a6 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 4 Mar 2020 11:11:05 -0800 Subject: selftests/bpf: Add send_signal_sched_switch test Added one test, send_signal_sched_switch, to test bpf_send_signal() helper triggered by sched/sched_switch tracepoint. This test can be used to verify kernel deadlocks fixed by the previous commit. The test itself is heavily borrowed from Commit eac9153f2b58 ("bpf/stackmap: Fix deadlock with rq_lock in bpf_get_stack()"). Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Cc: Song Liu Link: https://lore.kernel.org/bpf/20200304191105.2796601-1-yhs@fb.com --- .../bpf/prog_tests/send_signal_sched_switch.c | 60 ++++++++++++++++++++++ .../selftests/bpf/progs/test_send_signal_kern.c | 6 +++ 2 files changed, 66 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/send_signal_sched_switch.c diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal_sched_switch.c b/tools/testing/selftests/bpf/prog_tests/send_signal_sched_switch.c new file mode 100644 index 000000000000..189a34a7addb --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/send_signal_sched_switch.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include "test_send_signal_kern.skel.h" + +static void sigusr1_handler(int signum) +{ +} + +#define THREAD_COUNT 100 + +static void *worker(void *p) +{ + int i; + + for ( i = 0; i < 1000; i++) + usleep(1); + + return NULL; +} + +void test_send_signal_sched_switch(void) +{ + struct test_send_signal_kern *skel; + pthread_t threads[THREAD_COUNT]; + u32 duration = 0; + int i, err; + + signal(SIGUSR1, sigusr1_handler); + + skel = test_send_signal_kern__open_and_load(); + if (CHECK(!skel, "skel_open_and_load", "skeleton open_and_load failed\n")) + return; + + skel->bss->pid = getpid(); + skel->bss->sig = SIGUSR1; + + err = test_send_signal_kern__attach(skel); + if (CHECK(err, "skel_attach", "skeleton attach failed\n")) + goto destroy_skel; + + for (i = 0; i < THREAD_COUNT; i++) { + err = pthread_create(threads + i, NULL, worker, NULL); + if (CHECK(err, "pthread_create", "Error creating thread, %s\n", + strerror(errno))) + goto destroy_skel; + } + + for (i = 0; i < THREAD_COUNT; i++) + pthread_join(threads[i], NULL); + +destroy_skel: + test_send_signal_kern__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c index 1acc91e87bfc..b4233d3efac2 100644 --- a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c +++ b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c @@ -31,6 +31,12 @@ int send_signal_tp(void *ctx) return bpf_send_signal_test(ctx); } +SEC("tracepoint/sched/sched_switch") +int 
send_signal_tp_sched(void *ctx)
+{
+	return bpf_send_signal_test(ctx);
+}
+
 SEC("perf_event")
 int send_signal_perf(void *ctx)
 {
-- cgit

From 8e5290e710f4ffe8e9f8813e2ed0397a94d7b2f1 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Wed, 4 Mar 2020 17:34:47 -0800
Subject: bpf: Return better error value in delete_elem for struct_ops map

The current always-succeed behavior in bpf_struct_ops_map_delete_elem() is not ideal for userspace tools. It can be improved to return a proper error value. If the value is in TOBEFREE, unregistration has already been requested and is in progress, waiting for the subsystem to clear the refcnt to zero, so return -EINPROGRESS. If it is in INIT, the struct_ops has not been registered yet, so return -ENOENT.

Fixes: 85d33df357b6 ("bpf: Introduce BPF_MAP_TYPE_STRUCT_OPS")
Signed-off-by: Martin KaFai Lau
Signed-off-by: Alexei Starovoitov
Link: https://lore.kernel.org/bpf/20200305013447.535326-1-kafai@fb.com
---
 kernel/bpf/bpf_struct_ops.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 042f95534f86..68a89a9f7ccd 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -482,13 +482,21 @@ static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
 	prev_state = cmpxchg(&st_map->kvalue.state,
 			     BPF_STRUCT_OPS_STATE_INUSE,
 			     BPF_STRUCT_OPS_STATE_TOBEFREE);
-	if (prev_state == BPF_STRUCT_OPS_STATE_INUSE) {
+	switch (prev_state) {
+	case BPF_STRUCT_OPS_STATE_INUSE:
 		st_map->st_ops->unreg(&st_map->kvalue.data);
 		if (refcount_dec_and_test(&st_map->kvalue.refcnt))
 			bpf_map_put(map);
+		return 0;
+	case BPF_STRUCT_OPS_STATE_TOBEFREE:
+		return -EINPROGRESS;
+	case BPF_STRUCT_OPS_STATE_INIT:
+		return -ENOENT;
+	default:
+		WARN_ON_ONCE(1);
+		/* Should never happen. Treat it as not found. */
+		return -ENOENT;
 	}
-
-	return 0;
 }

 static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
-- cgit

From 849b4d94582a966ecb533448415462846da1f0fa Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Wed, 4 Mar 2020 17:34:54 -0800
Subject: bpf: Do not allow map_freeze in struct_ops map

The struct_ops map cannot support map_freeze; otherwise, a struct_ops could not be unregistered from the subsystem.

Fixes: 85d33df357b6 ("bpf: Introduce BPF_MAP_TYPE_STRUCT_OPS")
Signed-off-by: Martin KaFai Lau
Signed-off-by: Alexei Starovoitov
Link: https://lore.kernel.org/bpf/20200305013454.535397-1-kafai@fb.com
---
 kernel/bpf/syscall.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a91ad518c050..0c7fb0d4836d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1510,6 +1510,11 @@ static int map_freeze(const union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);

+	if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+		fdput(f);
+		return -ENOTSUPP;
+	}
+
 	mutex_lock(&map->freeze_mutex);

 	if (map->writecnt) {
-- cgit

From 80f1f85036355e5581ec0b99913410345ad3491b Mon Sep 17 00:00:00 2001
From: Luke Nelson
Date: Thu, 5 Mar 2020 15:44:12 -0800
Subject: bpf, x32: Fix bug with JMP32 JSET BPF_X checking upper bits

The current x32 BPF JIT is incorrect for JMP32 JSET BPF_X when the upper 32 bits of the operand registers are non-zero in certain situations. The problem is in the following code:

	case BPF_JMP | BPF_JSET | BPF_X:
	case BPF_JMP32 | BPF_JSET | BPF_X:
	...
	/* and dreg_lo,sreg_lo */
	EMIT2(0x23, add_2reg(0xC0, sreg_lo, dreg_lo));
	/* and dreg_hi,sreg_hi */
	EMIT2(0x23, add_2reg(0xC0, sreg_hi, dreg_hi));
	/* or dreg_lo,dreg_hi */
	EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi));

This code checks the upper bits of the operand registers regardless of whether the instruction class is BPF_JMP32 or BPF_JMP (64-bit). The registers dreg_hi and dreg_lo are not loaded from the stack for BPF_JMP32; however, they can still be polluted with values from previous instructions. The following BPF program demonstrates the bug: the jset64 instruction loads the temporary registers and takes the jump, since ((u64)r7 & (u64)r8) is non-zero. The jset32 branch should _not_ be taken, as the lower 32 bits are all zero; however, the current JIT takes it due to the pollution of the temporary registers by the earlier jset64.

	mov64 r0, 0
	ld64 r7, 0x8000000000000000
	ld64 r8, 0x8000000000000000
	jset64 r7, r8, 1
	exit
	jset32 r7, r8, 1
	mov64 r0, 2
	exit

The expected return value of this program is 2; under the buggy x32 JIT it returns 0. The fix is to skip using the upper 32 bits for jset32 and to compare the upper 32 bits for jset64 only.

All tests in test_bpf.ko and selftests/bpf/test_verifier continue to pass with this change.

We found this bug using our automated verification tool, Serval.

Fixes: 69f827eb6e14 ("x32: bpf: implement jitting of JMP32")
Co-developed-by: Xi Wang
Signed-off-by: Xi Wang
Signed-off-by: Luke Nelson
Signed-off-by: Daniel Borkmann
Link: https://lore.kernel.org/bpf/20200305234416.31597-1-luke.r.nels@gmail.com
---
 arch/x86/net/bpf_jit_comp32.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index 393d251798c0..4d2a7a764602 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -2039,10 +2039,12 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 			}
 			/* and dreg_lo,sreg_lo */
 			EMIT2(0x23, add_2reg(0xC0, sreg_lo, dreg_lo));
-			/* and dreg_hi,sreg_hi */
-			EMIT2(0x23, add_2reg(0xC0, sreg_hi, dreg_hi));
-			/* or dreg_lo,dreg_hi */
-			EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi));
+			if (is_jmp64) {
+				/* and dreg_hi,sreg_hi */
+				EMIT2(0x23, add_2reg(0xC0, sreg_hi, dreg_hi));
+				/* or dreg_lo,dreg_hi */
+				EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi));
+			}
 			goto emit_cond_jmp;
 		}
 		case BPF_JMP | BPF_JSET | BPF_K:
-- cgit

From 93e5fbb18cec70b3b5c614f67b65388829113bdd Mon Sep 17 00:00:00 2001
From: Luke Nelson
Date: Thu, 5 Mar 2020 15:44:13 -0800
Subject: selftests: bpf: Add test for JMP32 JSET BPF_X with upper bits set

The existing tests attempt to check that JMP32 JSET ignores the upper bits in the operand registers. However, the tests missed one such bug in the x32 JIT that is only uncovered when a previous instruction pollutes the upper 32 bits of the registers. This patch adds a new test case that catches the bug by first executing a 64-bit JSET to pollute the upper 32 bits of the temporary registers, followed by a 32-bit JSET that should ignore them.
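[Editor's note] For reference (illustrative only, not from the patch): the semantics the JIT must preserve are easy to state in plain C. JSET branches when (dst & src) != 0; for JMP32 only the low 32 bits of each operand participate, while JMP (64-bit) uses the full registers. The function name below is made up for illustration.

	#include <stdbool.h>
	#include <stdint.h>

	/* Branch decision for BPF_JSET (sketch). */
	static bool jset_taken(uint64_t dst, uint64_t src, bool is_jmp64)
	{
		if (is_jmp64)
			return (dst & src) != 0;                /* full 64-bit test */
		return ((uint32_t)dst & (uint32_t)src) != 0;    /* low 32 bits only */
	}

With r7 = r8 = 0x8000000000000000, jset_taken(r7, r8, true) is true while jset_taken(r7, r8, false) is false, which is why the new test case below must return 2.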
Co-developed-by: Xi Wang Signed-off-by: Xi Wang Signed-off-by: Luke Nelson Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200305234416.31597-2-luke.r.nels@gmail.com --- tools/testing/selftests/bpf/verifier/jmp32.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/jmp32.c b/tools/testing/selftests/bpf/verifier/jmp32.c index bf0322eb5346..bd5cae4a7f73 100644 --- a/tools/testing/selftests/bpf/verifier/jmp32.c +++ b/tools/testing/selftests/bpf/verifier/jmp32.c @@ -61,6 +61,21 @@ }, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, +{ + "jset32: ignores upper bits", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_LD_IMM64(BPF_REG_7, 0x8000000000000000), + BPF_LD_IMM64(BPF_REG_8, 0x8000000000000000), + BPF_JMP_REG(BPF_JSET, BPF_REG_7, BPF_REG_8, 1), + BPF_EXIT_INSN(), + BPF_JMP32_REG(BPF_JSET, BPF_REG_7, BPF_REG_8, 1), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 2, +}, { "jset32: min/max deduction", .insns = { -- cgit From 62039c30c19dcab96621e074aeeb90da7100def7 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 9 Mar 2020 15:27:55 -0700 Subject: bpf: Initialize storage pointers to NULL to prevent freeing garbage pointer Local storage array isn't initialized, so if cgroup storage allocation fails for BPF_CGROUP_STORAGE_SHARED, error handling code will attempt to free uninitialized pointer for BPF_CGROUP_STORAGE_PERCPU storage type. Avoid this by always initializing storage pointers to NULLs. Fixes: 8bad74f9840f ("bpf: extend cgroup bpf core to allow multiple cgroup storage types") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200309222756.1018737-1-andriin@fb.com --- kernel/bpf/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9a500fadbef5..b2bc4c335bb1 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -302,8 +302,8 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], - *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL}; + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + struct bpf_cgroup_storage *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_prog_list *pl, *replace_pl = NULL; enum bpf_cgroup_storage_type stype; int err; -- cgit From 1d8006abaab4cb90f81add86e8d1bf9411add05a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 9 Mar 2020 15:40:17 -0700 Subject: bpf: Fix cgroup ref leak in cgroup_bpf_inherit on out-of-memory There is no compensating cgroup_bpf_put() for each ancestor cgroup in cgroup_bpf_inherit(). If compute_effective_progs returns error, those cgroups won't be freed ever. Fix it by putting them in cleanup code path. 
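[Editor's note] As a sketch of the acquire/release pairing involved (illustrative, simplified from kernel/bpf/cgroup.c; locals and error handling abbreviated): cgroup_bpf_inherit() takes a reference on every ancestor cgroup up front, so the error path has to walk the same ancestors and drop those references again, which is exactly the loop the fix adds.

	static int cgroup_bpf_inherit(struct cgroup *cgrp)
	{
		struct cgroup *p;
		int i;

		/* ... percpu ref and effective-array setup omitted ... */

		/* Acquire: keep each ancestor's bpf state alive. */
		for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
			cgroup_bpf_get(p);

		for (i = 0; i < NR; i++)
			if (compute_effective_progs(cgrp, i, &arrays[i]))
				goto cleanup;

		return 0;

	cleanup:
		/* Release: undo the gets taken above (this was missing). */
		for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
			cgroup_bpf_put(p);

		return -ENOMEM;
	}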
Fixes: e10360f815ca ("bpf: cgroup: prevent out-of-order release of cgroup bpf")
Signed-off-by: Andrii Nakryiko
Signed-off-by: Alexei Starovoitov
Acked-by: Roman Gushchin
Link: https://lore.kernel.org/bpf/20200309224017.1063297-1-andriin@fb.com
---
 kernel/bpf/cgroup.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index b2bc4c335bb1..4f1472409ef8 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -227,6 +227,9 @@ cleanup:
 	for (i = 0; i < NR; i++)
 		bpf_prog_array_free(arrays[i]);

+	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
+		cgroup_bpf_put(p);
+
 	percpu_ref_exit(&cgrp->bpf.refcnt);
 	return -ENOMEM;
-- cgit

From da6c7faeb103c493e505e87643272f70be586635 Mon Sep 17 00:00:00 2001
From: Yoshiki Komachi
Date: Tue, 10 Mar 2020 16:32:29 +0900
Subject: bpf/btf: Fix BTF verification of enum members in struct/union

btf_enum_check_member() currently assumes that the size of an "enum" member in a struct/union is always the size of "int", even when the member uses a packed enum. This patch fixes BTF enum verification to use the member's actual size.

Fixes: 179cde8cef7e ("bpf: btf: Check members of struct/union")
Signed-off-by: Yoshiki Komachi
Signed-off-by: Alexei Starovoitov
Link: https://lore.kernel.org/bpf/1583825550-18606-2-git-send-email-komachi.yoshiki@gmail.com
---
 kernel/bpf/btf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 787140095e58..32ab9225026e 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -2418,7 +2418,7 @@ static int btf_enum_check_member(struct btf_verifier_env *env,
 	struct_size = struct_type->size;
 	bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
-	if (struct_size - bytes_offset < sizeof(int)) {
+	if (struct_size - bytes_offset < member_type->size) {
 		btf_verifier_log_member(env, struct_type, member,
 					"Member exceeds struct_size");
 		return -EINVAL;
-- cgit

From 6ffe559a77d1c963a3567f7a39a5419bdcdc4f1c Mon Sep 17 00:00:00 2001
From: Yoshiki Komachi
Date: Tue, 10 Mar 2020 16:32:30 +0900
Subject: selftests/bpf: Add test for the packed enum member in struct/union

Add a simple test to the existing selftest program to make sure that a packed enum member in a struct does not exceed the struct_size.
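[Editor's note] A small userspace illustration of the layout the test encodes (assumes GCC/Clang packed-enum semantics, with the attribute applied to the enum definition; not part of the patch): the packed enum member occupies a single byte, so a 2-byte struct is valid even though a full sizeof(int) would not fit after the char member. That is exactly why the check must use the member's own size.

	enum E { E0, E1 } __attribute__((packed));   /* stored in 1 byte */

	struct A {
		char m;
		enum E n;   /* member size is 1, not sizeof(int) */
	};

	_Static_assert(sizeof(enum E) == 1, "packed enum is one byte");
	_Static_assert(sizeof(struct A) == 2, "char + packed enum fit in 2 bytes");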
Signed-off-by: Yoshiki Komachi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1583825550-18606-3-git-send-email-komachi.yoshiki@gmail.com --- tools/testing/selftests/bpf/test_btf.c | 42 ++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c index 93040ca83e60..8da77cda5f4a 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/test_btf.c @@ -1062,6 +1062,48 @@ static struct btf_raw_test raw_tests[] = { .err_str = "Member exceeds struct_size", }, +/* Test member unexceeds the size of struct + * + * enum E { + * E0, + * E1, + * }; + * + * struct A { + * char m; + * enum E __attribute__((packed)) n; + * }; + */ +{ + .descr = "size check test #5", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, sizeof(int)), + /* char */ /* [2] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 8, 1), + /* enum E { */ /* [3] */ + BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_ENUM, 0, 2), 1), + BTF_ENUM_ENC(NAME_TBD, 0), + BTF_ENUM_ENC(NAME_TBD, 1), + /* } */ + /* struct A { */ /* [4] */ + BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 2), + BTF_MEMBER_ENC(NAME_TBD, 2, 0), /* char m; */ + BTF_MEMBER_ENC(NAME_TBD, 3, 8),/* enum E __attribute__((packed)) n; */ + /* } */ + BTF_END_RAW, + }, + .str_sec = "\0E\0E0\0E1\0A\0m\0n", + .str_sec_size = sizeof("\0E\0E0\0E1\0A\0m\0n"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "size_check5_map", + .key_size = sizeof(int), + .value_size = 2, + .key_type_id = 1, + .value_type_id = 4, + .max_entries = 4, +}, + /* typedef const void * const_void_ptr; * struct A { * const_void_ptr m; -- cgit From 90db6d772f749e38171d04619a5e3cd8804a6d02 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 10 Mar 2020 09:41:48 -0700 Subject: bpf, sockmap: Remove bucket->lock from sock_{hash|map}_free The bucket->lock is not needed in the sock_hash_free and sock_map_free calls, in fact it is causing a splat due to being inside rcu block. | BUG: sleeping function called from invalid context at net/core/sock.c:2935 | in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 62, name: kworker/0:1 | 3 locks held by kworker/0:1/62: | #0: ffff88813b019748 ((wq_completion)events){+.+.}, at: process_one_work+0x1d7/0x5e0 | #1: ffffc900000abe50 ((work_completion)(&map->work)){+.+.}, at: process_one_work+0x1d7/0x5e0 | #2: ffff8881381f6df8 (&stab->lock){+...}, at: sock_map_free+0x26/0x180 | CPU: 0 PID: 62 Comm: kworker/0:1 Not tainted 5.5.0-04008-g7b083332376e #454 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014 | Workqueue: events bpf_map_free_deferred | Call Trace: | dump_stack+0x71/0xa0 | ___might_sleep.cold+0xa6/0xb6 | lock_sock_nested+0x28/0x90 | sock_map_free+0x5f/0x180 | bpf_map_free_deferred+0x58/0x80 | process_one_work+0x260/0x5e0 | worker_thread+0x4d/0x3e0 | kthread+0x108/0x140 | ? process_one_work+0x5e0/0x5e0 | ? kthread_park+0x90/0x90 | ret_from_fork+0x3a/0x50 The reason we have stab->lock and bucket->locks in sockmap code is to handle checking EEXIST in update/delete cases. We need to be careful during an update operation that we check for EEXIST and we need to ensure that the psock object is not in some partial state of removal/insertion while we do this. So both map_update_common and sock_map_delete need to guard from being run together potentially deleting an entry we are checking, etc. 
But by the time we get to the tear-down code in sock_{map|hash}_free, we have already disconnected the map and have just done synchronize_rcu() in the line above, so no updates/deletes should be in flight. Because of this we can drop the bucket locks from the map freeing code.

Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface")
Reported-by: Jakub Sitnicki
Suggested-by: Jakub Sitnicki
Signed-off-by: John Fastabend
Signed-off-by: Daniel Borkmann
Link: https://lore.kernel.org/bpf/158385850787.30597.8346421465837046618.stgit@john-Precision-5820-Tower
---
 net/core/sock_map.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 085cef5857bb..b70c844a88ec 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -233,8 +233,11 @@ static void sock_map_free(struct bpf_map *map)
 	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
 	int i;

+	/* After the sync no updates or deletes will be in-flight so it
+	 * is safe to walk map and remove entries without risking a race
+	 * in EEXIST update case.
+	 */
 	synchronize_rcu();
-	raw_spin_lock_bh(&stab->lock);
 	for (i = 0; i < stab->map.max_entries; i++) {
 		struct sock **psk = &stab->sks[i];
 		struct sock *sk;
@@ -248,7 +251,6 @@ static void sock_map_free(struct bpf_map *map)
 			release_sock(sk);
 		}
 	}
-	raw_spin_unlock_bh(&stab->lock);

 	/* wait for psock readers accessing its map link */
 	synchronize_rcu();
@@ -863,10 +865,13 @@ static void sock_hash_free(struct bpf_map *map)
 	struct hlist_node *node;
 	int i;

+	/* After the sync no updates or deletes will be in-flight so it
+	 * is safe to walk map and remove entries without risking a race
+	 * in EEXIST update case.
+	 */
 	synchronize_rcu();
 	for (i = 0; i < htab->buckets_num; i++) {
 		bucket = sock_hash_select_bucket(htab, i);
-		raw_spin_lock_bh(&bucket->lock);
 		hlist_for_each_entry_safe(elem, node, &bucket->head, node) {
 			hlist_del_rcu(&elem->node);
 			lock_sock(elem->sk);
@@ -875,7 +880,6 @@ static void sock_hash_free(struct bpf_map *map)
 			rcu_read_unlock();
 			release_sock(elem->sk);
 		}
-		raw_spin_unlock_bh(&bucket->lock);
 	}

 	/* wait for psock readers accessing its map link */
-- cgit
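[Editor's note] For context, an illustrative sketch (not taken from the patch) of the check-and-insert that stab->lock still protects on the update path, which is why the lock exists at all: without it, a concurrent delete could invalidate the EEXIST/ENOENT decision between reading the slot and writing it. Names are abbreviated from net/core/sock_map.c.

	raw_spin_lock_bh(&stab->lock);
	osk = stab->sks[idx];
	if (osk && flags == BPF_NOEXIST) {
		ret = -EEXIST;
	} else if (!osk && flags == BPF_EXIST) {
		ret = -ENOENT;
	} else {
		stab->sks[idx] = sk;   /* publish the new socket */
		ret = 0;
	}
	raw_spin_unlock_bh(&stab->lock);

During sock_map_free() and sock_hash_free(), after the synchronize_rcu() above, no such updater can still be running, so walking the entries without the lock is safe.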