From 5fad52bee30414270104525e3a0266327a6e9d11 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 2 Dec 2023 09:56:56 -0800 Subject: bpf: provide correct register name for exception callback retval check bpf_throw() is checking R1, so let's report R1 in the log. Acked-by: Eduard Zingerman Acked-by: Shung-Hsi Yu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231202175705.885270-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8e7b6072e3f4..25b9d470957e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11805,7 +11805,7 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env, return 0; } -static int check_return_code(struct bpf_verifier_env *env, int regno); +static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name); static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) @@ -11942,7 +11942,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, * to bpf_throw becomes the return value of the program. */ if (!env->exception_callback_subprog) { - err = check_return_code(env, BPF_REG_1); + err = check_return_code(env, BPF_REG_1, "R1"); if (err < 0) return err; } @@ -14972,7 +14972,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } -static int check_return_code(struct bpf_verifier_env *env, int regno) +static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name) { struct tnum enforce_attach_type_range = tnum_unknown; const struct bpf_prog *prog = env->prog; @@ -15026,7 +15026,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno) } if (!tnum_in(const_0, reg->var_off)) { - verbose_invalid_scalar(env, reg, &const_0, "async callback", "R0"); + verbose_invalid_scalar(env, reg, &const_0, "async callback", reg_name); return -EINVAL; } return 0; @@ -15126,7 +15126,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno) } if (!tnum_in(range, reg->var_off)) { - verbose_invalid_scalar(env, reg, &range, "program exit", "R0"); + verbose_invalid_scalar(env, reg, &range, "program exit", reg_name); if (prog->expected_attach_type == BPF_LSM_CGROUP && prog_type == BPF_PROG_TYPE_LSM && !prog->aux->attach_func_proto->type) @@ -17410,7 +17410,7 @@ process_bpf_exit_full: continue; } - err = check_return_code(env, BPF_REG_0); + err = check_return_code(env, BPF_REG_0, "R0"); if (err) return err; process_bpf_exit: -- cgit From 0acd03a5bd188b0c501d285d938439618bd855c4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 2 Dec 2023 09:56:57 -0800 Subject: bpf: enforce precision of R0 on callback return Given verifier checks actual value, r0 has to be precise, so we need to propagate precision properly. r0 also has to be marked as read, otherwise subsequent state comparisons will ignore such register as unimportant and precision won't really help here. Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") Acked-by: Eduard Zingerman Acked-by: Shung-Hsi Yu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231202175705.885270-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 25b9d470957e..849fbf47b5f3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9590,6 +9590,13 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) verbose(env, "R0 not a scalar value\n"); return -EACCES; } + + /* we are going to rely on register's precise value */ + err = mark_reg_read(env, r0, r0->parent, REG_LIVE_READ64); + err = err ?: mark_chain_precision(env, BPF_REG_0); + if (err) + return err; + if (!tnum_in(range, r0->var_off)) { verbose_invalid_scalar(env, r0, &range, "callback return", "R0"); return -EINVAL; -- cgit From 8fa4ecd49b81ccd9d1d87f1c8b2260e218644878 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 2 Dec 2023 09:56:58 -0800 Subject: bpf: enforce exact retval range on subprog/callback exit Instead of relying on potentially imprecise tnum representation of expected return value range for callbacks and subprogs, validate that smin/smax range satisfy exact expected range of return values. E.g., if callback would need to return [0, 2] range, tnum can't represent this precisely and instead will allow [0, 3] range. By checking smin/smax range, we can make sure that subprog/callback indeed returns only valid [0, 2] range. Acked-by: Eduard Zingerman Acked-by: Shung-Hsi Yu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231202175705.885270-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 849fbf47b5f3..f3d9d7de68da 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2305,6 +2305,11 @@ static void init_reg_state(struct bpf_verifier_env *env, regs[BPF_REG_FP].frameno = state->frameno; } +static struct bpf_retval_range retval_range(s32 minval, s32 maxval) +{ + return (struct bpf_retval_range){ minval, maxval }; +} + #define BPF_MAIN_FUNC (-1) static void init_func_state(struct bpf_verifier_env *env, struct bpf_func_state *state, @@ -2313,7 +2318,7 @@ static void init_func_state(struct bpf_verifier_env *env, state->callsite = callsite; state->frameno = frameno; state->subprogno = subprogno; - state->callback_ret_range = tnum_range(0, 0); + state->callback_ret_range = retval_range(0, 0); init_reg_state(env, state); mark_verifier_state_scratched(env); } @@ -9396,7 +9401,7 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env, return err; callee->in_callback_fn = true; - callee->callback_ret_range = tnum_range(0, 1); + callee->callback_ret_range = retval_range(0, 1); return 0; } @@ -9418,7 +9423,7 @@ static int set_loop_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; - callee->callback_ret_range = tnum_range(0, 1); + callee->callback_ret_range = retval_range(0, 1); return 0; } @@ -9448,7 +9453,7 @@ static int set_timer_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_async_callback_fn = true; - callee->callback_ret_range = tnum_range(0, 1); + callee->callback_ret_range = retval_range(0, 1); return 0; } @@ -9476,7 +9481,7 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; - callee->callback_ret_range = tnum_range(0, 1); + callee->callback_ret_range = retval_range(0, 1); return 0; } @@ -9499,7 +9504,7 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; - callee->callback_ret_range = tnum_range(0, 1); + callee->callback_ret_range = retval_range(0, 1); return 0; } @@ -9531,7 +9536,7 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; - callee->callback_ret_range = tnum_range(0, 1); + callee->callback_ret_range = retval_range(0, 1); return 0; } @@ -9560,6 +9565,11 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env) return is_rbtree_lock_required_kfunc(kfunc_btf_id); } +static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg) +{ + return range.minval <= reg->smin_value && reg->smax_value <= range.maxval; +} + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state, *prev_st; @@ -9583,9 +9593,6 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) caller = state->frame[state->curframe - 1]; if (callee->in_callback_fn) { - /* enforce R0 return value range [0, 1]. */ - struct tnum range = callee->callback_ret_range; - if (r0->type != SCALAR_VALUE) { verbose(env, "R0 not a scalar value\n"); return -EACCES; @@ -9597,7 +9604,11 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) if (err) return err; - if (!tnum_in(range, r0->var_off)) { + /* enforce R0 return value range */ + if (!retval_range_within(callee->callback_ret_range, r0)) { + struct tnum range = tnum_range(callee->callback_ret_range.minval, + callee->callback_ret_range.maxval); + verbose_invalid_scalar(env, r0, &range, "callback return", "R0"); return -EINVAL; } -- cgit From c871d0e00f0e8c207ce8ff89025e35cc49a8a3c3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 2 Dec 2023 09:57:00 -0800 Subject: bpf: enforce precise retval range on program exit Similarly to subprog/callback logic, enforce return value of BPF program using more precise smin/smax range. We need to adjust a bunch of tests due to a changed format of an error message. Acked-by: Eduard Zingerman Acked-by: Shung-Hsi Yu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231202175705.885270-7-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 56 ++++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 27 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f3d9d7de68da..9411c1046268 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -362,20 +362,23 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) static void verbose_invalid_scalar(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - struct tnum *range, const char *ctx, + struct bpf_retval_range range, const char *ctx, const char *reg_name) { - char tn_buf[48]; + bool unknown = true; - verbose(env, "At %s the register %s ", ctx, reg_name); - if (!tnum_is_unknown(reg->var_off)) { - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "has value %s", tn_buf); - } else { - verbose(env, "has unknown scalar value"); + verbose(env, "At %s the register %s has", ctx, reg_name); + if (reg->smin_value > S64_MIN) { + verbose(env, " smin=%lld", reg->smin_value); + unknown = false; } - tnum_strn(tn_buf, sizeof(tn_buf), *range); - verbose(env, " should have been in %s\n", tn_buf); + if (reg->smax_value < S64_MAX) { + verbose(env, " smax=%lld", reg->smax_value); + unknown = false; + } + if (unknown) + verbose(env, " unknown scalar value"); + verbose(env, " should have been in [%d, %d]\n", range.minval, range.maxval); } static bool type_may_be_null(u32 type) @@ -9606,10 +9609,8 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) /* enforce R0 return value range */ if (!retval_range_within(callee->callback_ret_range, r0)) { - struct tnum range = tnum_range(callee->callback_ret_range.minval, - callee->callback_ret_range.maxval); - - verbose_invalid_scalar(env, r0, &range, "callback return", "R0"); + verbose_invalid_scalar(env, r0, callee->callback_ret_range, + "callback return", "R0"); return -EINVAL; } if (!calls_callback(env, callee->callsite)) { @@ -14995,7 +14996,8 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char struct tnum enforce_attach_type_range = tnum_unknown; const struct bpf_prog *prog = env->prog; struct bpf_reg_state *reg; - struct tnum range = tnum_range(0, 1), const_0 = tnum_const(0); + struct bpf_retval_range range = retval_range(0, 1); + struct bpf_retval_range const_0 = retval_range(0, 0); enum bpf_prog_type prog_type = resolve_prog_type(env->prog); int err; struct bpf_func_state *frame = env->cur_state->frame[0]; @@ -15043,8 +15045,8 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char return -EINVAL; } - if (!tnum_in(const_0, reg->var_off)) { - verbose_invalid_scalar(env, reg, &const_0, "async callback", reg_name); + if (!retval_range_within(const_0, reg)) { + verbose_invalid_scalar(env, reg, const_0, "async callback", reg_name); return -EINVAL; } return 0; @@ -15070,14 +15072,14 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME || env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETSOCKNAME) - range = tnum_range(1, 1); + range = retval_range(1, 1); if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND || env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND) - range = tnum_range(0, 3); + range = retval_range(0, 3); break; case BPF_PROG_TYPE_CGROUP_SKB: if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { - range = tnum_range(0, 3); + range = retval_range(0, 3); enforce_attach_type_range = tnum_range(2, 3); } break; @@ -15090,13 +15092,13 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char case BPF_PROG_TYPE_RAW_TRACEPOINT: if (!env->prog->aux->attach_btf_id) return 0; - range = tnum_const(0); + range = retval_range(0, 0); break; case BPF_PROG_TYPE_TRACING: switch (env->prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: - range = tnum_const(0); + range = retval_range(0, 0); break; case BPF_TRACE_RAW_TP: case BPF_MODIFY_RETURN: @@ -15108,7 +15110,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char } break; case BPF_PROG_TYPE_SK_LOOKUP: - range = tnum_range(SK_DROP, SK_PASS); + range = retval_range(SK_DROP, SK_PASS); break; case BPF_PROG_TYPE_LSM: @@ -15122,12 +15124,12 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char /* Make sure programs that attach to void * hooks don't try to modify return value. */ - range = tnum_range(1, 1); + range = retval_range(1, 1); } break; case BPF_PROG_TYPE_NETFILTER: - range = tnum_range(NF_DROP, NF_ACCEPT); + range = retval_range(NF_DROP, NF_ACCEPT); break; case BPF_PROG_TYPE_EXT: /* freplace program can return anything as its return value @@ -15143,8 +15145,8 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char return -EINVAL; } - if (!tnum_in(range, reg->var_off)) { - verbose_invalid_scalar(env, reg, &range, "program exit", reg_name); + if (!retval_range_within(range, reg)) { + verbose_invalid_scalar(env, reg, range, "program exit", reg_name); if (prog->expected_attach_type == BPF_LSM_CGROUP && prog_type == BPF_PROG_TYPE_LSM && !prog->aux->attach_func_proto->type) -- cgit From 0ef24c8dfae24a4b8aa2e92eac20faecdc5502e5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 2 Dec 2023 09:57:01 -0800 Subject: bpf: unify async callback and program retval checks Use common logic to verify program return values and async callback return values. This allows to avoid duplication of any extra steps necessary, like precision marking, which will be added in the next patch. Acked-by: Eduard Zingerman Acked-by: Shung-Hsi Yu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231202175705.885270-8-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9411c1046268..c54944af1bcc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -367,7 +367,7 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env, { bool unknown = true; - verbose(env, "At %s the register %s has", ctx, reg_name); + verbose(env, "%s the register %s has", ctx, reg_name); if (reg->smin_value > S64_MIN) { verbose(env, " smin=%lld", reg->smin_value); unknown = false; @@ -9610,7 +9610,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) /* enforce R0 return value range */ if (!retval_range_within(callee->callback_ret_range, r0)) { verbose_invalid_scalar(env, r0, callee->callback_ret_range, - "callback return", "R0"); + "At callback return", "R0"); return -EINVAL; } if (!calls_callback(env, callee->callsite)) { @@ -14993,11 +14993,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name) { + const char *exit_ctx = "At program exit"; struct tnum enforce_attach_type_range = tnum_unknown; const struct bpf_prog *prog = env->prog; struct bpf_reg_state *reg; struct bpf_retval_range range = retval_range(0, 1); - struct bpf_retval_range const_0 = retval_range(0, 0); enum bpf_prog_type prog_type = resolve_prog_type(env->prog); int err; struct bpf_func_state *frame = env->cur_state->frame[0]; @@ -15039,17 +15039,9 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char if (frame->in_async_callback_fn) { /* enforce return zero from async callbacks like timer */ - if (reg->type != SCALAR_VALUE) { - verbose(env, "In async callback the register R%d is not a known value (%s)\n", - regno, reg_type_str(env, reg->type)); - return -EINVAL; - } - - if (!retval_range_within(const_0, reg)) { - verbose_invalid_scalar(env, reg, const_0, "async callback", reg_name); - return -EINVAL; - } - return 0; + exit_ctx = "At async callback return"; + range = retval_range(0, 0); + goto enforce_retval; } if (is_subprog && !frame->in_exception_callback_fn) { @@ -15139,15 +15131,17 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char return 0; } +enforce_retval: if (reg->type != SCALAR_VALUE) { - verbose(env, "At program exit the register R%d is not a known value (%s)\n", - regno, reg_type_str(env, reg->type)); + verbose(env, "%s the register R%d is not a known value (%s)\n", + exit_ctx, regno, reg_type_str(env, reg->type)); return -EINVAL; } if (!retval_range_within(range, reg)) { - verbose_invalid_scalar(env, reg, range, "program exit", reg_name); - if (prog->expected_attach_type == BPF_LSM_CGROUP && + verbose_invalid_scalar(env, reg, range, exit_ctx, reg_name); + if (!is_subprog && + prog->expected_attach_type == BPF_LSM_CGROUP && prog_type == BPF_PROG_TYPE_LSM && !prog->aux->attach_func_proto->type) verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); -- cgit From eabe518de533a4291996020977054a7a7b78c7d3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 2 Dec 2023 09:57:02 -0800 Subject: bpf: enforce precision of R0 on program/async callback return Given we enforce a valid range for program and async callback return value, we must mark R0 as precise to avoid incorrect state pruning. Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking") Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231202175705.885270-9-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c54944af1bcc..2cd150d6d141 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15138,6 +15138,10 @@ enforce_retval: return -EINVAL; } + err = mark_chain_precision(env, regno); + if (err) + return err; + if (!retval_range_within(range, reg)) { verbose_invalid_scalar(env, reg, range, exit_ctx, reg_name); if (!is_subprog && -- cgit From 81eff2e36481c5cf4a2ac906ae56c3fbd3e6f305 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 2 Dec 2023 09:57:05 -0800 Subject: bpf: simplify tnum output if a fully known constant Emit tnum representation as just a constant if all bits are known. Use decimal-vs-hex logic to determine exact format of emitted constant value, just like it's done for register range values. For that move tnum_strn() to kernel/bpf/log.c to reuse decimal-vs-hex determination logic and constants. Acked-by: Shung-Hsi Yu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231202175705.885270-12-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/log.c | 13 +++++++++++++ kernel/bpf/tnum.c | 6 ------ 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 3505f3e5ae96..55d019f30e91 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -539,6 +539,19 @@ static void verbose_snum(struct bpf_verifier_env *env, s64 num) verbose(env, "%#llx", num); } +int tnum_strn(char *str, size_t size, struct tnum a) +{ + /* print as a constant, if tnum is fully known */ + if (a.mask == 0) { + if (is_unum_decimal(a.value)) + return snprintf(str, size, "%llu", a.value); + else + return snprintf(str, size, "%#llx", a.value); + } + return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask); +} +EXPORT_SYMBOL_GPL(tnum_strn); + static void print_scalar_ranges(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, const char **sep) diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index f4c91c9b27d7..9dbc31b25e3d 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -172,12 +172,6 @@ bool tnum_in(struct tnum a, struct tnum b) return a.value == b.value; } -int tnum_strn(char *str, size_t size, struct tnum a) -{ - return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask); -} -EXPORT_SYMBOL_GPL(tnum_strn); - int tnum_sbin(char *str, size_t size, struct tnum a) { size_t n; -- cgit From 5bd90cdc65ef9ef5e13c9ff23620079db5c608a0 Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Sun, 3 Dec 2023 20:12:48 -0500 Subject: bpf: Minor logging improvement One place where we were logging a register was only logging the variable part, not also the fixed part. Signed-off-by: Andrei Matei Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20231204011248.2040084-1-andreimatei1@gmail.com --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2cd150d6d141..cdb4f5f0ba79 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6585,8 +6585,8 @@ static int check_stack_access_within_bounds( char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid variable-offset%s stack R%d var_off=%s size=%d\n", - err_extra, regno, tn_buf, access_size); + verbose(env, "invalid variable-offset%s stack R%d var_off=%s off=%d size=%d\n", + err_extra, regno, tn_buf, off, access_size); } } return err; -- cgit From 169410eba271afc9f0fb476d996795aa26770c6d Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 4 Dec 2023 22:04:19 +0800 Subject: bpf: Check rcu_read_lock_trace_held() before calling bpf map helpers These three bpf_map_{lookup,update,delete}_elem() helpers are also available for sleepable bpf program, so add the corresponding lock assertion for sleepable bpf program, otherwise the following warning will be reported when a sleepable bpf program manipulates bpf map under interpreter mode (aka bpf_jit_enable=0): WARNING: CPU: 3 PID: 4985 at kernel/bpf/helpers.c:40 ...... CPU: 3 PID: 4985 Comm: test_progs Not tainted 6.6.0+ #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) ...... RIP: 0010:bpf_map_lookup_elem+0x54/0x60 ...... Call Trace: ? __warn+0xa5/0x240 ? bpf_map_lookup_elem+0x54/0x60 ? report_bug+0x1ba/0x1f0 ? handle_bug+0x40/0x80 ? exc_invalid_op+0x18/0x50 ? asm_exc_invalid_op+0x1b/0x20 ? __pfx_bpf_map_lookup_elem+0x10/0x10 ? rcu_lockdep_current_cpu_online+0x65/0xb0 ? rcu_is_watching+0x23/0x50 ? bpf_map_lookup_elem+0x54/0x60 ? __pfx_bpf_map_lookup_elem+0x10/0x10 ___bpf_prog_run+0x513/0x3b70 __bpf_prog_run32+0x9d/0xd0 ? __bpf_prog_enter_sleepable_recur+0xad/0x120 ? __bpf_prog_enter_sleepable_recur+0x3e/0x120 bpf_trampoline_6442580665+0x4d/0x1000 __x64_sys_getpgid+0x5/0x30 ? do_syscall_64+0x36/0xb0 entry_SYSCALL_64_after_hwframe+0x6e/0x76 Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231204140425.1480317-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b45a8381f9bd..ee9bdf29246a 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -32,12 +32,13 @@ * * Different map implementations will rely on rcu in map methods * lookup/update/delete, therefore eBPF programs must run under rcu lock - * if program is allowed to access maps, so check rcu_read_lock_held in - * all three functions. + * if program is allowed to access maps, so check rcu_read_lock_held() or + * rcu_read_lock_trace_held() in all three functions. */ BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); return (unsigned long) map->ops->map_lookup_elem(map, key); } @@ -53,7 +54,8 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = { BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, void *, value, u64, flags) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); return map->ops->map_update_elem(map, key, value, flags); } @@ -70,7 +72,8 @@ const struct bpf_func_proto bpf_map_update_elem_proto = { BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); return map->ops->map_delete_elem(map, key); } -- cgit From 20c20bd11a0702ce4dc9300c3da58acf551d9725 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 4 Dec 2023 22:04:20 +0800 Subject: bpf: Add map and need_defer parameters to .map_fd_put_ptr() map is the pointer of outer map, and need_defer needs some explanation. need_defer tells the implementation to defer the reference release of the passed element and ensure that the element is still alive before the bpf program, which may manipulate it, exits. The following three cases will invoke map_fd_put_ptr() and different need_defer values will be passed to these callers: 1) release the reference of the old element in the map during map update or map deletion. The release must be deferred, otherwise the bpf program may incur use-after-free problem, so need_defer needs to be true. 2) release the reference of the to-be-added element in the error path of map update. The to-be-added element is not visible to any bpf program, so it is OK to pass false for need_defer parameter. 3) release the references of all elements in the map during map release. Any bpf program which has access to the map must have been exited and released, so need_defer=false will be OK. These two parameters will be used by the following patches to fix the potential use-after-free problem for map-in-map. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231204140425.1480317-3-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 12 +++++++----- kernel/bpf/hashtab.c | 6 +++--- kernel/bpf/map_in_map.c | 2 +- kernel/bpf/map_in_map.h | 2 +- 4 files changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 2058e89b5ddd..f9aed5909d6e 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -867,7 +867,7 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, } if (old_ptr) - map->ops->map_fd_put_ptr(old_ptr); + map->ops->map_fd_put_ptr(map, old_ptr, true); return 0; } @@ -890,7 +890,7 @@ static long fd_array_map_delete_elem(struct bpf_map *map, void *key) } if (old_ptr) { - map->ops->map_fd_put_ptr(old_ptr); + map->ops->map_fd_put_ptr(map, old_ptr, true); return 0; } else { return -ENOENT; @@ -913,8 +913,9 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, return prog; } -static void prog_fd_array_put_ptr(void *ptr) +static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { + /* bpf_prog is freed after one RCU or tasks trace grace period */ bpf_prog_put(ptr); } @@ -1239,8 +1240,9 @@ err_out: return ee; } -static void perf_event_fd_array_put_ptr(void *ptr) +static void perf_event_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { + /* bpf_perf_event is freed after one RCU grace period */ bpf_event_entry_free_rcu(ptr); } @@ -1294,7 +1296,7 @@ static void *cgroup_fd_array_get_ptr(struct bpf_map *map, return cgroup_get_from_fd(fd); } -static void cgroup_fd_array_put_ptr(void *ptr) +static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { /* cgroup_put free cgrp after a rcu grace period */ cgroup_put(ptr); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index fd8d4b0addfc..5b9146fa825f 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -897,7 +897,7 @@ static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) if (map->ops->map_fd_put_ptr) { ptr = fd_htab_map_get_ptr(map, l); - map->ops->map_fd_put_ptr(ptr); + map->ops->map_fd_put_ptr(map, ptr, true); } } @@ -2484,7 +2484,7 @@ static void fd_htab_map_free(struct bpf_map *map) hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { void *ptr = fd_htab_map_get_ptr(map, l); - map->ops->map_fd_put_ptr(ptr); + map->ops->map_fd_put_ptr(map, ptr, false); } } @@ -2525,7 +2525,7 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, ret = htab_map_update_elem(map, key, &ptr, map_flags); if (ret) - map->ops->map_fd_put_ptr(ptr); + map->ops->map_fd_put_ptr(map, ptr, false); return ret; } diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index cd5eafaba97e..2dfeb5835e16 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -127,7 +127,7 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map, return inner_map; } -void bpf_map_fd_put_ptr(void *ptr) +void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { /* ptr->ops->map_free() has to go through one * rcu grace period by itself. diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h index bcb7534afb3c..7d61602354de 100644 --- a/kernel/bpf/map_in_map.h +++ b/kernel/bpf/map_in_map.h @@ -13,7 +13,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd); void bpf_map_meta_free(struct bpf_map *map_meta); void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, int ufd); -void bpf_map_fd_put_ptr(void *ptr); +void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer); u32 bpf_map_fd_sys_lookup_elem(void *ptr); #endif -- cgit From 79d93b3c6ffd79abcd8e43345980aa1e904879c4 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 4 Dec 2023 22:04:21 +0800 Subject: bpf: Set need_defer as false when clearing fd array during map free Both map deletion operation, map release and map free operation use fd_array_map_delete_elem() to remove the element from fd array and need_defer is always true in fd_array_map_delete_elem(). For the map deletion operation and map release operation, need_defer=true is necessary, because the bpf program, which accesses the element in fd array, may still alive. However for map free operation, it is certain that the bpf program which owns the fd array has already been exited, so setting need_defer as false is appropriate for map free operation. So fix it by adding need_defer parameter to bpf_fd_array_map_clear() and adding a new helper __fd_array_map_delete_elem() to handle the map deletion, map release and map free operations correspondingly. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231204140425.1480317-4-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index f9aed5909d6e..4a4a67956e21 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -871,7 +871,7 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, return 0; } -static long fd_array_map_delete_elem(struct bpf_map *map, void *key) +static long __fd_array_map_delete_elem(struct bpf_map *map, void *key, bool need_defer) { struct bpf_array *array = container_of(map, struct bpf_array, map); void *old_ptr; @@ -890,13 +890,18 @@ static long fd_array_map_delete_elem(struct bpf_map *map, void *key) } if (old_ptr) { - map->ops->map_fd_put_ptr(map, old_ptr, true); + map->ops->map_fd_put_ptr(map, old_ptr, need_defer); return 0; } else { return -ENOENT; } } +static long fd_array_map_delete_elem(struct bpf_map *map, void *key) +{ + return __fd_array_map_delete_elem(map, key, true); +} + static void *prog_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { @@ -925,13 +930,13 @@ static u32 prog_fd_array_sys_lookup_elem(void *ptr) } /* decrement refcnt of all bpf_progs that are stored in this map */ -static void bpf_fd_array_map_clear(struct bpf_map *map) +static void bpf_fd_array_map_clear(struct bpf_map *map, bool need_defer) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; for (i = 0; i < array->map.max_entries; i++) - fd_array_map_delete_elem(map, &i); + __fd_array_map_delete_elem(map, &i, need_defer); } static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, @@ -1110,7 +1115,7 @@ static void prog_array_map_clear_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_array_aux, work)->map; - bpf_fd_array_map_clear(map); + bpf_fd_array_map_clear(map, true); bpf_map_put(map); } @@ -1260,7 +1265,7 @@ static void perf_event_fd_array_release(struct bpf_map *map, for (i = 0; i < array->map.max_entries; i++) { ee = READ_ONCE(array->ptrs[i]); if (ee && ee->map_file == map_file) - fd_array_map_delete_elem(map, &i); + __fd_array_map_delete_elem(map, &i, true); } rcu_read_unlock(); } @@ -1268,7 +1273,7 @@ static void perf_event_fd_array_release(struct bpf_map *map, static void perf_event_fd_array_map_free(struct bpf_map *map) { if (map->map_flags & BPF_F_PRESERVE_ELEMS) - bpf_fd_array_map_clear(map); + bpf_fd_array_map_clear(map, false); fd_array_map_free(map); } @@ -1304,7 +1309,7 @@ static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_de static void cgroup_fd_array_free(struct bpf_map *map) { - bpf_fd_array_map_clear(map); + bpf_fd_array_map_clear(map, false); fd_array_map_free(map); } @@ -1349,7 +1354,7 @@ static void array_of_map_free(struct bpf_map *map) * is protected by fdget/fdput. */ bpf_map_meta_free(map->inner_map_meta); - bpf_fd_array_map_clear(map); + bpf_fd_array_map_clear(map, false); fd_array_map_free(map); } -- cgit From 876673364161da50eed6b472d746ef88242b2368 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 4 Dec 2023 22:04:22 +0800 Subject: bpf: Defer the free of inner map when necessary When updating or deleting an inner map in map array or map htab, the map may still be accessed by non-sleepable program or sleepable program. However bpf_map_fd_put_ptr() decreases the ref-counter of the inner map directly through bpf_map_put(), if the ref-counter is the last one (which is true for most cases), the inner map will be freed by ops->map_free() in a kworker. But for now, most .map_free() callbacks don't use synchronize_rcu() or its variants to wait for the elapse of a RCU grace period, so after the invocation of ops->map_free completes, the bpf program which is accessing the inner map may incur use-after-free problem. Fix the free of inner map by invoking bpf_map_free_deferred() after both one RCU grace period and one tasks trace RCU grace period if the inner map has been removed from the outer map before. The deferment is accomplished by using call_rcu() or call_rcu_tasks_trace() when releasing the last ref-counter of bpf map. The newly-added rcu_head field in bpf_map shares the same storage space with work field to reduce the size of bpf_map. Fixes: bba1dc0b55ac ("bpf: Remove redundant synchronize_rcu.") Fixes: 638e4b825d52 ("bpf: Allows per-cpu maps and map-in-map in sleepable programs") Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231204140425.1480317-5-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/map_in_map.c | 11 ++++++++--- kernel/bpf/syscall.c | 32 +++++++++++++++++++++++++++----- 2 files changed, 35 insertions(+), 8 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 2dfeb5835e16..3248ff5d8161 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -129,10 +129,15 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map, void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { - /* ptr->ops->map_free() has to go through one - * rcu grace period by itself. + struct bpf_map *inner_map = ptr; + + /* The inner map may still be used by both non-sleepable and sleepable + * bpf program, so free it after one RCU grace period and one tasks + * trace RCU grace period. */ - bpf_map_put(ptr); + if (need_defer) + WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true); + bpf_map_put(inner_map); } u32 bpf_map_fd_sys_lookup_elem(void *ptr) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5e43ddd1b83f..dd515f6b9741 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -719,6 +719,28 @@ static void bpf_map_put_uref(struct bpf_map *map) } } +static void bpf_map_free_in_work(struct bpf_map *map) +{ + INIT_WORK(&map->work, bpf_map_free_deferred); + /* Avoid spawning kworkers, since they all might contend + * for the same mutex like slab_mutex. + */ + queue_work(system_unbound_wq, &map->work); +} + +static void bpf_map_free_rcu_gp(struct rcu_head *rcu) +{ + bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu)); +} + +static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu) +{ + if (rcu_trace_implies_rcu_gp()) + bpf_map_free_rcu_gp(rcu); + else + call_rcu(rcu, bpf_map_free_rcu_gp); +} + /* decrement map refcnt and schedule it for freeing via workqueue * (underlying map implementation ops->map_free() might sleep) */ @@ -728,11 +750,11 @@ void bpf_map_put(struct bpf_map *map) /* bpf_map_free_id() must be called first */ bpf_map_free_id(map); btf_put(map->btf); - INIT_WORK(&map->work, bpf_map_free_deferred); - /* Avoid spawning kworkers, since they all might contend - * for the same mutex like slab_mutex. - */ - queue_work(system_unbound_wq, &map->work); + + if (READ_ONCE(map->free_after_mult_rcu_gp)) + call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp); + else + bpf_map_free_in_work(map); } } EXPORT_SYMBOL_GPL(bpf_map_put); -- cgit From af66bfd3c8538ed21cf72af18426fc4a408665cf Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 4 Dec 2023 22:04:23 +0800 Subject: bpf: Optimize the free of inner map When removing the inner map from the outer map, the inner map will be freed after one RCU grace period and one RCU tasks trace grace period, so it is certain that the bpf program, which may access the inner map, has exited before the inner map is freed. However there is no need to wait for one RCU tasks trace grace period if the outer map is only accessed by non-sleepable program. So adding sleepable_refcnt in bpf_map and increasing sleepable_refcnt when adding the outer map into env->used_maps for sleepable program. Although the max number of bpf program is INT_MAX - 1, the number of bpf programs which are being loaded may be greater than INT_MAX, so using atomic64_t instead of atomic_t for sleepable_refcnt. When removing the inner map from the outer map, using sleepable_refcnt to decide whether or not a RCU tasks trace grace period is needed before freeing the inner map. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231204140425.1480317-6-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 4 ++++ kernel/bpf/map_in_map.c | 14 +++++++++----- kernel/bpf/syscall.c | 8 ++++++++ kernel/bpf/verifier.c | 4 +++- 4 files changed, 24 insertions(+), 6 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index cd3afe57ece3..4b813da8d6c0 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2664,12 +2664,16 @@ void __bpf_free_used_maps(struct bpf_prog_aux *aux, struct bpf_map **used_maps, u32 len) { struct bpf_map *map; + bool sleepable; u32 i; + sleepable = aux->sleepable; for (i = 0; i < len; i++) { map = used_maps[i]; if (map->ops->map_poke_untrack) map->ops->map_poke_untrack(map, aux); + if (sleepable) + atomic64_dec(&map->sleepable_refcnt); bpf_map_put(map); } } diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 3248ff5d8161..8ef269e66ba5 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -131,12 +131,16 @@ void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { struct bpf_map *inner_map = ptr; - /* The inner map may still be used by both non-sleepable and sleepable - * bpf program, so free it after one RCU grace period and one tasks - * trace RCU grace period. + /* Defer the freeing of inner map according to the sleepable attribute + * of bpf program which owns the outer map, so unnecessary waiting for + * RCU tasks trace grace period can be avoided. */ - if (need_defer) - WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true); + if (need_defer) { + if (atomic64_read(&map->sleepable_refcnt)) + WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true); + else + WRITE_ONCE(inner_map->free_after_rcu_gp, true); + } bpf_map_put(inner_map); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dd515f6b9741..ebaccf77d56e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -751,8 +751,11 @@ void bpf_map_put(struct bpf_map *map) bpf_map_free_id(map); btf_put(map->btf); + WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt)); if (READ_ONCE(map->free_after_mult_rcu_gp)) call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp); + else if (READ_ONCE(map->free_after_rcu_gp)) + call_rcu(&map->rcu, bpf_map_free_rcu_gp); else bpf_map_free_in_work(map); } @@ -5345,6 +5348,11 @@ static int bpf_prog_bind_map(union bpf_attr *attr) goto out_unlock; } + /* The bpf program will not access the bpf map, but for the sake of + * simplicity, increase sleepable_refcnt for sleepable program as well. + */ + if (prog->aux->sleepable) + atomic64_inc(&map->sleepable_refcnt); memcpy(used_maps_new, used_maps_old, sizeof(used_maps_old[0]) * prog->aux->used_map_cnt); used_maps_new[prog->aux->used_map_cnt] = map; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cdb4f5f0ba79..1ed39665f802 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17889,10 +17889,12 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) return -E2BIG; } + if (env->prog->aux->sleepable) + atomic64_inc(&map->sleepable_refcnt); /* hold the map. If the program is rejected by verifier, * the map will be released by release_maps() or it * will be used by the valid program until it's unloaded - * and all maps are released in free_used_maps() + * and all maps are released in bpf_free_used_maps() */ bpf_map_inc(map); -- cgit From 41f6f64e6999a837048b1bd13a2f8742964eca6b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 5 Dec 2023 10:42:39 -0800 Subject: bpf: support non-r10 register spill/fill to/from stack in precision tracking Use instruction (jump) history to record instructions that performed register spill/fill to/from stack, regardless if this was done through read-only r10 register, or any other register after copying r10 into it *and* potentially adjusting offset. To make this work reliably, we push extra per-instruction flags into instruction history, encoding stack slot index (spi) and stack frame number in extra 10 bit flags we take away from prev_idx in instruction history. We don't touch idx field for maximum performance, as it's checked most frequently during backtracking. This change removes basically the last remaining practical limitation of precision backtracking logic in BPF verifier. It fixes known deficiencies, but also opens up new opportunities to reduce number of verified states, explored in the subsequent patches. There are only three differences in selftests' BPF object files according to veristat, all in the positive direction (less states). File Program Insns (A) Insns (B) Insns (DIFF) States (A) States (B) States (DIFF) -------------------------------------- ------------- --------- --------- ------------- ---------- ---------- ------------- test_cls_redirect_dynptr.bpf.linked3.o cls_redirect 2987 2864 -123 (-4.12%) 240 231 -9 (-3.75%) xdp_synproxy_kern.bpf.linked3.o syncookie_tc 82848 82661 -187 (-0.23%) 5107 5073 -34 (-0.67%) xdp_synproxy_kern.bpf.linked3.o syncookie_xdp 85116 84964 -152 (-0.18%) 5162 5130 -32 (-0.62%) Note, I avoided renaming jmp_history to more generic insn_hist to minimize number of lines changed and potential merge conflicts between bpf and bpf-next trees. Notice also cur_hist_entry pointer reset to NULL at the beginning of instruction verification loop. This pointer avoids the problem of relying on last jump history entry's insn_idx to determine whether we already have entry for current instruction or not. It can happen that we added jump history entry because current instruction is_jmp_point(), but also we need to add instruction flags for stack access. In this case, we don't want to entries, so we need to reuse last added entry, if it is present. Relying on insn_idx comparison has the same ambiguity problem as the one that was fixed recently in [0], so we avoid that. [0] https://patchwork.kernel.org/project/netdevbpf/patch/20231110002638.4168352-3-andrii@kernel.org/ Acked-by: Eduard Zingerman Reported-by: Tao Lyu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231205184248.1502704-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 175 +++++++++++++++++++++++++++++--------------------- 1 file changed, 102 insertions(+), 73 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1ed39665f802..9bc16dc66465 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1355,8 +1355,8 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, int i, err; dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history, - src->jmp_history_cnt, sizeof(struct bpf_idx_pair), - GFP_USER); + src->jmp_history_cnt, sizeof(*dst_state->jmp_history), + GFP_USER); if (!dst_state->jmp_history) return -ENOMEM; dst_state->jmp_history_cnt = src->jmp_history_cnt; @@ -3221,6 +3221,21 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, return __check_reg_arg(env, state->regs, regno, t); } +static int insn_stack_access_flags(int frameno, int spi) +{ + return INSN_F_STACK_ACCESS | (spi << INSN_F_SPI_SHIFT) | frameno; +} + +static int insn_stack_access_spi(int insn_flags) +{ + return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK; +} + +static int insn_stack_access_frameno(int insn_flags) +{ + return insn_flags & INSN_F_FRAMENO_MASK; +} + static void mark_jmp_point(struct bpf_verifier_env *env, int idx) { env->insn_aux_data[idx].jmp_point = true; @@ -3232,28 +3247,51 @@ static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx) } /* for any branch, call, exit record the history of jmps in the given state */ -static int push_jmp_history(struct bpf_verifier_env *env, - struct bpf_verifier_state *cur) +static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, + int insn_flags) { u32 cnt = cur->jmp_history_cnt; - struct bpf_idx_pair *p; + struct bpf_jmp_history_entry *p; size_t alloc_size; - if (!is_jmp_point(env, env->insn_idx)) + /* combine instruction flags if we already recorded this instruction */ + if (env->cur_hist_ent) { + /* atomic instructions push insn_flags twice, for READ and + * WRITE sides, but they should agree on stack slot + */ + WARN_ONCE((env->cur_hist_ent->flags & insn_flags) && + (env->cur_hist_ent->flags & insn_flags) != insn_flags, + "verifier insn history bug: insn_idx %d cur flags %x new flags %x\n", + env->insn_idx, env->cur_hist_ent->flags, insn_flags); + env->cur_hist_ent->flags |= insn_flags; return 0; + } cnt++; alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p))); p = krealloc(cur->jmp_history, alloc_size, GFP_USER); if (!p) return -ENOMEM; - p[cnt - 1].idx = env->insn_idx; - p[cnt - 1].prev_idx = env->prev_insn_idx; cur->jmp_history = p; + + p = &cur->jmp_history[cnt - 1]; + p->idx = env->insn_idx; + p->prev_idx = env->prev_insn_idx; + p->flags = insn_flags; cur->jmp_history_cnt = cnt; + env->cur_hist_ent = p; + return 0; } +static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st, + u32 hist_end, int insn_idx) +{ + if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx) + return &st->jmp_history[hist_end - 1]; + return NULL; +} + /* Backtrack one insn at a time. If idx is not at the top of recorded * history then previous instruction came from straight line execution. * Return -ENOENT if we exhausted all instructions within given state. @@ -3415,9 +3453,14 @@ static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg) return bt->reg_masks[bt->frame] & (1 << reg); } +static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u32 slot) +{ + return bt->stack_masks[frame] & (1ull << slot); +} + static inline bool bt_is_slot_set(struct backtrack_state *bt, u32 slot) { - return bt->stack_masks[bt->frame] & (1ull << slot); + return bt_is_frame_slot_set(bt, bt->frame, slot); } /* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */ @@ -3471,7 +3514,7 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); * - *was* processed previously during backtracking. */ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, - struct backtrack_state *bt) + struct bpf_jmp_history_entry *hist, struct backtrack_state *bt) { const struct bpf_insn_cbs cbs = { .cb_call = disasm_kfunc_name, @@ -3484,7 +3527,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, u8 mode = BPF_MODE(insn->code); u32 dreg = insn->dst_reg; u32 sreg = insn->src_reg; - u32 spi, i; + u32 spi, i, fr; if (insn->code == 0) return 0; @@ -3545,20 +3588,15 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * by 'precise' mark in corresponding register of this state. * No further tracking necessary. */ - if (insn->src_reg != BPF_REG_FP) + if (!hist || !(hist->flags & INSN_F_STACK_ACCESS)) return 0; - /* dreg = *(u64 *)[fp - off] was a fill from the stack. * that [fp - off] slot contains scalar that needs to be * tracked with precision */ - spi = (-insn->off - 1) / BPF_REG_SIZE; - if (spi >= 64) { - verbose(env, "BUG spi %d\n", spi); - WARN_ONCE(1, "verifier backtracking bug"); - return -EFAULT; - } - bt_set_slot(bt, spi); + spi = insn_stack_access_spi(hist->flags); + fr = insn_stack_access_frameno(hist->flags); + bt_set_frame_slot(bt, fr, spi); } else if (class == BPF_STX || class == BPF_ST) { if (bt_is_reg_set(bt, dreg)) /* stx & st shouldn't be using _scalar_ dst_reg @@ -3567,17 +3605,13 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, */ return -ENOTSUPP; /* scalars can only be spilled into stack */ - if (insn->dst_reg != BPF_REG_FP) + if (!hist || !(hist->flags & INSN_F_STACK_ACCESS)) return 0; - spi = (-insn->off - 1) / BPF_REG_SIZE; - if (spi >= 64) { - verbose(env, "BUG spi %d\n", spi); - WARN_ONCE(1, "verifier backtracking bug"); - return -EFAULT; - } - if (!bt_is_slot_set(bt, spi)) + spi = insn_stack_access_spi(hist->flags); + fr = insn_stack_access_frameno(hist->flags); + if (!bt_is_frame_slot_set(bt, fr, spi)) return 0; - bt_clear_slot(bt, spi); + bt_clear_frame_slot(bt, fr, spi); if (class == BPF_STX) bt_set_reg(bt, sreg); } else if (class == BPF_JMP || class == BPF_JMP32) { @@ -3621,10 +3655,14 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, WARN_ONCE(1, "verifier backtracking bug"); return -EFAULT; } - /* we don't track register spills perfectly, - * so fallback to force-precise instead of failing */ - if (bt_stack_mask(bt) != 0) - return -ENOTSUPP; + /* we are now tracking register spills correctly, + * so any instance of leftover slots is a bug + */ + if (bt_stack_mask(bt) != 0) { + verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt)); + WARN_ONCE(1, "verifier backtracking bug (subprog leftover stack slots)"); + return -EFAULT; + } /* propagate r1-r5 to the caller */ for (i = BPF_REG_1; i <= BPF_REG_5; i++) { if (bt_is_reg_set(bt, i)) { @@ -3649,8 +3687,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, WARN_ONCE(1, "verifier backtracking bug"); return -EFAULT; } - if (bt_stack_mask(bt) != 0) - return -ENOTSUPP; + if (bt_stack_mask(bt) != 0) { + verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt)); + WARN_ONCE(1, "verifier backtracking bug (callback leftover stack slots)"); + return -EFAULT; + } /* clear r1-r5 in callback subprog's mask */ for (i = BPF_REG_1; i <= BPF_REG_5; i++) bt_clear_reg(bt, i); @@ -4087,6 +4128,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) for (;;) { DECLARE_BITMAP(mask, 64); u32 history = st->jmp_history_cnt; + struct bpf_jmp_history_entry *hist; if (env->log.level & BPF_LOG_LEVEL2) { verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n", @@ -4150,7 +4192,8 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) err = 0; skip_first = false; } else { - err = backtrack_insn(env, i, subseq_idx, bt); + hist = get_jmp_hist_entry(st, history, i); + err = backtrack_insn(env, i, subseq_idx, hist, bt); } if (err == -ENOTSUPP) { mark_all_scalars_precise(env, env->cur_state); @@ -4203,22 +4246,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr)); for_each_set_bit(i, mask, 64) { if (i >= func->allocated_stack / BPF_REG_SIZE) { - /* the sequence of instructions: - * 2: (bf) r3 = r10 - * 3: (7b) *(u64 *)(r3 -8) = r0 - * 4: (79) r4 = *(u64 *)(r10 -8) - * doesn't contain jmps. It's backtracked - * as a single block. - * During backtracking insn 3 is not recognized as - * stack access, so at the end of backtracking - * stack slot fp-8 is still marked in stack_mask. - * However the parent state may not have accessed - * fp-8 and it's "unallocated" stack space. - * In such case fallback to conservative. - */ - mark_all_scalars_precise(env, env->cur_state); - bt_reset(bt); - return 0; + verbose(env, "BUG backtracking (stack slot %d, total slots %d)\n", + i, func->allocated_stack / BPF_REG_SIZE); + WARN_ONCE(1, "verifier backtracking bug (stack slot out of bounds)"); + return -EFAULT; } if (!is_spilled_scalar_reg(&func->stack[i])) { @@ -4391,7 +4422,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; struct bpf_reg_state *reg = NULL; - u32 dst_reg = insn->dst_reg; + int insn_flags = insn_stack_access_flags(state->frameno, spi); err = grow_stack_state(state, round_up(slot + 1, BPF_REG_SIZE)); if (err) @@ -4432,17 +4463,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, mark_stack_slot_scratched(env, spi); if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) && !register_is_null(reg) && env->bpf_capable) { - if (dst_reg != BPF_REG_FP) { - /* The backtracking logic can only recognize explicit - * stack slot address like [fp - 8]. Other spill of - * scalar via different register has to be conservative. - * Backtrack from here and mark all registers as precise - * that contributed into 'reg' being a constant. - */ - err = mark_chain_precision(env, value_regno); - if (err) - return err; - } save_register_state(state, spi, reg, size); /* Break the relation on a narrowing spill. */ if (fls64(reg->umax_value) > BITS_PER_BYTE * size) @@ -4454,6 +4474,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, __mark_reg_known(&fake_reg, insn->imm); fake_reg.type = SCALAR_VALUE; save_register_state(state, spi, &fake_reg, size); + insn_flags = 0; /* not a register spill */ } else if (reg && is_spillable_regtype(reg->type)) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { @@ -4499,9 +4520,12 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, /* Mark slots affected by this stack write. */ for (i = 0; i < size; i++) - state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = - type; + state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = type; + insn_flags = 0; /* not a register spill */ } + + if (insn_flags) + return push_jmp_history(env, env->cur_state, insn_flags); return 0; } @@ -4694,6 +4718,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; struct bpf_reg_state *reg; u8 *stype, type; + int insn_flags = insn_stack_access_flags(reg_state->frameno, spi); stype = reg_state->stack[spi].slot_type; reg = ®_state->stack[spi].spilled_ptr; @@ -4739,12 +4764,10 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, return -EACCES; } mark_reg_unknown(env, state->regs, dst_regno); + insn_flags = 0; /* not restoring original register state */ } state->regs[dst_regno].live |= REG_LIVE_WRITTEN; - return 0; - } - - if (dst_regno >= 0) { + } else if (dst_regno >= 0) { /* restore register state from stack */ copy_register_state(&state->regs[dst_regno], reg); /* mark reg as written since spilled pointer state likely @@ -4780,7 +4803,10 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); if (dst_regno >= 0) mark_reg_stack_read(env, reg_state, off, off + size, dst_regno); + insn_flags = 0; /* we are not restoring spilled register */ } + if (insn_flags) + return push_jmp_history(env, env->cur_state, insn_flags); return 0; } @@ -6940,7 +6966,6 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i BPF_SIZE(insn->code), BPF_WRITE, -1, true, false); if (err) return err; - return 0; } @@ -16910,7 +16935,8 @@ hit: * the precision needs to be propagated back in * the current state. */ - err = err ? : push_jmp_history(env, cur); + if (is_jmp_point(env, env->insn_idx)) + err = err ? : push_jmp_history(env, cur, 0); err = err ? : propagate_precision(env, &sl->state); if (err) return err; @@ -17135,6 +17161,9 @@ static int do_check(struct bpf_verifier_env *env) u8 class; int err; + /* reset current history entry on each new instruction */ + env->cur_hist_ent = NULL; + env->prev_insn_idx = prev_insn_idx; if (env->insn_idx >= insn_cnt) { verbose(env, "invalid insn idx %d insn_cnt %d\n", @@ -17174,7 +17203,7 @@ static int do_check(struct bpf_verifier_env *env) } if (is_jmp_point(env, env->insn_idx)) { - err = push_jmp_history(env, state); + err = push_jmp_history(env, state, 0); if (err) return err; } -- cgit From ab125ed3ec1c10ccc36bc98c7a4256ad114a3dae Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 5 Dec 2023 10:42:41 -0800 Subject: bpf: fix check for attempt to corrupt spilled pointer When register is spilled onto a stack as a 1/2/4-byte register, we set slot_type[BPF_REG_SIZE - 1] (plus potentially few more below it, depending on actual spill size). So to check if some stack slot has spilled register we need to consult slot_type[7], not slot_type[0]. To avoid the need to remember and double-check this in the future, just use is_spilled_reg() helper. Fixes: 27113c59b6d0 ("bpf: Check the other end of slot_type for STACK_SPILL") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231205184248.1502704-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9bc16dc66465..3edca06de9fd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4431,7 +4431,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, * so it's aligned access and [off, off + size) are within stack limits */ if (!env->allow_ptr_leaks && - state->stack[spi].slot_type[0] == STACK_SPILL && + is_spilled_reg(&state->stack[spi]) && size != BPF_REG_SIZE) { verbose(env, "attempt to corrupt spilled pointer on stack\n"); return -EACCES; -- cgit From eaf18febd6ebc381aeb61543705148b3e28c7c47 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 5 Dec 2023 10:42:42 -0800 Subject: bpf: preserve STACK_ZERO slots on partial reg spills Instead of always forcing STACK_ZERO slots to STACK_MISC, preserve it in situations where this is possible. E.g., when spilling register as 1/2/4-byte subslots on the stack, all the remaining bytes in the stack slot do not automatically become unknown. If we knew they contained zeroes, we can preserve those STACK_ZERO markers. Add a helper mark_stack_slot_misc(), similar to scrub_spilled_slot(), but that doesn't overwrite either STACK_INVALID nor STACK_ZERO. Note that we need to take into account possibility of being in unprivileged mode, in which case STACK_INVALID is forced to STACK_MISC for correctness, as treating STACK_INVALID as equivalent STACK_MISC is only enabled in privileged mode. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231205184248.1502704-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3edca06de9fd..93de39a6e36e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1144,6 +1144,21 @@ static bool is_spilled_scalar_reg(const struct bpf_stack_state *stack) stack->spilled_ptr.type == SCALAR_VALUE; } +/* Mark stack slot as STACK_MISC, unless it is already STACK_INVALID, in which + * case they are equivalent, or it's STACK_ZERO, in which case we preserve + * more precise STACK_ZERO. + * Note, in uprivileged mode leaving STACK_INVALID is wrong, so we take + * env->allow_ptr_leaks into account and force STACK_MISC, if necessary. + */ +static void mark_stack_slot_misc(struct bpf_verifier_env *env, u8 *stype) +{ + if (*stype == STACK_ZERO) + return; + if (env->allow_ptr_leaks && *stype == STACK_INVALID) + return; + *stype = STACK_MISC; +} + static void scrub_spilled_slot(u8 *stype) { if (*stype != STACK_INVALID) @@ -4386,7 +4401,8 @@ static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_ dst->live = live; } -static void save_register_state(struct bpf_func_state *state, +static void save_register_state(struct bpf_verifier_env *env, + struct bpf_func_state *state, int spi, struct bpf_reg_state *reg, int size) { @@ -4401,7 +4417,7 @@ static void save_register_state(struct bpf_func_state *state, /* size < 8 bytes spill */ for (; i; i--) - scrub_spilled_slot(&state->stack[spi].slot_type[i - 1]); + mark_stack_slot_misc(env, &state->stack[spi].slot_type[i - 1]); } static bool is_bpf_st_mem(struct bpf_insn *insn) @@ -4463,7 +4479,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, mark_stack_slot_scratched(env, spi); if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) && !register_is_null(reg) && env->bpf_capable) { - save_register_state(state, spi, reg, size); + save_register_state(env, state, spi, reg, size); /* Break the relation on a narrowing spill. */ if (fls64(reg->umax_value) > BITS_PER_BYTE * size) state->stack[spi].spilled_ptr.id = 0; @@ -4473,7 +4489,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, __mark_reg_known(&fake_reg, insn->imm); fake_reg.type = SCALAR_VALUE; - save_register_state(state, spi, &fake_reg, size); + save_register_state(env, state, spi, &fake_reg, size); insn_flags = 0; /* not a register spill */ } else if (reg && is_spillable_regtype(reg->type)) { /* register containing pointer is being spilled into stack */ @@ -4486,7 +4502,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); return -EINVAL; } - save_register_state(state, spi, reg, size); + save_register_state(env, state, spi, reg, size); } else { u8 type = STACK_MISC; @@ -4757,6 +4773,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, continue; if (type == STACK_MISC) continue; + if (type == STACK_ZERO) + continue; if (type == STACK_INVALID && env->allow_uninit_stack) continue; verbose(env, "invalid read from stack off %d+%d size %d\n", -- cgit From e322f0bcb8d371f4606eaf141c7f967e1a79bcb7 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 5 Dec 2023 10:42:44 -0800 Subject: bpf: preserve constant zero when doing partial register restore Similar to special handling of STACK_ZERO, when reading 1/2/4 bytes from stack from slot that has register spilled into it and that register has a constant value zero, preserve that zero and mark spilled register as precise for that. This makes spilled const zero register and STACK_ZERO cases equivalent in their behavior. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231205184248.1502704-7-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 93de39a6e36e..1ebe76c98451 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4767,22 +4767,39 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, copy_register_state(&state->regs[dst_regno], reg); state->regs[dst_regno].subreg_def = subreg_def; } else { + int spill_cnt = 0, zero_cnt = 0; + for (i = 0; i < size; i++) { type = stype[(slot - i) % BPF_REG_SIZE]; - if (type == STACK_SPILL) + if (type == STACK_SPILL) { + spill_cnt++; continue; + } if (type == STACK_MISC) continue; - if (type == STACK_ZERO) + if (type == STACK_ZERO) { + zero_cnt++; continue; + } if (type == STACK_INVALID && env->allow_uninit_stack) continue; verbose(env, "invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; } - mark_reg_unknown(env, state->regs, dst_regno); - insn_flags = 0; /* not restoring original register state */ + + if (spill_cnt == size && + tnum_is_const(reg->var_off) && reg->var_off.value == 0) { + __mark_reg_const_zero(&state->regs[dst_regno]); + /* this IS register fill, so keep insn_flags */ + } else if (zero_cnt == size) { + /* similarly to mark_reg_stack_read(), preserve zeroes */ + __mark_reg_const_zero(&state->regs[dst_regno]); + insn_flags = 0; /* not restoring original register state */ + } else { + mark_reg_unknown(env, state->regs, dst_regno); + insn_flags = 0; /* not restoring original register state */ + } } state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } else if (dst_regno >= 0) { -- cgit From 18a433b62061e3d787bfc3e670fa711fecbd7cb4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 5 Dec 2023 10:42:46 -0800 Subject: bpf: track aligned STACK_ZERO cases as imprecise spilled registers Now that precision backtracing is supporting register spill/fill to/from stack, there is another oportunity to be exploited here: minimizing precise STACK_ZERO cases. With a simple code change we can rely on initially imprecise register spill tracking for cases when register spilled to stack was a known zero. This is a very common case for initializing on the stack variables, including rather large structures. Often times zero has no special meaning for the subsequent BPF program logic and is often overwritten with non-zero values soon afterwards. But due to STACK_ZERO vs STACK_MISC tracking, such initial zero initialization actually causes duplication of verifier states as STACK_ZERO is clearly different than STACK_MISC or spilled SCALAR_VALUE register. The effect of this (now) trivial change is huge, as can be seen below. These are differences between BPF selftests, Cilium, and Meta-internal BPF object files relative to previous patch in this series. You can see improvements ranging from single-digit percentage improvement for instructions and states, all the way to 50-60% reduction for some of Meta-internal host agent programs, and even some Cilium programs. For Meta-internal ones I left only the differences for largest BPF object files by states/instructions, as there were too many differences in the overall output. All the differences were improvements, reducting number of states and thus instructions validated. Note, Meta-internal BPF object file names are not printed below. Many copies of balancer_ingress are actually many different configurations of Katran, so they are different BPF programs, which explains state reduction going from -16% all the way to 31%, depending on BPF program logic complexity. I also tooked a closer look at a few small-ish BPF programs to validate the behavior. Let's take bpf_iter_netrlink.bpf.o (first row below). While it's just 8 vs 5 states, verifier log is still pretty long to include it here. But the reduction in states is due to the following piece of C code: unsigned long ino; ... sk = s->sk_socket; if (!sk) { ino = 0; } else { inode = SOCK_INODE(sk); bpf_probe_read_kernel(&ino, sizeof(ino), &inode->i_ino); } BPF_SEQ_PRINTF(seq, "%-8u %-8lu\n", s->sk_drops.counter, ino); return 0; You can see that in some situations `ino` is zero-initialized, while in others it's unknown value filled out by bpf_probe_read_kernel(). Before this change code after if/else branches have to be validated twice. Once with (precise) ino == 0, due to eager STACK_ZERO logic, and then again for when ino is just STACK_MISC. But BPF_SEQ_PRINTF() doesn't care about precise value of ino, so with the change in this patch verifier is able to prune states from after one of the branches, reducing number of total states (and instructions) required for successful validation. Similar principle applies to bigger real-world applications, just at a much larger scale. SELFTESTS ========= File Program Insns (A) Insns (B) Insns (DIFF) States (A) States (B) States (DIFF) --------------------------------------- ----------------------- --------- --------- --------------- ---------- ---------- ------------- bpf_iter_netlink.bpf.linked3.o dump_netlink 148 104 -44 (-29.73%) 8 5 -3 (-37.50%) bpf_iter_unix.bpf.linked3.o dump_unix 8474 8404 -70 (-0.83%) 151 147 -4 (-2.65%) bpf_loop.bpf.linked3.o stack_check 560 324 -236 (-42.14%) 42 24 -18 (-42.86%) local_storage_bench.bpf.linked3.o get_local 120 77 -43 (-35.83%) 9 6 -3 (-33.33%) loop6.bpf.linked3.o trace_virtqueue_add_sgs 10167 9868 -299 (-2.94%) 226 206 -20 (-8.85%) pyperf600_bpf_loop.bpf.linked3.o on_event 4872 3423 -1449 (-29.74%) 322 229 -93 (-28.88%) strobemeta.bpf.linked3.o on_event 180697 176036 -4661 (-2.58%) 4780 4734 -46 (-0.96%) test_cls_redirect.bpf.linked3.o cls_redirect 65594 65401 -193 (-0.29%) 4230 4212 -18 (-0.43%) test_global_func_args.bpf.linked3.o test_cls 145 136 -9 (-6.21%) 10 9 -1 (-10.00%) test_l4lb.bpf.linked3.o balancer_ingress 4760 2612 -2148 (-45.13%) 113 102 -11 (-9.73%) test_l4lb_noinline.bpf.linked3.o balancer_ingress 4845 4877 +32 (+0.66%) 219 221 +2 (+0.91%) test_l4lb_noinline_dynptr.bpf.linked3.o balancer_ingress 2072 2087 +15 (+0.72%) 97 98 +1 (+1.03%) test_seg6_loop.bpf.linked3.o __add_egr_x 12440 9975 -2465 (-19.82%) 364 353 -11 (-3.02%) test_tcp_hdr_options.bpf.linked3.o estab 2558 2572 +14 (+0.55%) 179 180 +1 (+0.56%) test_xdp_dynptr.bpf.linked3.o _xdp_tx_iptunnel 645 596 -49 (-7.60%) 26 24 -2 (-7.69%) test_xdp_noinline.bpf.linked3.o balancer_ingress_v6 3520 3516 -4 (-0.11%) 216 216 +0 (+0.00%) xdp_synproxy_kern.bpf.linked3.o syncookie_tc 82661 81241 -1420 (-1.72%) 5073 5155 +82 (+1.62%) xdp_synproxy_kern.bpf.linked3.o syncookie_xdp 84964 82297 -2667 (-3.14%) 5130 5157 +27 (+0.53%) META-INTERNAL ============= Program Insns (A) Insns (B) Insns (DIFF) States (A) States (B) States (DIFF) -------------------------------------- --------- --------- ----------------- ---------- ---------- --------------- balancer_ingress 27925 23608 -4317 (-15.46%) 1488 1482 -6 (-0.40%) balancer_ingress 31824 27546 -4278 (-13.44%) 1658 1652 -6 (-0.36%) balancer_ingress 32213 27935 -4278 (-13.28%) 1689 1683 -6 (-0.36%) balancer_ingress 32213 27935 -4278 (-13.28%) 1689 1683 -6 (-0.36%) balancer_ingress 31824 27546 -4278 (-13.44%) 1658 1652 -6 (-0.36%) balancer_ingress 38647 29562 -9085 (-23.51%) 2069 1835 -234 (-11.31%) balancer_ingress 38647 29562 -9085 (-23.51%) 2069 1835 -234 (-11.31%) balancer_ingress 40339 30792 -9547 (-23.67%) 2193 1934 -259 (-11.81%) balancer_ingress 37321 29055 -8266 (-22.15%) 1972 1795 -177 (-8.98%) balancer_ingress 38176 29753 -8423 (-22.06%) 2008 1831 -177 (-8.81%) balancer_ingress 29193 20910 -8283 (-28.37%) 1599 1422 -177 (-11.07%) balancer_ingress 30013 21452 -8561 (-28.52%) 1645 1447 -198 (-12.04%) balancer_ingress 28691 24290 -4401 (-15.34%) 1545 1531 -14 (-0.91%) balancer_ingress 34223 28965 -5258 (-15.36%) 1984 1875 -109 (-5.49%) balancer_ingress 35481 26158 -9323 (-26.28%) 2095 1806 -289 (-13.79%) balancer_ingress 35481 26158 -9323 (-26.28%) 2095 1806 -289 (-13.79%) balancer_ingress 35868 26455 -9413 (-26.24%) 2140 1827 -313 (-14.63%) balancer_ingress 35868 26455 -9413 (-26.24%) 2140 1827 -313 (-14.63%) balancer_ingress 35481 26158 -9323 (-26.28%) 2095 1806 -289 (-13.79%) balancer_ingress 35481 26158 -9323 (-26.28%) 2095 1806 -289 (-13.79%) balancer_ingress 34844 29485 -5359 (-15.38%) 2036 1918 -118 (-5.80%) fbflow_egress 3256 2652 -604 (-18.55%) 218 192 -26 (-11.93%) fbflow_ingress 1026 944 -82 (-7.99%) 70 63 -7 (-10.00%) sslwall_tc_egress 8424 7360 -1064 (-12.63%) 498 458 -40 (-8.03%) syar_accept_protect 15040 9539 -5501 (-36.58%) 364 220 -144 (-39.56%) syar_connect_tcp_v6 15036 9535 -5501 (-36.59%) 360 216 -144 (-40.00%) syar_connect_udp_v4 15039 9538 -5501 (-36.58%) 361 217 -144 (-39.89%) syar_connect_connect4_protect4 24805 15833 -8972 (-36.17%) 756 480 -276 (-36.51%) syar_lsm_file_open 167772 151813 -15959 (-9.51%) 1836 1667 -169 (-9.20%) syar_namespace_create_new 14805 9304 -5501 (-37.16%) 353 209 -144 (-40.79%) syar_python3_detect 17531 12030 -5501 (-31.38%) 391 247 -144 (-36.83%) syar_ssh_post_fork 16412 10911 -5501 (-33.52%) 405 261 -144 (-35.56%) syar_enter_execve 14728 9227 -5501 (-37.35%) 345 201 -144 (-41.74%) syar_enter_execveat 14728 9227 -5501 (-37.35%) 345 201 -144 (-41.74%) syar_exit_execve 16622 11121 -5501 (-33.09%) 376 232 -144 (-38.30%) syar_exit_execveat 16622 11121 -5501 (-33.09%) 376 232 -144 (-38.30%) syar_syscalls_kill 15288 9787 -5501 (-35.98%) 398 254 -144 (-36.18%) syar_task_enter_pivot_root 14898 9397 -5501 (-36.92%) 357 213 -144 (-40.34%) syar_syscalls_setreuid 16678 11177 -5501 (-32.98%) 429 285 -144 (-33.57%) syar_syscalls_setuid 16678 11177 -5501 (-32.98%) 429 285 -144 (-33.57%) syar_syscalls_process_vm_readv 14959 9458 -5501 (-36.77%) 364 220 -144 (-39.56%) syar_syscalls_process_vm_writev 15757 10256 -5501 (-34.91%) 390 246 -144 (-36.92%) do_uprobe 15519 10018 -5501 (-35.45%) 373 229 -144 (-38.61%) edgewall 179715 55783 -123932 (-68.96%) 12607 3999 -8608 (-68.28%) bictcp_state 7570 4131 -3439 (-45.43%) 496 269 -227 (-45.77%) cubictcp_state 7570 4131 -3439 (-45.43%) 496 269 -227 (-45.77%) tcp_rate_skb_delivered 447 272 -175 (-39.15%) 29 18 -11 (-37.93%) kprobe__bbr_set_state 4566 2615 -1951 (-42.73%) 209 124 -85 (-40.67%) kprobe__bictcp_state 4566 2615 -1951 (-42.73%) 209 124 -85 (-40.67%) inet_sock_set_state 1501 1337 -164 (-10.93%) 93 85 -8 (-8.60%) tcp_retransmit_skb 1145 981 -164 (-14.32%) 67 59 -8 (-11.94%) tcp_retransmit_synack 1183 951 -232 (-19.61%) 67 55 -12 (-17.91%) bpf_tcptuner 1459 1187 -272 (-18.64%) 99 80 -19 (-19.19%) tw_egress 801 776 -25 (-3.12%) 69 66 -3 (-4.35%) tw_ingress 795 770 -25 (-3.14%) 69 66 -3 (-4.35%) ttls_tc_ingress 19025 19383 +358 (+1.88%) 470 465 -5 (-1.06%) ttls_nat_egress 490 299 -191 (-38.98%) 33 20 -13 (-39.39%) ttls_nat_ingress 448 285 -163 (-36.38%) 32 21 -11 (-34.38%) tw_twfw_egress 511127 212071 -299056 (-58.51%) 16733 8504 -8229 (-49.18%) tw_twfw_ingress 500095 212069 -288026 (-57.59%) 16223 8504 -7719 (-47.58%) tw_twfw_tc_eg 511113 212064 -299049 (-58.51%) 16732 8504 -8228 (-49.18%) tw_twfw_tc_in 500095 212069 -288026 (-57.59%) 16223 8504 -7719 (-47.58%) tw_twfw_egress 12632 12435 -197 (-1.56%) 276 260 -16 (-5.80%) tw_twfw_ingress 12631 12454 -177 (-1.40%) 278 261 -17 (-6.12%) tw_twfw_tc_eg 12595 12435 -160 (-1.27%) 274 259 -15 (-5.47%) tw_twfw_tc_in 12631 12454 -177 (-1.40%) 278 261 -17 (-6.12%) tw_xdp_dump 266 209 -57 (-21.43%) 9 8 -1 (-11.11%) CILIUM ========= File Program Insns (A) Insns (B) Insns (DIFF) States (A) States (B) States (DIFF) ------------- -------------------------------- --------- --------- ---------------- ---------- ---------- -------------- bpf_host.o cil_to_netdev 6047 4578 -1469 (-24.29%) 362 249 -113 (-31.22%) bpf_host.o handle_lxc_traffic 2227 1585 -642 (-28.83%) 156 103 -53 (-33.97%) bpf_host.o tail_handle_ipv4_from_netdev 2244 1458 -786 (-35.03%) 163 106 -57 (-34.97%) bpf_host.o tail_handle_nat_fwd_ipv4 21022 10479 -10543 (-50.15%) 1289 670 -619 (-48.02%) bpf_host.o tail_handle_nat_fwd_ipv6 15433 11375 -4058 (-26.29%) 905 643 -262 (-28.95%) bpf_host.o tail_ipv4_host_policy_ingress 2219 1367 -852 (-38.40%) 161 96 -65 (-40.37%) bpf_host.o tail_nodeport_nat_egress_ipv4 22460 19862 -2598 (-11.57%) 1469 1293 -176 (-11.98%) bpf_host.o tail_nodeport_nat_ingress_ipv4 5526 3534 -1992 (-36.05%) 366 243 -123 (-33.61%) bpf_host.o tail_nodeport_nat_ingress_ipv6 5132 4256 -876 (-17.07%) 241 219 -22 (-9.13%) bpf_host.o tail_nodeport_nat_ipv6_egress 3702 3542 -160 (-4.32%) 215 205 -10 (-4.65%) bpf_lxc.o tail_handle_nat_fwd_ipv4 21022 10479 -10543 (-50.15%) 1289 670 -619 (-48.02%) bpf_lxc.o tail_handle_nat_fwd_ipv6 15433 11375 -4058 (-26.29%) 905 643 -262 (-28.95%) bpf_lxc.o tail_ipv4_ct_egress 5073 3374 -1699 (-33.49%) 262 172 -90 (-34.35%) bpf_lxc.o tail_ipv4_ct_ingress 5093 3385 -1708 (-33.54%) 262 172 -90 (-34.35%) bpf_lxc.o tail_ipv4_ct_ingress_policy_only 5093 3385 -1708 (-33.54%) 262 172 -90 (-34.35%) bpf_lxc.o tail_ipv6_ct_egress 4593 3878 -715 (-15.57%) 194 151 -43 (-22.16%) bpf_lxc.o tail_ipv6_ct_ingress 4606 3891 -715 (-15.52%) 194 151 -43 (-22.16%) bpf_lxc.o tail_ipv6_ct_ingress_policy_only 4606 3891 -715 (-15.52%) 194 151 -43 (-22.16%) bpf_lxc.o tail_nodeport_nat_ingress_ipv4 5526 3534 -1992 (-36.05%) 366 243 -123 (-33.61%) bpf_lxc.o tail_nodeport_nat_ingress_ipv6 5132 4256 -876 (-17.07%) 241 219 -22 (-9.13%) bpf_overlay.o tail_handle_nat_fwd_ipv4 20524 10114 -10410 (-50.72%) 1271 638 -633 (-49.80%) bpf_overlay.o tail_nodeport_nat_egress_ipv4 22718 19490 -3228 (-14.21%) 1475 1275 -200 (-13.56%) bpf_overlay.o tail_nodeport_nat_ingress_ipv4 5526 3534 -1992 (-36.05%) 366 243 -123 (-33.61%) bpf_overlay.o tail_nodeport_nat_ingress_ipv6 5132 4256 -876 (-17.07%) 241 219 -22 (-9.13%) bpf_overlay.o tail_nodeport_nat_ipv6_egress 3638 3548 -90 (-2.47%) 209 203 -6 (-2.87%) bpf_overlay.o tail_rev_nodeport_lb4 4368 3820 -548 (-12.55%) 248 215 -33 (-13.31%) bpf_overlay.o tail_rev_nodeport_lb6 2867 2428 -439 (-15.31%) 167 140 -27 (-16.17%) bpf_sock.o cil_sock6_connect 1718 1703 -15 (-0.87%) 100 99 -1 (-1.00%) bpf_xdp.o tail_handle_nat_fwd_ipv4 12917 12443 -474 (-3.67%) 875 849 -26 (-2.97%) bpf_xdp.o tail_handle_nat_fwd_ipv6 13515 13264 -251 (-1.86%) 715 702 -13 (-1.82%) bpf_xdp.o tail_lb_ipv4 39492 36367 -3125 (-7.91%) 2430 2251 -179 (-7.37%) bpf_xdp.o tail_lb_ipv6 80441 78058 -2383 (-2.96%) 3647 3523 -124 (-3.40%) bpf_xdp.o tail_nodeport_ipv6_dsr 1038 901 -137 (-13.20%) 61 55 -6 (-9.84%) bpf_xdp.o tail_nodeport_nat_egress_ipv4 13027 12096 -931 (-7.15%) 868 809 -59 (-6.80%) bpf_xdp.o tail_nodeport_nat_ingress_ipv4 7617 5900 -1717 (-22.54%) 522 413 -109 (-20.88%) bpf_xdp.o tail_nodeport_nat_ingress_ipv6 7575 7395 -180 (-2.38%) 383 374 -9 (-2.35%) bpf_xdp.o tail_rev_nodeport_lb4 6808 6739 -69 (-1.01%) 403 396 -7 (-1.74%) bpf_xdp.o tail_rev_nodeport_lb6 16173 15847 -326 (-2.02%) 1010 990 -20 (-1.98%) Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231205184248.1502704-9-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1ebe76c98451..e5ce530641ba 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4477,8 +4477,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, return err; mark_stack_slot_scratched(env, spi); - if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) && - !register_is_null(reg) && env->bpf_capable) { + if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) && env->bpf_capable) { save_register_state(env, state, spi, reg, size); /* Break the relation on a narrowing spill. */ if (fls64(reg->umax_value) > BITS_PER_BYTE * size) @@ -4527,7 +4526,12 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, /* when we zero initialize stack slots mark them as such */ if ((reg && register_is_null(reg)) || (!reg && is_bpf_st_mem(insn) && insn->imm == 0)) { - /* backtracking doesn't work for STACK_ZERO yet. */ + /* STACK_ZERO case happened because register spill + * wasn't properly aligned at the stack slot boundary, + * so it's not a register spill anymore; force + * originating register to be precise to make + * STACK_ZERO correct for subsequent states + */ err = mark_chain_precision(env, value_regno); if (err) return err; -- cgit From 909fa05dd3c181e5b403912889057f7cdbf3906c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:13 -0800 Subject: bpf: align CAP_NET_ADMIN checks with bpf_capable() approach Within BPF syscall handling code CAP_NET_ADMIN checks stand out a bit compared to CAP_BPF and CAP_PERFMON checks. For the latter, CAP_BPF or CAP_PERFMON are checked first, but if they are not set, CAP_SYS_ADMIN takes over and grants whatever part of BPF syscall is required. Similar kind of checks that involve CAP_NET_ADMIN are not so consistent. One out of four uses does follow CAP_BPF/CAP_PERFMON model: during BPF_PROG_LOAD, if the type of BPF program is "network-related" either CAP_NET_ADMIN or CAP_SYS_ADMIN is required to proceed. But in three other cases CAP_NET_ADMIN is required even if CAP_SYS_ADMIN is set: - when creating DEVMAP/XDKMAP/CPU_MAP maps; - when attaching CGROUP_SKB programs; - when handling BPF_PROG_QUERY command. This patch is changing the latter three cases to follow BPF_PROG_LOAD model, that is allowing to proceed under either CAP_NET_ADMIN or CAP_SYS_ADMIN. This also makes it cleaner in subsequent BPF token patches to switch wholesomely to a generic bpf_token_capable(int cap) check, that always falls back to CAP_SYS_ADMIN if requested capability is missing. Cc: Jakub Kicinski Acked-by: Yafang Shao Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ebaccf77d56e..ee33a52abf18 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1121,6 +1121,11 @@ free_map_tab: return ret; } +static bool bpf_net_capable(void) +{ + return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN); +} + #define BPF_MAP_CREATE_LAST_FIELD map_extra /* called via syscall */ static int map_create(union bpf_attr *attr) @@ -1224,7 +1229,7 @@ static int map_create(union bpf_attr *attr) case BPF_MAP_TYPE_DEVMAP: case BPF_MAP_TYPE_DEVMAP_HASH: case BPF_MAP_TYPE_XSKMAP: - if (!capable(CAP_NET_ADMIN)) + if (!bpf_net_capable()) return -EPERM; break; default: @@ -2625,7 +2630,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) !bpf_capable()) return -EPERM; - if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN)) + if (is_net_admin_prog_type(type) && !bpf_net_capable()) return -EPERM; if (is_perfmon_prog_type(type) && !perfmon_capable()) return -EPERM; @@ -3777,7 +3782,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_SK_LOOKUP: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: - if (!capable(CAP_NET_ADMIN)) + if (!bpf_net_capable()) /* cg-skb progs can be loaded by unpriv user. * check permissions at attach time. */ @@ -3980,7 +3985,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) static int bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { - if (!capable(CAP_NET_ADMIN)) + if (!bpf_net_capable()) return -EPERM; if (CHECK_ATTR(BPF_PROG_QUERY)) return -EINVAL; -- cgit From 40bba140c60fbb3ee8df6203c82fbd3de9f19d95 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:14 -0800 Subject: bpf: add BPF token delegation mount options to BPF FS Add few new mount options to BPF FS that allow to specify that a given BPF FS instance allows creation of BPF token (added in the next patch), and what sort of operations are allowed under BPF token. As such, we get 4 new mount options, each is a bit mask - `delegate_cmds` allow to specify which bpf() syscall commands are allowed with BPF token derived from this BPF FS instance; - if BPF_MAP_CREATE command is allowed, `delegate_maps` specifies a set of allowable BPF map types that could be created with BPF token; - if BPF_PROG_LOAD command is allowed, `delegate_progs` specifies a set of allowable BPF program types that could be loaded with BPF token; - if BPF_PROG_LOAD command is allowed, `delegate_attachs` specifies a set of allowable BPF program attach types that could be loaded with BPF token; delegate_progs and delegate_attachs are meant to be used together, as full BPF program type is, in general, determined through both program type and program attach type. Currently, these mount options accept the following forms of values: - a special value "any", that enables all possible values of a given bit set; - numeric value (decimal or hexadecimal, determined by kernel automatically) that specifies a bit mask value directly; - all the values for a given mount option are combined, if specified multiple times. E.g., `mount -t bpf nodev /path/to/mount -o delegate_maps=0x1 -o delegate_maps=0x2` will result in a combined 0x3 mask. Ideally, more convenient (for humans) symbolic form derived from corresponding UAPI enums would be accepted (e.g., `-o delegate_progs=kprobe|tracepoint`) and I intend to implement this, but it requires a bunch of UAPI header churn, so I postponed it until this feature lands upstream or at least there is a definite consensus that this feature is acceptable and is going to make it, just to minimize amount of wasted effort and not increase amount of non-essential code to be reviewed. Attentive reader will notice that BPF FS is now marked as FS_USERNS_MOUNT, which theoretically makes it mountable inside non-init user namespace as long as the process has sufficient *namespaced* capabilities within that user namespace. But in reality we still restrict BPF FS to be mountable only by processes with CAP_SYS_ADMIN *in init userns* (extra check in bpf_fill_super()). FS_USERNS_MOUNT is added to allow creating BPF FS context object (i.e., fsopen("bpf")) from inside unprivileged process inside non-init userns, to capture that userns as the owning userns. It will still be required to pass this context object back to privileged process to instantiate and mount it. This manipulation is important, because capturing non-init userns as the owning userns of BPF FS instance (super block) allows to use that userns to constraint BPF token to that userns later on (see next patch). So creating BPF FS with delegation inside unprivileged userns will restrict derived BPF token objects to only "work" inside that intended userns, making it scoped to a intended "container". Also, setting these delegation options requires capable(CAP_SYS_ADMIN), so unprivileged process cannot set this up without involvement of a privileged process. There is a set of selftests at the end of the patch set that simulates this sequence of steps and validates that everything works as intended. But careful review is requested to make sure there are no missed gaps in the implementation and testing. This somewhat subtle set of aspects is the result of previous discussions ([0]) about various user namespace implications and interactions with BPF token functionality and is necessary to contain BPF token inside intended user namespace. [0] https://lore.kernel.org/bpf/20230704-hochverdient-lehne-eeb9eeef785e@brauner/ Acked-by: Christian Brauner Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/inode.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 78 insertions(+), 10 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 1aafb2ff2e95..220fe0f99095 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "preload/bpf_preload.h" enum bpf_type { @@ -599,10 +600,31 @@ EXPORT_SYMBOL(bpf_prog_get_type_path); */ static int bpf_show_options(struct seq_file *m, struct dentry *root) { + struct bpf_mount_opts *opts = root->d_sb->s_fs_info; umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX; if (mode != S_IRWXUGO) seq_printf(m, ",mode=%o", mode); + + if (opts->delegate_cmds == ~0ULL) + seq_printf(m, ",delegate_cmds=any"); + else if (opts->delegate_cmds) + seq_printf(m, ",delegate_cmds=0x%llx", opts->delegate_cmds); + + if (opts->delegate_maps == ~0ULL) + seq_printf(m, ",delegate_maps=any"); + else if (opts->delegate_maps) + seq_printf(m, ",delegate_maps=0x%llx", opts->delegate_maps); + + if (opts->delegate_progs == ~0ULL) + seq_printf(m, ",delegate_progs=any"); + else if (opts->delegate_progs) + seq_printf(m, ",delegate_progs=0x%llx", opts->delegate_progs); + + if (opts->delegate_attachs == ~0ULL) + seq_printf(m, ",delegate_attachs=any"); + else if (opts->delegate_attachs) + seq_printf(m, ",delegate_attachs=0x%llx", opts->delegate_attachs); return 0; } @@ -626,22 +648,27 @@ static const struct super_operations bpf_super_ops = { enum { OPT_MODE, + OPT_DELEGATE_CMDS, + OPT_DELEGATE_MAPS, + OPT_DELEGATE_PROGS, + OPT_DELEGATE_ATTACHS, }; static const struct fs_parameter_spec bpf_fs_parameters[] = { fsparam_u32oct ("mode", OPT_MODE), + fsparam_string ("delegate_cmds", OPT_DELEGATE_CMDS), + fsparam_string ("delegate_maps", OPT_DELEGATE_MAPS), + fsparam_string ("delegate_progs", OPT_DELEGATE_PROGS), + fsparam_string ("delegate_attachs", OPT_DELEGATE_ATTACHS), {} }; -struct bpf_mount_opts { - umode_t mode; -}; - static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) { - struct bpf_mount_opts *opts = fc->fs_private; + struct bpf_mount_opts *opts = fc->s_fs_info; struct fs_parse_result result; - int opt; + int opt, err; + u64 msk; opt = fs_parse(fc, bpf_fs_parameters, param, &result); if (opt < 0) { @@ -665,6 +692,28 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) case OPT_MODE: opts->mode = result.uint_32 & S_IALLUGO; break; + case OPT_DELEGATE_CMDS: + case OPT_DELEGATE_MAPS: + case OPT_DELEGATE_PROGS: + case OPT_DELEGATE_ATTACHS: + if (strcmp(param->string, "any") == 0) { + msk = ~0ULL; + } else { + err = kstrtou64(param->string, 0, &msk); + if (err) + return err; + } + /* Setting delegation mount options requires privileges */ + if (msk && !capable(CAP_SYS_ADMIN)) + return -EPERM; + switch (opt) { + case OPT_DELEGATE_CMDS: opts->delegate_cmds |= msk; break; + case OPT_DELEGATE_MAPS: opts->delegate_maps |= msk; break; + case OPT_DELEGATE_PROGS: opts->delegate_progs |= msk; break; + case OPT_DELEGATE_ATTACHS: opts->delegate_attachs |= msk; break; + default: return -EINVAL; + } + break; } return 0; @@ -739,10 +788,14 @@ out: static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) { static const struct tree_descr bpf_rfiles[] = { { "" } }; - struct bpf_mount_opts *opts = fc->fs_private; + struct bpf_mount_opts *opts = sb->s_fs_info; struct inode *inode; int ret; + /* Mounting an instance of BPF FS requires privileges */ + if (fc->user_ns != &init_user_ns && !capable(CAP_SYS_ADMIN)) + return -EPERM; + ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); if (ret) return ret; @@ -764,7 +817,7 @@ static int bpf_get_tree(struct fs_context *fc) static void bpf_free_fc(struct fs_context *fc) { - kfree(fc->fs_private); + kfree(fc->s_fs_info); } static const struct fs_context_operations bpf_context_ops = { @@ -786,17 +839,32 @@ static int bpf_init_fs_context(struct fs_context *fc) opts->mode = S_IRWXUGO; - fc->fs_private = opts; + /* start out with no BPF token delegation enabled */ + opts->delegate_cmds = 0; + opts->delegate_maps = 0; + opts->delegate_progs = 0; + opts->delegate_attachs = 0; + + fc->s_fs_info = opts; fc->ops = &bpf_context_ops; return 0; } +static void bpf_kill_super(struct super_block *sb) +{ + struct bpf_mount_opts *opts = sb->s_fs_info; + + kill_litter_super(sb); + kfree(opts); +} + static struct file_system_type bpf_fs_type = { .owner = THIS_MODULE, .name = "bpf", .init_fs_context = bpf_init_fs_context, .parameters = bpf_fs_parameters, - .kill_sb = kill_litter_super, + .kill_sb = bpf_kill_super, + .fs_flags = FS_USERNS_MOUNT, }; static int __init bpf_init(void) -- cgit From 4527358b76861dfd64ee34aba45d81648fbc8a61 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:15 -0800 Subject: bpf: introduce BPF token object Add new kind of BPF kernel object, BPF token. BPF token is meant to allow delegating privileged BPF functionality, like loading a BPF program or creating a BPF map, from privileged process to a *trusted* unprivileged process, all while having a good amount of control over which privileged operations could be performed using provided BPF token. This is achieved through mounting BPF FS instance with extra delegation mount options, which determine what operations are delegatable, and also constraining it to the owning user namespace (as mentioned in the previous patch). BPF token itself is just a derivative from BPF FS and can be created through a new bpf() syscall command, BPF_TOKEN_CREATE, which accepts BPF FS FD, which can be attained through open() API by opening BPF FS mount point. Currently, BPF token "inherits" delegated command, map types, prog type, and attach type bit sets from BPF FS as is. In the future, having an BPF token as a separate object with its own FD, we can allow to further restrict BPF token's allowable set of things either at the creation time or after the fact, allowing the process to guard itself further from unintentionally trying to load undesired kind of BPF programs. But for now we keep things simple and just copy bit sets as is. When BPF token is created from BPF FS mount, we take reference to the BPF super block's owning user namespace, and then use that namespace for checking all the {CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN} capabilities that are normally only checked against init userns (using capable()), but now we check them using ns_capable() instead (if BPF token is provided). See bpf_token_capable() for details. Such setup means that BPF token in itself is not sufficient to grant BPF functionality. User namespaced process has to *also* have necessary combination of capabilities inside that user namespace. So while previously CAP_BPF was useless when granted within user namespace, now it gains a meaning and allows container managers and sys admins to have a flexible control over which processes can and need to use BPF functionality within the user namespace (i.e., container in practice). And BPF FS delegation mount options and derived BPF tokens serve as a per-container "flag" to grant overall ability to use bpf() (plus further restrict on which parts of bpf() syscalls are treated as namespaced). Note also, BPF_TOKEN_CREATE command itself requires ns_capable(CAP_BPF) within the BPF FS owning user namespace, rounding up the ns_capable() story of BPF token. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/Makefile | 2 +- kernel/bpf/inode.c | 12 +-- kernel/bpf/syscall.c | 17 ++++ kernel/bpf/token.c | 214 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 239 insertions(+), 6 deletions(-) create mode 100644 kernel/bpf/token.c (limited to 'kernel/bpf') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index f526b7573e97..4ce95acfcaa7 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse endif CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 220fe0f99095..6ce3f9696e72 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -99,9 +99,9 @@ static const struct inode_operations bpf_prog_iops = { }; static const struct inode_operations bpf_map_iops = { }; static const struct inode_operations bpf_link_iops = { }; -static struct inode *bpf_get_inode(struct super_block *sb, - const struct inode *dir, - umode_t mode) +struct inode *bpf_get_inode(struct super_block *sb, + const struct inode *dir, + umode_t mode) { struct inode *inode; @@ -602,11 +602,13 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) { struct bpf_mount_opts *opts = root->d_sb->s_fs_info; umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX; + u64 mask; if (mode != S_IRWXUGO) seq_printf(m, ",mode=%o", mode); - if (opts->delegate_cmds == ~0ULL) + mask = (1ULL << __MAX_BPF_CMD) - 1; + if ((opts->delegate_cmds & mask) == mask) seq_printf(m, ",delegate_cmds=any"); else if (opts->delegate_cmds) seq_printf(m, ",delegate_cmds=0x%llx", opts->delegate_cmds); @@ -639,7 +641,7 @@ static void bpf_free_inode(struct inode *inode) free_inode_nonrcu(inode); } -static const struct super_operations bpf_super_ops = { +const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .show_options = bpf_show_options, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ee33a52abf18..a156d549b356 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5377,6 +5377,20 @@ out_prog_put: return ret; } +#define BPF_TOKEN_CREATE_LAST_FIELD token_create.bpffs_fd + +static int token_create(union bpf_attr *attr) +{ + if (CHECK_ATTR(BPF_TOKEN_CREATE)) + return -EINVAL; + + /* no flags are supported yet */ + if (attr->token_create.flags) + return -EINVAL; + + return bpf_token_create(attr); +} + static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; @@ -5510,6 +5524,9 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) case BPF_PROG_BIND_MAP: err = bpf_prog_bind_map(&attr); break; + case BPF_TOKEN_CREATE: + err = token_create(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c new file mode 100644 index 000000000000..e18aaecc67e9 --- /dev/null +++ b/kernel/bpf/token.c @@ -0,0 +1,214 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +bool bpf_token_capable(const struct bpf_token *token, int cap) +{ + /* BPF token allows ns_capable() level of capabilities, but only if + * token's userns is *exactly* the same as current user's userns + */ + if (token && current_user_ns() == token->userns) { + if (ns_capable(token->userns, cap)) + return true; + if (cap != CAP_SYS_ADMIN && ns_capable(token->userns, CAP_SYS_ADMIN)) + return true; + } + /* otherwise fallback to capable() checks */ + return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN)); +} + +void bpf_token_inc(struct bpf_token *token) +{ + atomic64_inc(&token->refcnt); +} + +static void bpf_token_free(struct bpf_token *token) +{ + put_user_ns(token->userns); + kvfree(token); +} + +static void bpf_token_put_deferred(struct work_struct *work) +{ + struct bpf_token *token = container_of(work, struct bpf_token, work); + + bpf_token_free(token); +} + +void bpf_token_put(struct bpf_token *token) +{ + if (!token) + return; + + if (!atomic64_dec_and_test(&token->refcnt)) + return; + + INIT_WORK(&token->work, bpf_token_put_deferred); + schedule_work(&token->work); +} + +static int bpf_token_release(struct inode *inode, struct file *filp) +{ + struct bpf_token *token = filp->private_data; + + bpf_token_put(token); + return 0; +} + +static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp) +{ + struct bpf_token *token = filp->private_data; + u64 mask; + + BUILD_BUG_ON(__MAX_BPF_CMD >= 64); + mask = (1ULL << __MAX_BPF_CMD) - 1; + if ((token->allowed_cmds & mask) == mask) + seq_printf(m, "allowed_cmds:\tany\n"); + else + seq_printf(m, "allowed_cmds:\t0x%llx\n", token->allowed_cmds); +} + +#define BPF_TOKEN_INODE_NAME "bpf-token" + +static const struct inode_operations bpf_token_iops = { }; + +static const struct file_operations bpf_token_fops = { + .release = bpf_token_release, + .show_fdinfo = bpf_token_show_fdinfo, +}; + +int bpf_token_create(union bpf_attr *attr) +{ + struct bpf_mount_opts *mnt_opts; + struct bpf_token *token = NULL; + struct user_namespace *userns; + struct inode *inode; + struct file *file; + struct path path; + struct fd f; + umode_t mode; + int err, fd; + + f = fdget(attr->token_create.bpffs_fd); + if (!f.file) + return -EBADF; + + path = f.file->f_path; + path_get(&path); + fdput(f); + + if (path.dentry != path.mnt->mnt_sb->s_root) { + err = -EINVAL; + goto out_path; + } + if (path.mnt->mnt_sb->s_op != &bpf_super_ops) { + err = -EINVAL; + goto out_path; + } + err = path_permission(&path, MAY_ACCESS); + if (err) + goto out_path; + + userns = path.dentry->d_sb->s_user_ns; + /* + * Enforce that creators of BPF tokens are in the same user + * namespace as the BPF FS instance. This makes reasoning about + * permissions a lot easier and we can always relax this later. + */ + if (current_user_ns() != userns) { + err = -EPERM; + goto out_path; + } + if (!ns_capable(userns, CAP_BPF)) { + err = -EPERM; + goto out_path; + } + + mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); + inode = bpf_get_inode(path.mnt->mnt_sb, NULL, mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_path; + } + + inode->i_op = &bpf_token_iops; + inode->i_fop = &bpf_token_fops; + clear_nlink(inode); /* make sure it is unlinked */ + + file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops); + if (IS_ERR(file)) { + iput(inode); + err = PTR_ERR(file); + goto out_path; + } + + token = kvzalloc(sizeof(*token), GFP_USER); + if (!token) { + err = -ENOMEM; + goto out_file; + } + + atomic64_set(&token->refcnt, 1); + + /* remember bpffs owning userns for future ns_capable() checks */ + token->userns = get_user_ns(userns); + + mnt_opts = path.dentry->d_sb->s_fs_info; + token->allowed_cmds = mnt_opts->delegate_cmds; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) { + err = fd; + goto out_token; + } + + file->private_data = token; + fd_install(fd, file); + + path_put(&path); + return fd; + +out_token: + bpf_token_free(token); +out_file: + fput(file); +out_path: + path_put(&path); + return err; +} + +struct bpf_token *bpf_token_get_from_fd(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_token *token; + + if (!f.file) + return ERR_PTR(-EBADF); + if (f.file->f_op != &bpf_token_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + token = f.file->private_data; + bpf_token_inc(token); + fdput(f); + + return token; +} + +bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd) +{ + /* BPF token can be used only within exactly the same userns in which + * it was created + */ + if (!token || current_user_ns() != token->userns) + return false; + + return token->allowed_cmds & (1ULL << cmd); +} -- cgit From 688b7270b3cb75e8ac78123d719967db40336e5b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:16 -0800 Subject: bpf: add BPF token support to BPF_MAP_CREATE command Allow providing token_fd for BPF_MAP_CREATE command to allow controlled BPF map creation from unprivileged process through delegated BPF token. Wire through a set of allowed BPF map types to BPF token, derived from BPF FS at BPF token creation time. This, in combination with allowed_cmds allows to create a narrowly-focused BPF token (controlled by privileged agent) with a restrictive set of BPF maps that application can attempt to create. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/inode.c | 3 ++- kernel/bpf/syscall.c | 52 ++++++++++++++++++++++++++++++++++++++-------------- kernel/bpf/token.c | 16 ++++++++++++++++ 3 files changed, 56 insertions(+), 15 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 6ce3f9696e72..9c7865d1c53d 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -613,7 +613,8 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) else if (opts->delegate_cmds) seq_printf(m, ",delegate_cmds=0x%llx", opts->delegate_cmds); - if (opts->delegate_maps == ~0ULL) + mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1; + if ((opts->delegate_maps & mask) == mask) seq_printf(m, ",delegate_maps=any"); else if (opts->delegate_maps) seq_printf(m, ",delegate_maps=0x%llx", opts->delegate_maps); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a156d549b356..22e14124cd61 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1009,8 +1009,8 @@ int map_check_no_btf(const struct bpf_map *map, return -ENOTSUPP; } -static int map_check_btf(struct bpf_map *map, const struct btf *btf, - u32 btf_key_id, u32 btf_value_id) +static int map_check_btf(struct bpf_map *map, struct bpf_token *token, + const struct btf *btf, u32 btf_key_id, u32 btf_value_id) { const struct btf_type *key_type, *value_type; u32 key_size, value_size; @@ -1038,7 +1038,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, if (!IS_ERR_OR_NULL(map->record)) { int i; - if (!bpf_capable()) { + if (!bpf_token_capable(token, CAP_BPF)) { ret = -EPERM; goto free_map_tab; } @@ -1126,11 +1126,12 @@ static bool bpf_net_capable(void) return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN); } -#define BPF_MAP_CREATE_LAST_FIELD map_extra +#define BPF_MAP_CREATE_LAST_FIELD map_token_fd /* called via syscall */ static int map_create(union bpf_attr *attr) { const struct bpf_map_ops *ops; + struct bpf_token *token = NULL; int numa_node = bpf_map_attr_numa_node(attr); u32 map_type = attr->map_type; struct bpf_map *map; @@ -1181,14 +1182,32 @@ static int map_create(union bpf_attr *attr) if (!ops->map_mem_usage) return -EINVAL; + if (attr->map_token_fd) { + token = bpf_token_get_from_fd(attr->map_token_fd); + if (IS_ERR(token)) + return PTR_ERR(token); + + /* if current token doesn't grant map creation permissions, + * then we can't use this token, so ignore it and rely on + * system-wide capabilities checks + */ + if (!bpf_token_allow_cmd(token, BPF_MAP_CREATE) || + !bpf_token_allow_map_type(token, attr->map_type)) { + bpf_token_put(token); + token = NULL; + } + } + + err = -EPERM; + /* Intent here is for unprivileged_bpf_disabled to block BPF map * creation for unprivileged users; other actions depend * on fd availability and access to bpffs, so are dependent on * object creation success. Even with unprivileged BPF disabled, * capability checks are still carried out. */ - if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) - return -EPERM; + if (sysctl_unprivileged_bpf_disabled && !bpf_token_capable(token, CAP_BPF)) + goto put_token; /* check privileged map type permissions */ switch (map_type) { @@ -1221,25 +1240,27 @@ static int map_create(union bpf_attr *attr) case BPF_MAP_TYPE_LRU_PERCPU_HASH: case BPF_MAP_TYPE_STRUCT_OPS: case BPF_MAP_TYPE_CPUMAP: - if (!bpf_capable()) - return -EPERM; + if (!bpf_token_capable(token, CAP_BPF)) + goto put_token; break; case BPF_MAP_TYPE_SOCKMAP: case BPF_MAP_TYPE_SOCKHASH: case BPF_MAP_TYPE_DEVMAP: case BPF_MAP_TYPE_DEVMAP_HASH: case BPF_MAP_TYPE_XSKMAP: - if (!bpf_net_capable()) - return -EPERM; + if (!bpf_token_capable(token, CAP_NET_ADMIN)) + goto put_token; break; default: WARN(1, "unsupported map type %d", map_type); - return -EPERM; + goto put_token; } map = ops->map_alloc(attr); - if (IS_ERR(map)) - return PTR_ERR(map); + if (IS_ERR(map)) { + err = PTR_ERR(map); + goto put_token; + } map->ops = ops; map->map_type = map_type; @@ -1276,7 +1297,7 @@ static int map_create(union bpf_attr *attr) map->btf = btf; if (attr->btf_value_type_id) { - err = map_check_btf(map, btf, attr->btf_key_type_id, + err = map_check_btf(map, token, btf, attr->btf_key_type_id, attr->btf_value_type_id); if (err) goto free_map; @@ -1297,6 +1318,7 @@ static int map_create(union bpf_attr *attr) goto free_map_sec; bpf_map_save_memcg(map); + bpf_token_put(token); err = bpf_map_new_fd(map, f_flags); if (err < 0) { @@ -1317,6 +1339,8 @@ free_map_sec: free_map: btf_put(map->btf); map->ops->map_free(map); +put_token: + bpf_token_put(token); return err; } diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index e18aaecc67e9..06c34dae658e 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -72,6 +72,13 @@ static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp) seq_printf(m, "allowed_cmds:\tany\n"); else seq_printf(m, "allowed_cmds:\t0x%llx\n", token->allowed_cmds); + + BUILD_BUG_ON(__MAX_BPF_MAP_TYPE >= 64); + mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1; + if ((token->allowed_maps & mask) == mask) + seq_printf(m, "allowed_maps:\tany\n"); + else + seq_printf(m, "allowed_maps:\t0x%llx\n", token->allowed_maps); } #define BPF_TOKEN_INODE_NAME "bpf-token" @@ -161,6 +168,7 @@ int bpf_token_create(union bpf_attr *attr) mnt_opts = path.dentry->d_sb->s_fs_info; token->allowed_cmds = mnt_opts->delegate_cmds; + token->allowed_maps = mnt_opts->delegate_maps; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) { @@ -212,3 +220,11 @@ bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd) return token->allowed_cmds & (1ULL << cmd); } + +bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type) +{ + if (!token || type >= __MAX_BPF_MAP_TYPE) + return false; + + return token->allowed_maps & (1ULL << type); +} -- cgit From ee54b1a910e4d49c9a104f31ae3f5b979131adf8 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:17 -0800 Subject: bpf: add BPF token support to BPF_BTF_LOAD command Accept BPF token FD in BPF_BTF_LOAD command to allow BTF data loading through delegated BPF token. BTF loading is a pretty straightforward operation, so as long as BPF token is created with allow_cmds granting BPF_BTF_LOAD command, kernel proceeds to parsing BTF data and creating BTF object. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 22e14124cd61..d87c5c27cde3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4777,15 +4777,31 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, return err; } -#define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size +#define BPF_BTF_LOAD_LAST_FIELD btf_token_fd static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { + struct bpf_token *token = NULL; + if (CHECK_ATTR(BPF_BTF_LOAD)) return -EINVAL; - if (!bpf_capable()) + if (attr->btf_token_fd) { + token = bpf_token_get_from_fd(attr->btf_token_fd); + if (IS_ERR(token)) + return PTR_ERR(token); + if (!bpf_token_allow_cmd(token, BPF_BTF_LOAD)) { + bpf_token_put(token); + token = NULL; + } + } + + if (!bpf_token_capable(token, CAP_BPF)) { + bpf_token_put(token); return -EPERM; + } + + bpf_token_put(token); return btf_new_fd(attr, uattr, uattr_size); } -- cgit From e1cef620f598853a90f17701fcb1057a6768f7b8 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:18 -0800 Subject: bpf: add BPF token support to BPF_PROG_LOAD command Add basic support of BPF token to BPF_PROG_LOAD. Wire through a set of allowed BPF program types and attach types, derived from BPF FS at BPF token creation time. Then make sure we perform bpf_token_capable() checks everywhere where it's relevant. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-7-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 1 + kernel/bpf/inode.c | 6 ++-- kernel/bpf/syscall.c | 87 +++++++++++++++++++++++++++++++++++++--------------- kernel/bpf/token.c | 27 ++++++++++++++++ 4 files changed, 95 insertions(+), 26 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 4b813da8d6c0..47085839af8d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2751,6 +2751,7 @@ void bpf_prog_free(struct bpf_prog *fp) if (aux->dst_prog) bpf_prog_put(aux->dst_prog); + bpf_token_put(aux->token); INIT_WORK(&aux->work, bpf_prog_free_deferred); schedule_work(&aux->work); } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 9c7865d1c53d..5359a0929c35 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -619,12 +619,14 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) else if (opts->delegate_maps) seq_printf(m, ",delegate_maps=0x%llx", opts->delegate_maps); - if (opts->delegate_progs == ~0ULL) + mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1; + if ((opts->delegate_progs & mask) == mask) seq_printf(m, ",delegate_progs=any"); else if (opts->delegate_progs) seq_printf(m, ",delegate_progs=0x%llx", opts->delegate_progs); - if (opts->delegate_attachs == ~0ULL) + mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1; + if ((opts->delegate_attachs & mask) == mask) seq_printf(m, ",delegate_attachs=any"); else if (opts->delegate_attachs) seq_printf(m, ",delegate_attachs=0x%llx", opts->delegate_attachs); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d87c5c27cde3..2c8393c21b8c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2608,13 +2608,15 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD log_true_size +#define BPF_PROG_LOAD_LAST_FIELD prog_token_fd static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { enum bpf_prog_type type = attr->prog_type; struct bpf_prog *prog, *dst_prog = NULL; struct btf *attach_btf = NULL; + struct bpf_token *token = NULL; + bool bpf_cap; int err; char license[128]; @@ -2631,10 +2633,31 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) BPF_F_TEST_REG_INVARIANTS)) return -EINVAL; + bpf_prog_load_fixup_attach_type(attr); + + if (attr->prog_token_fd) { + token = bpf_token_get_from_fd(attr->prog_token_fd); + if (IS_ERR(token)) + return PTR_ERR(token); + /* if current token doesn't grant prog loading permissions, + * then we can't use this token, so ignore it and rely on + * system-wide capabilities checks + */ + if (!bpf_token_allow_cmd(token, BPF_PROG_LOAD) || + !bpf_token_allow_prog_type(token, attr->prog_type, + attr->expected_attach_type)) { + bpf_token_put(token); + token = NULL; + } + } + + bpf_cap = bpf_token_capable(token, CAP_BPF); + err = -EPERM; + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && - !bpf_capable()) - return -EPERM; + !bpf_cap) + goto put_token; /* Intent here is for unprivileged_bpf_disabled to block BPF program * creation for unprivileged users; other actions depend @@ -2643,21 +2666,23 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) * capability checks are still carried out for these * and other operations. */ - if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) - return -EPERM; + if (sysctl_unprivileged_bpf_disabled && !bpf_cap) + goto put_token; if (attr->insn_cnt == 0 || - attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) - return -E2BIG; + attr->insn_cnt > (bpf_cap ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) { + err = -E2BIG; + goto put_token; + } if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && - !bpf_capable()) - return -EPERM; + !bpf_cap) + goto put_token; - if (is_net_admin_prog_type(type) && !bpf_net_capable()) - return -EPERM; - if (is_perfmon_prog_type(type) && !perfmon_capable()) - return -EPERM; + if (is_net_admin_prog_type(type) && !bpf_token_capable(token, CAP_NET_ADMIN)) + goto put_token; + if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON)) + goto put_token; /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog * or btf, we need to check which one it is @@ -2667,27 +2692,33 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (IS_ERR(dst_prog)) { dst_prog = NULL; attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd); - if (IS_ERR(attach_btf)) - return -EINVAL; + if (IS_ERR(attach_btf)) { + err = -EINVAL; + goto put_token; + } if (!btf_is_kernel(attach_btf)) { /* attaching through specifying bpf_prog's BTF * objects directly might be supported eventually */ btf_put(attach_btf); - return -ENOTSUPP; + err = -ENOTSUPP; + goto put_token; } } } else if (attr->attach_btf_id) { /* fall back to vmlinux BTF, if BTF type ID is specified */ attach_btf = bpf_get_btf_vmlinux(); - if (IS_ERR(attach_btf)) - return PTR_ERR(attach_btf); - if (!attach_btf) - return -EINVAL; + if (IS_ERR(attach_btf)) { + err = PTR_ERR(attach_btf); + goto put_token; + } + if (!attach_btf) { + err = -EINVAL; + goto put_token; + } btf_get(attach_btf); } - bpf_prog_load_fixup_attach_type(attr); if (bpf_prog_load_check_attach(type, attr->expected_attach_type, attach_btf, attr->attach_btf_id, dst_prog)) { @@ -2695,7 +2726,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) bpf_prog_put(dst_prog); if (attach_btf) btf_put(attach_btf); - return -EINVAL; + err = -EINVAL; + goto put_token; } /* plain bpf_prog allocation */ @@ -2705,7 +2737,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) bpf_prog_put(dst_prog); if (attach_btf) btf_put(attach_btf); - return -ENOMEM; + err = -EINVAL; + goto put_token; } prog->expected_attach_type = attr->expected_attach_type; @@ -2716,6 +2749,10 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; + /* move token into prog->aux, reuse taken refcnt */ + prog->aux->token = token; + token = NULL; + err = security_bpf_prog_alloc(prog->aux); if (err) goto free_prog; @@ -2817,6 +2854,8 @@ free_prog: if (prog->aux->attach_btf) btf_put(prog->aux->attach_btf); bpf_prog_free(prog); +put_token: + bpf_token_put(token); return err; } @@ -3806,7 +3845,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_SK_LOOKUP: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: - if (!bpf_net_capable()) + if (!bpf_token_capable(prog->aux->token, CAP_NET_ADMIN)) /* cg-skb progs can be loaded by unpriv user. * check permissions at attach time. */ diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index 06c34dae658e..5a51e6b8f6bf 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -79,6 +79,20 @@ static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp) seq_printf(m, "allowed_maps:\tany\n"); else seq_printf(m, "allowed_maps:\t0x%llx\n", token->allowed_maps); + + BUILD_BUG_ON(__MAX_BPF_PROG_TYPE >= 64); + mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1; + if ((token->allowed_progs & mask) == mask) + seq_printf(m, "allowed_progs:\tany\n"); + else + seq_printf(m, "allowed_progs:\t0x%llx\n", token->allowed_progs); + + BUILD_BUG_ON(__MAX_BPF_ATTACH_TYPE >= 64); + mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1; + if ((token->allowed_attachs & mask) == mask) + seq_printf(m, "allowed_attachs:\tany\n"); + else + seq_printf(m, "allowed_attachs:\t0x%llx\n", token->allowed_attachs); } #define BPF_TOKEN_INODE_NAME "bpf-token" @@ -169,6 +183,8 @@ int bpf_token_create(union bpf_attr *attr) mnt_opts = path.dentry->d_sb->s_fs_info; token->allowed_cmds = mnt_opts->delegate_cmds; token->allowed_maps = mnt_opts->delegate_maps; + token->allowed_progs = mnt_opts->delegate_progs; + token->allowed_attachs = mnt_opts->delegate_attachs; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) { @@ -228,3 +244,14 @@ bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type t return token->allowed_maps & (1ULL << type); } + +bool bpf_token_allow_prog_type(const struct bpf_token *token, + enum bpf_prog_type prog_type, + enum bpf_attach_type attach_type) +{ + if (!token || prog_type >= __MAX_BPF_PROG_TYPE || attach_type >= __MAX_BPF_ATTACH_TYPE) + return false; + + return (token->allowed_progs & (1ULL << prog_type)) && + (token->allowed_attachs & (1ULL << attach_type)); +} -- cgit From 4cbb270e115bc197ff2046aeb54cc951666b16ec Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:19 -0800 Subject: bpf: take into account BPF token when fetching helper protos Instead of performing unconditional system-wide bpf_capable() and perfmon_capable() calls inside bpf_base_func_proto() function (and other similar ones) to determine eligibility of a given BPF helper for a given program, use previously recorded BPF token during BPF_PROG_LOAD command handling to inform the decision. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-8-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 6 +++--- kernel/bpf/helpers.c | 6 +++--- kernel/bpf/syscall.c | 5 +++-- 3 files changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 491d20038cbe..98e0e3835b28 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1630,7 +1630,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } } @@ -2191,7 +2191,7 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } } @@ -2348,7 +2348,7 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ee9bdf29246a..b3be5742d6f1 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1679,7 +1679,7 @@ const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; const struct bpf_func_proto bpf_task_pt_regs_proto __weak; const struct bpf_func_proto * -bpf_base_func_proto(enum bpf_func_id func_id) +bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -1730,7 +1730,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) break; } - if (!bpf_capable()) + if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return NULL; switch (func_id) { @@ -1788,7 +1788,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) break; } - if (!perfmon_capable()) + if (!bpf_token_capable(prog->aux->token, CAP_PERFMON)) return NULL; switch (func_id) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2c8393c21b8c..1cc03f08c9cd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5712,7 +5712,7 @@ static const struct bpf_func_proto bpf_sys_bpf_proto = { const struct bpf_func_proto * __weak tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } BPF_CALL_1(bpf_sys_close, u32, fd) @@ -5762,7 +5762,8 @@ syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_sys_bpf: - return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto; + return !bpf_token_capable(prog->aux->token, CAP_PERFMON) + ? NULL : &bpf_sys_bpf_proto; case BPF_FUNC_btf_find_by_name_kind: return &bpf_btf_find_by_name_kind_proto; case BPF_FUNC_sys_close: -- cgit From 8062fb12de99b2da33754c6a3be1bfc30d9a35f4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:20 -0800 Subject: bpf: consistently use BPF token throughout BPF verifier logic Remove remaining direct queries to perfmon_capable() and bpf_capable() in BPF verifier logic and instead use BPF token (if available) to make decisions about privileges. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-9-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 2 +- kernel/bpf/core.c | 2 +- kernel/bpf/verifier.c | 13 ++++++------- 3 files changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 4a4a67956e21..8d365bda9a8b 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -82,7 +82,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); u32 elem_size, index_mask, max_entries; - bool bypass_spec_v1 = bpf_bypass_spec_v1(); + bool bypass_spec_v1 = bpf_bypass_spec_v1(NULL); u64 array_size, mask64; struct bpf_array *array; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 47085839af8d..ced511f44174 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -675,7 +675,7 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) void bpf_prog_kallsyms_add(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp) || - !bpf_capable()) + !bpf_token_capable(fp->aux->token, CAP_BPF)) return; bpf_prog_ksym_set_addr(fp); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e5ce530641ba..45e85fb76d82 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20597,7 +20597,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel); - is_priv = bpf_capable(); + + env->allow_ptr_leaks = bpf_allow_ptr_leaks(env->prog->aux->token); + env->allow_uninit_stack = bpf_allow_uninit_stack(env->prog->aux->token); + env->bypass_spec_v1 = bpf_bypass_spec_v1(env->prog->aux->token); + env->bypass_spec_v4 = bpf_bypass_spec_v4(env->prog->aux->token); + env->bpf_capable = is_priv = bpf_token_capable(env->prog->aux->token, CAP_BPF); bpf_get_btf_vmlinux(); @@ -20629,12 +20634,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) env->strict_alignment = false; - env->allow_ptr_leaks = bpf_allow_ptr_leaks(); - env->allow_uninit_stack = bpf_allow_uninit_stack(); - env->bypass_spec_v1 = bpf_bypass_spec_v1(); - env->bypass_spec_v4 = bpf_bypass_spec_v4(); - env->bpf_capable = bpf_capable(); - if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS; -- cgit From c3dd6e94df7193f33f45d33303f5e85afb2a72dc Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:21 -0800 Subject: bpf,lsm: refactor bpf_prog_alloc/bpf_prog_free LSM hooks Based on upstream discussion ([0]), rework existing bpf_prog_alloc_security LSM hook. Rename it to bpf_prog_load and instead of passing bpf_prog_aux, pass proper bpf_prog pointer for a full BPF program struct. Also, we pass bpf_attr union with all the user-provided arguments for BPF_PROG_LOAD command. This will give LSMs as much information as we can basically provide. The hook is also BPF token-aware now, and optional bpf_token struct is passed as a third argument. bpf_prog_load LSM hook is called after a bunch of sanity checks were performed, bpf_prog and bpf_prog_aux were allocated and filled out, but right before performing full-fledged BPF verification step. bpf_prog_free LSM hook is now accepting struct bpf_prog argument, for consistency. SELinux code is adjusted to all new names, types, and signatures. Note, given that bpf_prog_load (previously bpf_prog_alloc) hook can be used by some LSMs to allocate extra security blob, but also by other LSMs to reject BPF program loading, we need to make sure that bpf_prog_free LSM hook is called after bpf_prog_load/bpf_prog_alloc one *even* if the hook itself returned error. If we don't do that, we run the risk of leaking memory. This seems to be possible today when combining SELinux and BPF LSM, as one example, depending on their relative ordering. Also, for BPF LSM setup, add bpf_prog_load and bpf_prog_free to sleepable LSM hooks list, as they are both executed in sleepable context. Also drop bpf_prog_load hook from untrusted, as there is no issue with refcount or anything else anymore, that originally forced us to add it to untrusted list in c0c852dd1876 ("bpf: Do not mark certain LSM hook arguments as trusted"). We now trigger this hook much later and it should not be an issue anymore. [0] https://lore.kernel.org/bpf/9fe88aef7deabbe87d3fc38c4aea3c69.paul@paul-moore.com/ Acked-by: Paul Moore Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-10-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_lsm.c | 5 +++-- kernel/bpf/syscall.c | 25 +++++++++++++------------ 2 files changed, 16 insertions(+), 14 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index e14c822f8911..3e956f6302f3 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -263,6 +263,8 @@ BTF_ID(func, bpf_lsm_bpf_map) BTF_ID(func, bpf_lsm_bpf_map_alloc_security) BTF_ID(func, bpf_lsm_bpf_map_free_security) BTF_ID(func, bpf_lsm_bpf_prog) +BTF_ID(func, bpf_lsm_bpf_prog_load) +BTF_ID(func, bpf_lsm_bpf_prog_free) BTF_ID(func, bpf_lsm_bprm_check_security) BTF_ID(func, bpf_lsm_bprm_committed_creds) BTF_ID(func, bpf_lsm_bprm_committing_creds) @@ -346,8 +348,7 @@ BTF_SET_END(sleepable_lsm_hooks) BTF_SET_START(untrusted_lsm_hooks) BTF_ID(func, bpf_lsm_bpf_map_free_security) -BTF_ID(func, bpf_lsm_bpf_prog_alloc_security) -BTF_ID(func, bpf_lsm_bpf_prog_free_security) +BTF_ID(func, bpf_lsm_bpf_prog_free) BTF_ID(func, bpf_lsm_file_alloc_security) BTF_ID(func, bpf_lsm_file_free_security) #ifdef CONFIG_SECURITY_NETWORK diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1cc03f08c9cd..7717c7c7b95d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2162,7 +2162,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) kvfree(aux->func_info); kfree(aux->func_info_aux); free_uid(aux->user); - security_bpf_prog_free(aux); + security_bpf_prog_free(aux->prog); bpf_prog_free(aux->prog); } @@ -2753,10 +2753,6 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) prog->aux->token = token; token = NULL; - err = security_bpf_prog_alloc(prog->aux); - if (err) - goto free_prog; - prog->aux->user = get_current_user(); prog->len = attr->insn_cnt; @@ -2764,12 +2760,12 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (copy_from_bpfptr(prog->insns, make_bpfptr(attr->insns, uattr.is_kernel), bpf_prog_insn_size(prog)) != 0) - goto free_prog_sec; + goto free_prog; /* copy eBPF program license from user space */ if (strncpy_from_bpfptr(license, make_bpfptr(attr->license, uattr.is_kernel), sizeof(license) - 1) < 0) - goto free_prog_sec; + goto free_prog; license[sizeof(license) - 1] = 0; /* eBPF programs must be GPL compatible to use GPL-ed functions */ @@ -2783,25 +2779,29 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (bpf_prog_is_dev_bound(prog->aux)) { err = bpf_prog_dev_bound_init(prog, attr); if (err) - goto free_prog_sec; + goto free_prog; } if (type == BPF_PROG_TYPE_EXT && dst_prog && bpf_prog_is_dev_bound(dst_prog->aux)) { err = bpf_prog_dev_bound_inherit(prog, dst_prog); if (err) - goto free_prog_sec; + goto free_prog; } /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); if (err < 0) - goto free_prog_sec; + goto free_prog; prog->aux->load_time = ktime_get_boottime_ns(); err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, sizeof(attr->prog_name)); if (err < 0) + goto free_prog; + + err = security_bpf_prog_load(prog, attr, token); + if (err) goto free_prog_sec; /* run eBPF verifier */ @@ -2847,10 +2847,11 @@ free_used_maps: */ __bpf_prog_put_noref(prog, prog->aux->real_func_cnt); return err; + free_prog_sec: - free_uid(prog->aux->user); - security_bpf_prog_free(prog->aux); + security_bpf_prog_free(prog); free_prog: + free_uid(prog->aux->user); if (prog->aux->attach_btf) btf_put(prog->aux->attach_btf); bpf_prog_free(prog); -- cgit From 66d636d70a79c1d37e3eea67ab50969e6aaef983 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:22 -0800 Subject: bpf,lsm: refactor bpf_map_alloc/bpf_map_free LSM hooks Similarly to bpf_prog_alloc LSM hook, rename and extend bpf_map_alloc hook into bpf_map_create, taking not just struct bpf_map, but also bpf_attr and bpf_token, to give a fuller context to LSMs. Unlike bpf_prog_alloc, there is no need to move the hook around, as it currently is firing right before allocating BPF map ID and FD, which seems to be a sweet spot. But like bpf_prog_alloc/bpf_prog_free combo, make sure that bpf_map_free LSM hook is called even if bpf_map_create hook returned error, as if few LSMs are combined together it could be that one LSM successfully allocated security blob for its needs, while subsequent LSM rejected BPF map creation. The former LSM would still need to free up LSM blob, so we need to ensure security_bpf_map_free() is called regardless of the outcome. Acked-by: Paul Moore Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-11-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_lsm.c | 6 +++--- kernel/bpf/syscall.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 3e956f6302f3..9e4e615f11eb 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -260,8 +260,8 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) BTF_SET_START(sleepable_lsm_hooks) BTF_ID(func, bpf_lsm_bpf) BTF_ID(func, bpf_lsm_bpf_map) -BTF_ID(func, bpf_lsm_bpf_map_alloc_security) -BTF_ID(func, bpf_lsm_bpf_map_free_security) +BTF_ID(func, bpf_lsm_bpf_map_create) +BTF_ID(func, bpf_lsm_bpf_map_free) BTF_ID(func, bpf_lsm_bpf_prog) BTF_ID(func, bpf_lsm_bpf_prog_load) BTF_ID(func, bpf_lsm_bpf_prog_free) @@ -347,7 +347,7 @@ BTF_ID(func, bpf_lsm_userns_create) BTF_SET_END(sleepable_lsm_hooks) BTF_SET_START(untrusted_lsm_hooks) -BTF_ID(func, bpf_lsm_bpf_map_free_security) +BTF_ID(func, bpf_lsm_bpf_map_free) BTF_ID(func, bpf_lsm_bpf_prog_free) BTF_ID(func, bpf_lsm_file_alloc_security) BTF_ID(func, bpf_lsm_file_free_security) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7717c7c7b95d..aff045eed375 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1309,9 +1309,9 @@ static int map_create(union bpf_attr *attr) attr->btf_vmlinux_value_type_id; } - err = security_bpf_map_alloc(map); + err = security_bpf_map_create(map, attr, token); if (err) - goto free_map; + goto free_map_sec; err = bpf_map_alloc_id(map); if (err) -- cgit From d734ca7b33dbf60eb15dcf7c44f3da7073356777 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:23 -0800 Subject: bpf,lsm: add BPF token LSM hooks Wire up bpf_token_create and bpf_token_free LSM hooks, which allow to allocate LSM security blob (we add `void *security` field to struct bpf_token for that), but also control who can instantiate BPF token. This follows existing pattern for BPF map and BPF prog. Also add security_bpf_token_allow_cmd() and security_bpf_token_capable() LSM hooks that allow LSM implementation to control and negate (if necessary) BPF token's delegation of a specific bpf_cmd and capability, respectively. Acked-by: Paul Moore Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-12-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_lsm.c | 4 ++++ kernel/bpf/token.c | 18 ++++++++++++------ 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 9e4e615f11eb..7d2f96413a57 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -265,6 +265,10 @@ BTF_ID(func, bpf_lsm_bpf_map_free) BTF_ID(func, bpf_lsm_bpf_prog) BTF_ID(func, bpf_lsm_bpf_prog_load) BTF_ID(func, bpf_lsm_bpf_prog_free) +BTF_ID(func, bpf_lsm_bpf_token_create) +BTF_ID(func, bpf_lsm_bpf_token_free) +BTF_ID(func, bpf_lsm_bpf_token_cmd) +BTF_ID(func, bpf_lsm_bpf_token_capable) BTF_ID(func, bpf_lsm_bprm_check_security) BTF_ID(func, bpf_lsm_bprm_committed_creds) BTF_ID(func, bpf_lsm_bprm_committing_creds) diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index 5a51e6b8f6bf..17212efcde60 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -7,6 +7,7 @@ #include #include #include +#include bool bpf_token_capable(const struct bpf_token *token, int cap) { @@ -14,10 +15,9 @@ bool bpf_token_capable(const struct bpf_token *token, int cap) * token's userns is *exactly* the same as current user's userns */ if (token && current_user_ns() == token->userns) { - if (ns_capable(token->userns, cap)) - return true; - if (cap != CAP_SYS_ADMIN && ns_capable(token->userns, CAP_SYS_ADMIN)) - return true; + if (ns_capable(token->userns, cap) || + (cap != CAP_SYS_ADMIN && ns_capable(token->userns, CAP_SYS_ADMIN))) + return security_bpf_token_capable(token, cap) == 0; } /* otherwise fallback to capable() checks */ return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN)); @@ -30,6 +30,7 @@ void bpf_token_inc(struct bpf_token *token) static void bpf_token_free(struct bpf_token *token) { + security_bpf_token_free(token); put_user_ns(token->userns); kvfree(token); } @@ -186,6 +187,10 @@ int bpf_token_create(union bpf_attr *attr) token->allowed_progs = mnt_opts->delegate_progs; token->allowed_attachs = mnt_opts->delegate_attachs; + err = security_bpf_token_create(token, attr, &path); + if (err) + goto out_token; + fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) { err = fd; @@ -233,8 +238,9 @@ bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd) */ if (!token || current_user_ns() != token->userns) return false; - - return token->allowed_cmds & (1ULL << cmd); + if (!(token->allowed_cmds & (1ULL << cmd))) + return false; + return security_bpf_token_cmd(token, cmd) == 0; } bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type) -- cgit From f08a1c658257c73697a819c4ded3a84b6f0ead74 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2023 14:40:48 -0800 Subject: bpf: Let bpf_prog_pack_free handle any pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, bpf_prog_pack_free only can only free pointer to struct bpf_binary_header, which is not flexible. Add a size argument to bpf_prog_pack_free so that it can handle any pointer. Signed-off-by: Song Liu Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich # on s390x Reviewed-by: Björn Töpel Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20231206224054.492250-2-song@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 21 ++++++++++----------- kernel/bpf/dispatcher.c | 5 +---- 2 files changed, 11 insertions(+), 15 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ced511f44174..c34513d645c4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -928,20 +928,20 @@ out: return ptr; } -void bpf_prog_pack_free(struct bpf_binary_header *hdr) +void bpf_prog_pack_free(void *ptr, u32 size) { struct bpf_prog_pack *pack = NULL, *tmp; unsigned int nbits; unsigned long pos; mutex_lock(&pack_mutex); - if (hdr->size > BPF_PROG_PACK_SIZE) { - bpf_jit_free_exec(hdr); + if (size > BPF_PROG_PACK_SIZE) { + bpf_jit_free_exec(ptr); goto out; } list_for_each_entry(tmp, &pack_list, list) { - if ((void *)hdr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > (void *)hdr) { + if (ptr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > ptr) { pack = tmp; break; } @@ -950,10 +950,10 @@ void bpf_prog_pack_free(struct bpf_binary_header *hdr) if (WARN_ONCE(!pack, "bpf_prog_pack bug\n")) goto out; - nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size); - pos = ((unsigned long)hdr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT; + nbits = BPF_PROG_SIZE_TO_NBITS(size); + pos = ((unsigned long)ptr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT; - WARN_ONCE(bpf_arch_text_invalidate(hdr, hdr->size), + WARN_ONCE(bpf_arch_text_invalidate(ptr, size), "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n"); bitmap_clear(pack->bitmap, pos, nbits); @@ -1100,8 +1100,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, *rw_header = kvmalloc(size, GFP_KERNEL); if (!*rw_header) { - bpf_arch_text_copy(&ro_header->size, &size, sizeof(size)); - bpf_prog_pack_free(ro_header); + bpf_prog_pack_free(ro_header, size); bpf_jit_uncharge_modmem(size); return NULL; } @@ -1132,7 +1131,7 @@ int bpf_jit_binary_pack_finalize(struct bpf_prog *prog, kvfree(rw_header); if (IS_ERR(ptr)) { - bpf_prog_pack_free(ro_header); + bpf_prog_pack_free(ro_header, ro_header->size); return PTR_ERR(ptr); } return 0; @@ -1153,7 +1152,7 @@ void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, { u32 size = ro_header->size; - bpf_prog_pack_free(ro_header); + bpf_prog_pack_free(ro_header, size); kvfree(rw_header); bpf_jit_uncharge_modmem(size); } diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index fa3e9225aedc..56760fc10e78 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -150,10 +150,7 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, goto out; d->rw_image = bpf_jit_alloc_exec(PAGE_SIZE); if (!d->rw_image) { - u32 size = PAGE_SIZE; - - bpf_arch_text_copy(d->image, &size, sizeof(size)); - bpf_prog_pack_free((struct bpf_binary_header *)d->image); + bpf_prog_pack_free(d->image, PAGE_SIZE); d->image = NULL; goto out; } -- cgit From 7a3d9a159b178e87306a6e989071ed9a114a1a31 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2023 14:40:49 -0800 Subject: bpf: Adjust argument names of arch_prepare_bpf_trampoline() We are using "im" for "struct bpf_tramp_image" and "tr" for "struct bpf_trampoline" in most of the code base. The only exception is the prototype and fallback version of arch_prepare_bpf_trampoline(). Update them to match the rest of the code base. We mix "orig_call" and "func_addr" for the argument in different versions of arch_prepare_bpf_trampoline(). s/orig_call/func_addr/g so they match. Signed-off-by: Song Liu Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich # on s390x Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20231206224054.492250-3-song@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/trampoline.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index e97aeda3a86b..e114a1c7961e 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -1032,10 +1032,10 @@ bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog) } int __weak -arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, +arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, - void *orig_call) + void *func_addr) { return -ENOTSUPP; } -- cgit From 82583daa2efc2e336962b231a46bad03a280b3e0 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2023 14:40:50 -0800 Subject: bpf: Add helpers for trampoline image management As BPF trampoline of different archs moves from bpf_jit_[alloc|free]_exec() to bpf_prog_pack_[alloc|free](), we need to use different _alloc, _free for different archs during the transition. Add the following helpers for this transition: void *arch_alloc_bpf_trampoline(unsigned int size); void arch_free_bpf_trampoline(void *image, unsigned int size); void arch_protect_bpf_trampoline(void *image, unsigned int size); void arch_unprotect_bpf_trampoline(void *image, unsigned int size); The fallback version of these helpers require size <= PAGE_SIZE, but they are only called with size == PAGE_SIZE. They will be called with size < PAGE_SIZE when arch_bpf_trampoline_size() helper is introduced later. Signed-off-by: Song Liu Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich # on s390x Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20231206224054.492250-4-song@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_struct_ops.c | 12 +++++------- kernel/bpf/trampoline.c | 46 ++++++++++++++++++++++++++++++++++++++------- 2 files changed, 44 insertions(+), 14 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index db6176fb64dc..e9e95879bce2 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -515,7 +515,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (err) goto reset_unlock; } - set_memory_rox((long)st_map->image, 1); + arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE); /* Let bpf_link handle registration & unregistration. * * Pair with smp_load_acquire() during lookup_elem(). @@ -524,7 +524,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto unlock; } - set_memory_rox((long)st_map->image, 1); + arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE); err = st_ops->reg(kdata); if (likely(!err)) { /* This refcnt increment on the map here after @@ -547,8 +547,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, * there was a race in registering the struct_ops (under the same name) to * a sub-system through different struct_ops's maps. */ - set_memory_nx((long)st_map->image, 1); - set_memory_rw((long)st_map->image, 1); + arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE); reset_unlock: bpf_struct_ops_map_put_progs(st_map); @@ -616,7 +615,7 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map) bpf_struct_ops_map_put_progs(st_map); bpf_map_area_free(st_map->links); if (st_map->image) { - bpf_jit_free_exec(st_map->image); + arch_free_bpf_trampoline(st_map->image, PAGE_SIZE); bpf_jit_uncharge_modmem(PAGE_SIZE); } bpf_map_area_free(st_map->uvalue); @@ -691,7 +690,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) return ERR_PTR(ret); } - st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); + st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE); if (!st_map->image) { /* __bpf_struct_ops_map_free() uses st_map->image as flag * for "charged or not". In this case, we need to unchange @@ -711,7 +710,6 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) } mutex_init(&st_map->lock); - set_vm_flush_reset_perms(st_map->image); bpf_map_init_from_attr(map, attr); return map; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index e114a1c7961e..affbcbf7e76e 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -254,7 +254,7 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_a static void bpf_tramp_image_free(struct bpf_tramp_image *im) { bpf_image_ksym_del(&im->ksym); - bpf_jit_free_exec(im->image); + arch_free_bpf_trampoline(im->image, PAGE_SIZE); bpf_jit_uncharge_modmem(PAGE_SIZE); percpu_ref_exit(&im->pcref); kfree_rcu(im, rcu); @@ -365,10 +365,9 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key) goto out_free_im; err = -ENOMEM; - im->image = image = bpf_jit_alloc_exec(PAGE_SIZE); + im->image = image = arch_alloc_bpf_trampoline(PAGE_SIZE); if (!image) goto out_uncharge; - set_vm_flush_reset_perms(image); err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL); if (err) @@ -381,7 +380,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key) return im; out_free_image: - bpf_jit_free_exec(im->image); + arch_free_bpf_trampoline(im->image, PAGE_SIZE); out_uncharge: bpf_jit_uncharge_modmem(PAGE_SIZE); out_free_im: @@ -444,7 +443,7 @@ again: if (err < 0) goto out_free; - set_memory_rox((long)im->image, 1); + arch_protect_bpf_trampoline(im->image, PAGE_SIZE); WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) @@ -465,8 +464,7 @@ again: tr->fops->trampoline = 0; /* reset im->image memory attr for arch_prepare_bpf_trampoline */ - set_memory_nx((long)im->image, 1); - set_memory_rw((long)im->image, 1); + arch_unprotect_bpf_trampoline(im->image, PAGE_SIZE); goto again; } #endif @@ -1040,6 +1038,40 @@ arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image return -ENOTSUPP; } +void * __weak arch_alloc_bpf_trampoline(unsigned int size) +{ + void *image; + + if (WARN_ON_ONCE(size > PAGE_SIZE)) + return NULL; + image = bpf_jit_alloc_exec(PAGE_SIZE); + if (image) + set_vm_flush_reset_perms(image); + return image; +} + +void __weak arch_free_bpf_trampoline(void *image, unsigned int size) +{ + WARN_ON_ONCE(size > PAGE_SIZE); + /* bpf_jit_free_exec doesn't need "size", but + * bpf_prog_pack_free() needs it. + */ + bpf_jit_free_exec(image); +} + +void __weak arch_protect_bpf_trampoline(void *image, unsigned int size) +{ + WARN_ON_ONCE(size > PAGE_SIZE); + set_memory_rox((long)image, 1); +} + +void __weak arch_unprotect_bpf_trampoline(void *image, unsigned int size) +{ + WARN_ON_ONCE(size > PAGE_SIZE); + set_memory_nx((long)image, 1); + set_memory_rw((long)image, 1); +} + static int __init init_trampolines(void) { int i; -- cgit From 96d1b7c081c0c96cbe8901045f4ff15a2e9974a2 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2023 14:40:52 -0800 Subject: bpf: Add arch_bpf_trampoline_size() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This helper will be used to calculate the size of the trampoline before allocating the memory. arch_prepare_bpf_trampoline() for arm64 and riscv64 can use arch_bpf_trampoline_size() to check the trampoline fits in the image. OTOH, arch_prepare_bpf_trampoline() for s390 has to call the JIT process twice, so it cannot use arch_bpf_trampoline_size(). Signed-off-by: Song Liu Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich # on s390x Acked-by: Jiri Olsa Acked-by: Björn Töpel Tested-by: Björn Töpel # on riscv Link: https://lore.kernel.org/r/20231206224054.492250-6-song@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/trampoline.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index affbcbf7e76e..b553cbd89e55 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -1072,6 +1072,12 @@ void __weak arch_unprotect_bpf_trampoline(void *image, unsigned int size) set_memory_rw((long)image, 1); } +int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, + struct bpf_tramp_links *tlinks, void *func_addr) +{ + return -ENOTSUPP; +} + static int __init init_trampolines(void) { int i; -- cgit From 26ef208c209a0e6eed8942a5d191b39dccfa6e38 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2023 14:40:53 -0800 Subject: bpf: Use arch_bpf_trampoline_size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of blindly allocating PAGE_SIZE for each trampoline, check the size of the trampoline with arch_bpf_trampoline_size(). This size is saved in bpf_tramp_image->size, and used for modmem charge/uncharge. The fallback arch_alloc_bpf_trampoline() still allocates a whole page because we need to use set_memory_* to protect the memory. struct_ops trampoline still uses a whole page for multiple trampolines. With this size check at caller (regular trampoline and struct_ops trampoline), remove arch_bpf_trampoline_size() from arch_prepare_bpf_trampoline() in archs. Also, update bpf_image_ksym_add() to handle symbol of different sizes. Signed-off-by: Song Liu Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich # on s390x Acked-by: Jiri Olsa Acked-by: Björn Töpel Tested-by: Björn Töpel # on riscv Link: https://lore.kernel.org/r/20231206224054.492250-7-song@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_struct_ops.c | 7 ++++++ kernel/bpf/dispatcher.c | 2 +- kernel/bpf/trampoline.c | 55 ++++++++++++++++++++++++++++----------------- 3 files changed, 42 insertions(+), 22 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index e9e95879bce2..4d53c53fc5aa 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -355,6 +355,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, void *image, void *image_end) { u32 flags; + int size; tlinks[BPF_TRAMP_FENTRY].links[0] = link; tlinks[BPF_TRAMP_FENTRY].nr_links = 1; @@ -362,6 +363,12 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, * and it must be used alone. */ flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0; + + size = arch_bpf_trampoline_size(model, flags, tlinks, NULL); + if (size < 0) + return size; + if (size > (unsigned long)image_end - (unsigned long)image) + return -E2BIG; return arch_prepare_bpf_trampoline(NULL, image, image_end, model, flags, tlinks, NULL); } diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index 56760fc10e78..70fb82bf1637 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -154,7 +154,7 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, d->image = NULL; goto out; } - bpf_image_ksym_add(d->image, &d->ksym); + bpf_image_ksym_add(d->image, PAGE_SIZE, &d->ksym); } prev_num_progs = d->num_progs; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index b553cbd89e55..d382f5ebe06c 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -115,10 +115,10 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog) (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); } -void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym) +void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym) { ksym->start = (unsigned long) data; - ksym->end = ksym->start + PAGE_SIZE; + ksym->end = ksym->start + size; bpf_ksym_add(ksym); perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, PAGE_SIZE, false, ksym->name); @@ -254,8 +254,8 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_a static void bpf_tramp_image_free(struct bpf_tramp_image *im) { bpf_image_ksym_del(&im->ksym); - arch_free_bpf_trampoline(im->image, PAGE_SIZE); - bpf_jit_uncharge_modmem(PAGE_SIZE); + arch_free_bpf_trampoline(im->image, im->size); + bpf_jit_uncharge_modmem(im->size); percpu_ref_exit(&im->pcref); kfree_rcu(im, rcu); } @@ -349,7 +349,7 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im) call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks); } -static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key) +static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, int size) { struct bpf_tramp_image *im; struct bpf_ksym *ksym; @@ -360,12 +360,13 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key) if (!im) goto out; - err = bpf_jit_charge_modmem(PAGE_SIZE); + err = bpf_jit_charge_modmem(size); if (err) goto out_free_im; + im->size = size; err = -ENOMEM; - im->image = image = arch_alloc_bpf_trampoline(PAGE_SIZE); + im->image = image = arch_alloc_bpf_trampoline(size); if (!image) goto out_uncharge; @@ -376,13 +377,13 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key) ksym = &im->ksym; INIT_LIST_HEAD_RCU(&ksym->lnode); snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key); - bpf_image_ksym_add(image, ksym); + bpf_image_ksym_add(image, size, ksym); return im; out_free_image: - arch_free_bpf_trampoline(im->image, PAGE_SIZE); + arch_free_bpf_trampoline(im->image, im->size); out_uncharge: - bpf_jit_uncharge_modmem(PAGE_SIZE); + bpf_jit_uncharge_modmem(size); out_free_im: kfree(im); out: @@ -395,7 +396,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut struct bpf_tramp_links *tlinks; u32 orig_flags = tr->flags; bool ip_arg = false; - int err, total; + int err, total, size; tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg); if (IS_ERR(tlinks)) @@ -408,12 +409,6 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut goto out; } - im = bpf_tramp_image_alloc(tr->key); - if (IS_ERR(im)) { - err = PTR_ERR(im); - goto out; - } - /* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */ tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX); @@ -437,13 +432,31 @@ again: tr->flags |= BPF_TRAMP_F_ORIG_STACK; #endif - err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE, + size = arch_bpf_trampoline_size(&tr->func.model, tr->flags, + tlinks, tr->func.addr); + if (size < 0) { + err = size; + goto out; + } + + if (size > PAGE_SIZE) { + err = -E2BIG; + goto out; + } + + im = bpf_tramp_image_alloc(tr->key, size); + if (IS_ERR(im)) { + err = PTR_ERR(im); + goto out; + } + + err = arch_prepare_bpf_trampoline(im, im->image, im->image + size, &tr->func.model, tr->flags, tlinks, tr->func.addr); if (err < 0) goto out_free; - arch_protect_bpf_trampoline(im->image, PAGE_SIZE); + arch_protect_bpf_trampoline(im->image, im->size); WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) @@ -463,8 +476,8 @@ again: tr->fops->func = NULL; tr->fops->trampoline = 0; - /* reset im->image memory attr for arch_prepare_bpf_trampoline */ - arch_unprotect_bpf_trampoline(im->image, PAGE_SIZE); + /* free im memory and reallocate later */ + bpf_tramp_image_free(im); goto again; } #endif -- cgit From a833a17aeac73b33f79433d7cee68d5cafd71e4f Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Wed, 6 Dec 2023 23:11:48 -0500 Subject: bpf: Fix verification of indirect var-off stack access This patch fixes a bug around the verification of possibly-zero-sized stack accesses. When the access was done through a var-offset stack pointer, check_stack_access_within_bounds was incorrectly computing the maximum-offset of a zero-sized read to be the same as the register's min offset. Instead, we have to take in account the register's maximum possible value. The patch also simplifies how the max offset is checked; the check is now simpler than for min offset. The bug was allowing accesses to erroneously pass the check_stack_access_within_bounds() checks, only to later crash in check_stack_range_initialized() when all the possibly-affected stack slots are iterated (this time with a correct max offset). check_stack_range_initialized() is relying on check_stack_access_within_bounds() for its accesses to the stack-tracking vector to be within bounds; in the case of zero-sized accesses, we were essentially only verifying that the lowest possible slot was within bounds. We would crash when the max-offset of the stack pointer was >= 0 (which shouldn't pass verification, and hopefully is not something anyone's code attempts to do in practice). Thanks Hao for reporting! Fixes: 01f810ace9ed3 ("bpf: Allow variable-offset stack access") Reported-by: Hao Sun Signed-off-by: Andrei Matei Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231207041150.229139-2-andreimatei1@gmail.com Closes: https://lore.kernel.org/bpf/CACkBjsZGEUaRCHsmaX=h-efVogsRfK1FPxmkgb0Os_frnHiNdw@mail.gmail.com/ --- kernel/bpf/verifier.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 45e85fb76d82..85e4ab61084f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6620,10 +6620,7 @@ static int check_stack_access_within_bounds( if (tnum_is_const(reg->var_off)) { min_off = reg->var_off.value + off; - if (access_size > 0) - max_off = min_off + access_size - 1; - else - max_off = min_off; + max_off = min_off + access_size; } else { if (reg->smax_value >= BPF_MAX_VAR_OFF || reg->smin_value <= -BPF_MAX_VAR_OFF) { @@ -6632,15 +6629,12 @@ static int check_stack_access_within_bounds( return -EACCES; } min_off = reg->smin_value + off; - if (access_size > 0) - max_off = reg->smax_value + off + access_size - 1; - else - max_off = min_off; + max_off = reg->smax_value + off + access_size; } err = check_stack_slot_within_bounds(min_off, state, type); - if (!err) - err = check_stack_slot_within_bounds(max_off, state, type); + if (!err && max_off > 0) + err = -EINVAL; /* out of stack access into non-negative offsets */ if (err) { if (tnum_is_const(reg->var_off)) { -- cgit From 1d38a9ee81570c4bd61f557832dead4d6f816760 Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Wed, 6 Dec 2023 23:11:50 -0500 Subject: bpf: Guard stack limits against 32bit overflow This patch promotes the arithmetic around checking stack bounds to be done in the 64-bit domain, instead of the current 32bit. The arithmetic implies adding together a 64-bit register with a int offset. The register was checked to be below 1<<29 when it was variable, but not when it was fixed. The offset either comes from an instruction (in which case it is 16 bit), from another register (in which case the caller checked it to be below 1<<29 [1]), or from the size of an argument to a kfunc (in which case it can be a u32 [2]). Between the register being inconsistently checked to be below 1<<29, and the offset being up to an u32, it appears that we were open to overflowing the `int`s which were currently used for arithmetic. [1] https://github.com/torvalds/linux/blob/815fb87b753055df2d9e50f6cd80eb10235fe3e9/kernel/bpf/verifier.c#L7494-L7498 [2] https://github.com/torvalds/linux/blob/815fb87b753055df2d9e50f6cd80eb10235fe3e9/kernel/bpf/verifier.c#L11904 Reported-by: Andrii Nakryiko Signed-off-by: Andrei Matei Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231207041150.229139-4-andreimatei1@gmail.com --- kernel/bpf/verifier.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 85e4ab61084f..0e77bb52542d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6577,7 +6577,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, * The minimum valid offset is -MAX_BPF_STACK for writes, and * -state->allocated_stack for reads. */ -static int check_stack_slot_within_bounds(int off, +static int check_stack_slot_within_bounds(s64 off, struct bpf_func_state *state, enum bpf_access_type t) { @@ -6606,7 +6606,7 @@ static int check_stack_access_within_bounds( struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; struct bpf_func_state *state = func(env, reg); - int min_off, max_off; + s64 min_off, max_off; int err; char *err_extra; @@ -6619,7 +6619,7 @@ static int check_stack_access_within_bounds( err_extra = " write to"; if (tnum_is_const(reg->var_off)) { - min_off = reg->var_off.value + off; + min_off = (s64)reg->var_off.value + off; max_off = min_off + access_size; } else { if (reg->smax_value >= BPF_MAX_VAR_OFF || -- cgit From 6b4a64bafd107e521c01eec3453ce94a3fb38529 Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Thu, 7 Dec 2023 22:25:18 -0500 Subject: bpf: Fix accesses to uninit stack slots Privileged programs are supposed to be able to read uninitialized stack memory (ever since 6715df8d5) but, before this patch, these accesses were permitted inconsistently. In particular, accesses were permitted above state->allocated_stack, but not below it. In other words, if the stack was already "large enough", the access was permitted, but otherwise the access was rejected instead of being allowed to "grow the stack". This undesired rejection was happening in two places: - in check_stack_slot_within_bounds() - in check_stack_range_initialized() This patch arranges for these accesses to be permitted. A bunch of tests that were relying on the old rejection had to change; all of them were changed to add also run unprivileged, in which case the old behavior persists. One tests couldn't be updated - global_func16 - because it can't run unprivileged for other reasons. This patch also fixes the tracking of the stack size for variable-offset reads. This second fix is bundled in the same commit as the first one because they're inter-related. Before this patch, writes to the stack using registers containing a variable offset (as opposed to registers with fixed, known values) were not properly contributing to the function's needed stack size. As a result, it was possible for a program to verify, but then to attempt to read out-of-bounds data at runtime because a too small stack had been allocated for it. Each function tracks the size of the stack it needs in bpf_subprog_info.stack_depth, which is maintained by update_stack_depth(). For regular memory accesses, check_mem_access() was calling update_state_depth() but it was passing in only the fixed part of the offset register, ignoring the variable offset. This was incorrect; the minimum possible value of that register should be used instead. This tracking is now fixed by centralizing the tracking of stack size in grow_stack_state(), and by lifting the calls to grow_stack_state() to check_stack_access_within_bounds() as suggested by Andrii. The code is now simpler and more convincingly tracks the correct maximum stack size. check_stack_range_initialized() can now rely on enough stack having been allocated for the access; this helps with the fix for the first issue. A few tests were changed to also check the stack depth computation. The one that fails without this patch is verifier_var_off:stack_write_priv_vs_unpriv. Fixes: 01f810ace9ed3 ("bpf: Allow variable-offset stack access") Reported-by: Hao Sun Signed-off-by: Andrei Matei Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231208032519.260451-3-andreimatei1@gmail.com Closes: https://lore.kernel.org/bpf/CABWLsev9g8UP_c3a=1qbuZUi20tGoUXoU07FPf-5FLvhOKOY+Q@mail.gmail.com/ --- kernel/bpf/verifier.c | 65 +++++++++++++++++++++------------------------------ 1 file changed, 26 insertions(+), 39 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0e77bb52542d..de1e29fa467e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1259,7 +1259,10 @@ static int resize_reference_state(struct bpf_func_state *state, size_t n) return 0; } -static int grow_stack_state(struct bpf_func_state *state, int size) +/* Possibly update state->allocated_stack to be at least size bytes. Also + * possibly update the function's high-water mark in its bpf_subprog_info. + */ +static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int size) { size_t old_n = state->allocated_stack / BPF_REG_SIZE, n = size / BPF_REG_SIZE; @@ -1271,6 +1274,11 @@ static int grow_stack_state(struct bpf_func_state *state, int size) return -ENOMEM; state->allocated_stack = size; + + /* update known max for given subprogram */ + if (env->subprog_info[state->subprogno].stack_depth < size) + env->subprog_info[state->subprogno].stack_depth = size; + return 0; } @@ -4440,9 +4448,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, struct bpf_reg_state *reg = NULL; int insn_flags = insn_stack_access_flags(state->frameno, spi); - err = grow_stack_state(state, round_up(slot + 1, BPF_REG_SIZE)); - if (err) - return err; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, * so it's aligned access and [off, off + size) are within stack limits */ @@ -4595,10 +4600,6 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, (!value_reg && is_bpf_st_mem(insn) && insn->imm == 0)) writing_zero = true; - err = grow_stack_state(state, round_up(-min_off, BPF_REG_SIZE)); - if (err) - return err; - for (i = min_off; i < max_off; i++) { int spi; @@ -5774,20 +5775,6 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, strict); } -static int update_stack_depth(struct bpf_verifier_env *env, - const struct bpf_func_state *func, - int off) -{ - u16 stack = env->subprog_info[func->subprogno].stack_depth; - - if (stack >= -off) - return 0; - - /* update known max for given subprogram */ - env->subprog_info[func->subprogno].stack_depth = -off; - return 0; -} - /* starting from main bpf function walk all instructions of the function * and recursively walk all callees that given function can call. * Ignore jump and exit insns. @@ -6577,13 +6564,14 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, * The minimum valid offset is -MAX_BPF_STACK for writes, and * -state->allocated_stack for reads. */ -static int check_stack_slot_within_bounds(s64 off, - struct bpf_func_state *state, - enum bpf_access_type t) +static int check_stack_slot_within_bounds(struct bpf_verifier_env *env, + s64 off, + struct bpf_func_state *state, + enum bpf_access_type t) { int min_valid_off; - if (t == BPF_WRITE) + if (t == BPF_WRITE || env->allow_uninit_stack) min_valid_off = -MAX_BPF_STACK; else min_valid_off = -state->allocated_stack; @@ -6632,7 +6620,7 @@ static int check_stack_access_within_bounds( max_off = reg->smax_value + off + access_size; } - err = check_stack_slot_within_bounds(min_off, state, type); + err = check_stack_slot_within_bounds(env, min_off, state, type); if (!err && max_off > 0) err = -EINVAL; /* out of stack access into non-negative offsets */ @@ -6647,8 +6635,10 @@ static int check_stack_access_within_bounds( verbose(env, "invalid variable-offset%s stack R%d var_off=%s off=%d size=%d\n", err_extra, regno, tn_buf, off, access_size); } + return err; } - return err; + + return grow_stack_state(env, state, round_up(-min_off, BPF_REG_SIZE)); } /* check whether memory at (regno + off) is accessible for t = (read | write) @@ -6663,7 +6653,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; - struct bpf_func_state *state; int size, err = 0; size = bpf_size_to_bytes(bpf_size); @@ -6806,11 +6795,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err) return err; - state = func(env, reg); - err = update_stack_depth(env, state, off); - if (err) - return err; - if (t == BPF_READ) err = check_stack_read(env, regno, off, size, value_regno); @@ -7004,7 +6988,8 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i /* When register 'regno' is used to read the stack (either directly or through * a helper function) make sure that it's within stack boundary and, depending - * on the access type, that all elements of the stack are initialized. + * on the access type and privileges, that all elements of the stack are + * initialized. * * 'off' includes 'regno->off', but not its dynamic part (if any). * @@ -7112,8 +7097,11 @@ static int check_stack_range_initialized( slot = -i - 1; spi = slot / BPF_REG_SIZE; - if (state->allocated_stack <= slot) - goto err; + if (state->allocated_stack <= slot) { + verbose(env, "verifier bug: allocated_stack too small"); + return -EFAULT; + } + stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; if (*stype == STACK_MISC) goto mark; @@ -7137,7 +7125,6 @@ static int check_stack_range_initialized( goto mark; } -err: if (tnum_is_const(reg->var_off)) { verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n", err_extra, regno, min_off, i - min_off, access_size); @@ -7162,7 +7149,7 @@ mark: * helper may write to the entire memory range. */ } - return update_stack_depth(env, state, min_off); + return 0; } static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, -- cgit From 2929bfac006d8f8e22b307d04e0d71bcb84db698 Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Thu, 7 Dec 2023 22:25:19 -0500 Subject: bpf: Minor cleanup around stack bounds Push the rounding up of stack offsets into the function responsible for growing the stack, rather than relying on all the callers to do it. Uncertainty about whether the callers did it or not tripped up people in a previous review. Signed-off-by: Andrei Matei Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20231208032519.260451-4-andreimatei1@gmail.com --- kernel/bpf/verifier.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index de1e29fa467e..fb690539d5f6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1264,7 +1264,11 @@ static int resize_reference_state(struct bpf_func_state *state, size_t n) */ static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int size) { - size_t old_n = state->allocated_stack / BPF_REG_SIZE, n = size / BPF_REG_SIZE; + size_t old_n = state->allocated_stack / BPF_REG_SIZE, n; + + /* The stack size is always a multiple of BPF_REG_SIZE. */ + size = round_up(size, BPF_REG_SIZE); + n = size / BPF_REG_SIZE; if (old_n >= n) return 0; @@ -6638,7 +6642,10 @@ static int check_stack_access_within_bounds( return err; } - return grow_stack_state(env, state, round_up(-min_off, BPF_REG_SIZE)); + /* Note that there is no stack access with offset zero, so the needed stack + * size is -min_off, not -min_off+1. + */ + return grow_stack_state(env, state, -min_off /* size */); } /* check whether memory at (regno + off) is accessible for t = (read | write) -- cgit From 73d9eb340d2b95e0e86a656a7f3157c137f10129 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 6 Dec 2023 11:53:24 +0000 Subject: bpf: Enable bpf_cgrp_storage for cgroup1 non-attach case In the current cgroup1 environment, associating operations between cgroups and applications in a BPF program requires storing a mapping of cgroup_id to application either in a hash map or maintaining it in userspace. However, by enabling bpf_cgrp_storage for cgroup1, it becomes possible to conveniently store application-specific information in cgroup-local storage and utilize it within BPF programs. Furthermore, enabling this feature for cgroup1 involves minor modifications for the non-attach case, streamlining the process. However, when it comes to enabling this functionality for the cgroup1 attach case, it presents challenges. Therefore, the decision is to focus on enabling it solely for the cgroup1 non-attach case at present. If attempting to attach to a cgroup1 fd, the operation will simply fail with the error code -EBADF. Signed-off-by: Yafang Shao Acked-by: Tejun Heo Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20231206115326.4295-2-laoar.shao@gmail.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/bpf_cgrp_storage.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index d44fe8dd9732..28efd0a3f220 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -82,7 +82,7 @@ static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key) int fd; fd = *(int *)key; - cgroup = cgroup_get_from_fd(fd); + cgroup = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgroup)) return ERR_CAST(cgroup); @@ -101,7 +101,7 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, int fd; fd = *(int *)key; - cgroup = cgroup_get_from_fd(fd); + cgroup = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgroup)) return PTR_ERR(cgroup); @@ -131,7 +131,7 @@ static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) int err, fd; fd = *(int *)key; - cgroup = cgroup_get_from_fd(fd); + cgroup = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgroup)) return PTR_ERR(cgroup); -- cgit From c26f2a8901393c9f81909da0a4324587092bd3a3 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 8 Dec 2023 18:23:49 +0800 Subject: bpf: Remove unnecessary wait from bpf_map_copy_value() Both map_lookup_elem() and generic_map_lookup_batch() use bpf_map_copy_value() to lookup and copy the value, and there is no update operation in bpf_map_copy_value(), so just remove the invocation of maybe_wait_bpf_programs() from it. Fixes: 15c14a3dca42 ("bpf: Add bpf_map_{value_size, update_value, map_copy_value} functions") Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231208102355.2628918-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index aff045eed375..9ad3f527ab37 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -264,7 +264,6 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, } bpf_enable_instrumentation(); - maybe_wait_bpf_programs(map); return err; } -- cgit From 37ba5b59d6adfa08926acd3a833608487a18c2ef Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 8 Dec 2023 18:23:50 +0800 Subject: bpf: Call maybe_wait_bpf_programs() only once for generic_map_update_batch() Just like commit 9087c6ff8dfe ("bpf: Call maybe_wait_bpf_programs() only once from generic_map_delete_batch()"), there is also no need to call maybe_wait_bpf_programs() for each update in batched update, so only call it once in generic_map_update_batch(). Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231208102355.2628918-3-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9ad3f527ab37..07e671431987 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -203,7 +203,6 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, rcu_read_unlock(); } bpf_enable_instrumentation(); - maybe_wait_bpf_programs(map); return err; } @@ -1577,6 +1576,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) } err = bpf_map_update_value(map, f.file, key, value, attr->flags); + maybe_wait_bpf_programs(map); kvfree(value); free_key: @@ -1816,6 +1816,8 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file, kvfree(value); kvfree(key); + + maybe_wait_bpf_programs(map); return err; } -- cgit From 012772581d040607ac1f981f47f6afd2336b4580 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 8 Dec 2023 18:23:51 +0800 Subject: bpf: Add missed maybe_wait_bpf_programs() for htab of maps When doing batched lookup and deletion operations on htab of maps, maybe_wait_bpf_programs() is needed to ensure all programs don't use the inner map after the bpf syscall returns. Instead of adding the wait in __htab_map_lookup_and_delete_batch(), adding the wait in bpf_map_do_batch() and also removing the calling of maybe_wait_bpf_programs() from generic_map_{delete,update}_batch(). Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231208102355.2628918-4-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 07e671431987..2e6ef361da1c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1758,7 +1758,6 @@ int generic_map_delete_batch(struct bpf_map *map, kvfree(key); - maybe_wait_bpf_programs(map); return err; } @@ -1817,7 +1816,6 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file, kvfree(value); kvfree(key); - maybe_wait_bpf_programs(map); return err; } @@ -5031,8 +5029,10 @@ static int bpf_map_do_batch(const union bpf_attr *attr, else BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr); err_put: - if (has_write) + if (has_write) { + maybe_wait_bpf_programs(map); bpf_map_write_active_dec(map); + } fdput(f); return err; } -- cgit From 67ad2c73ff29b32bd09135ec07c26e59490dbb3b Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 8 Dec 2023 18:23:52 +0800 Subject: bpf: Only call maybe_wait_bpf_programs() when map operation succeeds There is no need to call maybe_wait_bpf_programs() if update or deletion operation fails. So only call maybe_wait_bpf_programs() if update or deletion operation succeeds. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231208102355.2628918-5-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2e6ef361da1c..dd641475b65f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1576,7 +1576,8 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) } err = bpf_map_update_value(map, f.file, key, value, attr->flags); - maybe_wait_bpf_programs(map); + if (!err) + maybe_wait_bpf_programs(map); kvfree(value); free_key: @@ -1632,7 +1633,8 @@ static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr) err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); bpf_enable_instrumentation(); - maybe_wait_bpf_programs(map); + if (!err) + maybe_wait_bpf_programs(map); out: kvfree(key); err_put: -- cgit From 06e5c999f10269a532304e89a6adb2fbfeb0593c Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 8 Dec 2023 18:23:53 +0800 Subject: bpf: Set uattr->batch.count as zero before batched update or deletion generic_map_{delete,update}_batch() doesn't set uattr->batch.count as zero before it tries to allocate memory for key. If the memory allocation fails, the value of uattr->batch.count will be incorrect. Fix it by setting uattr->batch.count as zero beore batched update or deletion. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231208102355.2628918-6-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dd641475b65f..06320d9abf33 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1731,6 +1731,9 @@ int generic_map_delete_batch(struct bpf_map *map, if (!max_count) return 0; + if (put_user(0, &uattr->batch.count)) + return -EFAULT; + key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); if (!key) return -ENOMEM; @@ -1787,6 +1790,9 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file, if (!max_count) return 0; + if (put_user(0, &uattr->batch.count)) + return -EFAULT; + key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); if (!key) return -ENOMEM; -- cgit From 482d548d40b0af9af730e4869903d4433e44f014 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 8 Dec 2023 17:09:57 -0800 Subject: bpf: handle fake register spill to stack with BPF_ST_MEM instruction When verifier validates BPF_ST_MEM instruction that stores known constant to stack (e.g., *(u64 *)(r10 - 8) = 123), it effectively spills a fake register with a constant (but initially imprecise) value to a stack slot. Because read-side logic treats it as a proper register fill from stack slot, we need to mark such stack slot initialization as INSN_F_STACK_ACCESS instruction to stop precision backtracking from missing it. Fixes: 41f6f64e6999 ("bpf: support non-r10 register spill/fill to/from stack in precision tracking") Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231209010958.66758-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fb690539d5f6..727a59e4a647 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4498,7 +4498,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, __mark_reg_known(&fake_reg, insn->imm); fake_reg.type = SCALAR_VALUE; save_register_state(env, state, spi, &fake_reg, size); - insn_flags = 0; /* not a register spill */ } else if (reg && is_spillable_regtype(reg->type)) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { -- cgit From a6de18f310a511278c1ff16b96eb2d500eada725 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 7 Dec 2023 15:08:42 -0600 Subject: bpf: Add bpf_cpumask_weight() kfunc It can be useful to query how many bits are set in a cpumask. For example, if you want to perform special logic for the last remaining core that's set in a mask. Let's therefore add a new bpf_cpumask_weight() kfunc which checks how many bits are set in a mask. Signed-off-by: David Vernet Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20231207210843.168466-2-void@manifault.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumask.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index e01c741e54e7..7499b7d8c06f 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -405,6 +405,17 @@ __bpf_kfunc u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, return cpumask_any_and_distribute(src1, src2); } +/** + * bpf_cpumask_weight() - Return the number of bits in @cpumask. + * @cpumask: The cpumask being queried. + * + * Count the number of set bits in the given cpumask. + */ +__bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask) +{ + return cpumask_weight(cpumask); +} + __bpf_kfunc_end_defs(); BTF_SET8_START(cpumask_kfunc_btf_ids) @@ -432,6 +443,7 @@ BTF_ID_FLAGS(func, bpf_cpumask_full, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU) BTF_SET8_END(cpumask_kfunc_btf_ids) static const struct btf_kfunc_id_set cpumask_kfunc_set = { -- cgit From 1e68485d8299860e68c4e1d29589ff0d20db0287 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 4 Dec 2023 15:39:19 -0800 Subject: bpf: log PTR_TO_MEM memory size in verifier log Emit valid memory size addressable through PTR_TO_MEM register. Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231204233931.49758-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/log.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 55d019f30e91..61d7d23a0118 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -682,6 +682,10 @@ static void print_reg_state(struct bpf_verifier_env *env, verbose_a("r="); verbose_unum(env, reg->range); } + if (base_type(t) == PTR_TO_MEM) { + verbose_a("sz="); + verbose_unum(env, reg->mem_size); + } if (tnum_is_const(reg->var_off)) { /* a pointer register with fixed offset */ if (reg->var_off.value) { -- cgit From 22b769bb4f87060774bfdd6facbab438ed3b8453 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 4 Dec 2023 15:39:20 -0800 Subject: bpf: emit more dynptr information in verifier log Emit dynptr type for CONST_PTR_TO_DYNPTR register. Also emit id, ref_obj_id, and dynptr_id fields for STACK_DYNPTR stack slots. Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231204233931.49758-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/log.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 61d7d23a0118..594a234f122b 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -628,6 +628,12 @@ static bool type_is_map_ptr(enum bpf_reg_type t) { } } +/* + * _a stands for append, was shortened to avoid multiline statements below. + * This macro is used to output a comma separated list of attributes. + */ +#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, ##__VA_ARGS__); sep = ","; }) + static void print_reg_state(struct bpf_verifier_env *env, const struct bpf_func_state *state, const struct bpf_reg_state *reg) @@ -643,11 +649,6 @@ static void print_reg_state(struct bpf_verifier_env *env, verbose_snum(env, reg->var_off.value + reg->off); return; } -/* - * _a stands for append, was shortened to avoid multiline statements below. - * This macro is used to output a comma separated list of attributes. - */ -#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, ##__VA_ARGS__); sep = ","; }) verbose(env, "%s", reg_type_str(env, t)); if (t == PTR_TO_STACK) { @@ -686,6 +687,8 @@ static void print_reg_state(struct bpf_verifier_env *env, verbose_a("sz="); verbose_unum(env, reg->mem_size); } + if (t == CONST_PTR_TO_DYNPTR) + verbose_a("type=%s", dynptr_type_str(reg->dynptr.type)); if (tnum_is_const(reg->var_off)) { /* a pointer register with fixed offset */ if (reg->var_off.value) { @@ -702,8 +705,6 @@ static void print_reg_state(struct bpf_verifier_env *env, } } verbose(env, ")"); - -#undef verbose_a } void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_state *state, @@ -727,6 +728,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_st } for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { char types_buf[BPF_REG_SIZE + 1]; + const char *sep = ""; bool valid = false; u8 slot_type; int j; @@ -765,9 +767,14 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_st verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); print_liveness(env, reg->live); - verbose(env, "=dynptr_%s", dynptr_type_str(reg->dynptr.type)); + verbose(env, "=dynptr_%s(", dynptr_type_str(reg->dynptr.type)); + if (reg->id) + verbose_a("id=%d", reg->id); if (reg->ref_obj_id) - verbose(env, "(ref_id=%d)", reg->ref_obj_id); + verbose_a("ref_id=%d", reg->ref_obj_id); + if (reg->dynptr_id) + verbose_a("dynptr_id=%d", reg->dynptr_id); + verbose(env, ")"); break; case STACK_ITER: /* only main slot has ref_obj_id set; skip others */ -- cgit From 1a1ad782dcbbacd9e8d4e2e7ff1bf14d1db80727 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 4 Dec 2023 15:39:21 -0800 Subject: bpf: tidy up exception callback management a bit Use the fact that we are passing subprog index around and have a corresponding struct bpf_subprog_info in bpf_verifier_env for each subprogram. We don't need to separately pass around a flag whether subprog is exception callback or not, each relevant verifier function can determine this using provided subprog index if we maintain bpf_subprog_info properly. Also move out exception callback-specific logic from btf_prepare_func_args(), keeping it generic. We can enforce all these restriction right before exception callback verification pass. We add out parameter, arg_cnt, for now, but this will be unnecessary with subsequent refactoring and will be removed. Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231204233931.49758-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 11 ++--------- kernel/bpf/verifier.c | 52 ++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 41 insertions(+), 22 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 63cf4128fc05..d56433bf8aba 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6956,7 +6956,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, * (either PTR_TO_CTX or SCALAR_VALUE). */ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *regs, bool is_ex_cb) + struct bpf_reg_state *regs, u32 *arg_cnt) { struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; @@ -7013,6 +7013,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, tname, nargs, MAX_BPF_FUNC_REG_ARGS); return -EINVAL; } + *arg_cnt = nargs; /* check that function returns int, exception cb also requires this */ t = btf_type_by_id(btf, t->type); while (btf_type_is_modifier(t)) @@ -7062,14 +7063,6 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, i, btf_type_str(t), tname); return -EINVAL; } - /* We have already ensured that the callback returns an integer, just - * like all global subprogs. We need to determine it only has a single - * scalar argument. - */ - if (is_ex_cb && (nargs != 1 || regs[BPF_REG_1].type != SCALAR_VALUE)) { - bpf_log(log, "exception cb only supports single integer argument\n"); - return -EINVAL; - } return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 727a59e4a647..d1755db1b503 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -442,6 +442,25 @@ static struct bpf_func_info_aux *subprog_aux(const struct bpf_verifier_env *env, return &env->prog->aux->func_info_aux[subprog]; } +static struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env, int subprog) +{ + return &env->subprog_info[subprog]; +} + +static void mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog) +{ + struct bpf_subprog_info *info = subprog_info(env, subprog); + + info->is_cb = true; + info->is_async_cb = true; + info->is_exception_cb = true; +} + +static bool subprog_is_exc_cb(struct bpf_verifier_env *env, int subprog) +{ + return subprog_info(env, subprog)->is_exception_cb; +} + static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK); @@ -2892,6 +2911,7 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env) if (env->subprog_info[i].start != ex_cb_insn) continue; env->exception_callback_subprog = i; + mark_subprog_exc_cb(env, i); break; } } @@ -19166,9 +19186,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) env->exception_callback_subprog = env->subprog_cnt - 1; /* Don't update insn_cnt, as add_hidden_subprog always appends insns */ - env->subprog_info[env->exception_callback_subprog].is_cb = true; - env->subprog_info[env->exception_callback_subprog].is_async_cb = true; - env->subprog_info[env->exception_callback_subprog].is_exception_cb = true; + mark_subprog_exc_cb(env, env->exception_callback_subprog); } for (i = 0; i < insn_cnt; i++, insn++) { @@ -19868,7 +19886,7 @@ static void free_states(struct bpf_verifier_env *env) } } -static int do_check_common(struct bpf_verifier_env *env, int subprog, bool is_ex_cb) +static int do_check_common(struct bpf_verifier_env *env, int subprog) { bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_verifier_state *state; @@ -19899,9 +19917,23 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog, bool is_ex regs = state->frame[state->curframe]->regs; if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) { - ret = btf_prepare_func_args(env, subprog, regs, is_ex_cb); + u32 nargs; + + ret = btf_prepare_func_args(env, subprog, regs, &nargs); if (ret) goto out; + if (subprog_is_exc_cb(env, subprog)) { + state->frame[0]->in_exception_callback_fn = true; + /* We have already ensured that the callback returns an integer, just + * like all global subprogs. We need to determine it only has a single + * scalar argument. + */ + if (nargs != 1 || regs[BPF_REG_1].type != SCALAR_VALUE) { + verbose(env, "exception cb only supports single integer argument\n"); + ret = -EINVAL; + goto out; + } + } for (i = BPF_REG_1; i <= BPF_REG_5; i++) { if (regs[i].type == PTR_TO_CTX) mark_reg_known_zero(env, regs, i); @@ -19915,12 +19947,6 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog, bool is_ex regs[i].id = ++env->id_gen; } } - if (is_ex_cb) { - state->frame[0]->in_exception_callback_fn = true; - env->subprog_info[subprog].is_cb = true; - env->subprog_info[subprog].is_async_cb = true; - env->subprog_info[subprog].is_exception_cb = true; - } } else { /* 1st arg to a function */ regs[BPF_REG_1].type = PTR_TO_CTX; @@ -20000,7 +20026,7 @@ again: env->insn_idx = env->subprog_info[i].start; WARN_ON_ONCE(env->insn_idx == 0); - ret = do_check_common(env, i, env->exception_callback_subprog == i); + ret = do_check_common(env, i); if (ret) { return ret; } else if (env->log.level & BPF_LOG_LEVEL) { @@ -20030,7 +20056,7 @@ static int do_check_main(struct bpf_verifier_env *env) int ret; env->insn_idx = 0; - ret = do_check_common(env, 0, false); + ret = do_check_common(env, 0); if (!ret) env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; return ret; -- cgit From 56c26d5ad86dfe48a76855a91b523ab4f372c003 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Tue, 12 Dec 2023 08:54:36 +0800 Subject: bpf: Remove unused backtrack_state helper functions The function are defined in the verifier.c file, but not called elsewhere, so delete the unused function. kernel/bpf/verifier.c:3448:20: warning: unused function 'bt_set_slot' kernel/bpf/verifier.c:3453:20: warning: unused function 'bt_clear_slot' kernel/bpf/verifier.c:3488:20: warning: unused function 'bt_is_slot_set' Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20231212005436.103829-1-yang.lee@linux.alibaba.com Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=7714 --- kernel/bpf/verifier.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d1755db1b503..ef27820a24e3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3465,16 +3465,6 @@ static inline void bt_clear_frame_slot(struct backtrack_state *bt, u32 frame, u3 bt->stack_masks[frame] &= ~(1ull << slot); } -static inline void bt_set_slot(struct backtrack_state *bt, u32 slot) -{ - bt_set_frame_slot(bt, bt->frame, slot); -} - -static inline void bt_clear_slot(struct backtrack_state *bt, u32 slot) -{ - bt_clear_frame_slot(bt, bt->frame, slot); -} - static inline u32 bt_frame_reg_mask(struct backtrack_state *bt, u32 frame) { return bt->reg_masks[frame]; @@ -3505,11 +3495,6 @@ static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u return bt->stack_masks[frame] & (1ull << slot); } -static inline bool bt_is_slot_set(struct backtrack_state *bt, u32 slot) -{ - return bt_is_frame_slot_set(bt, bt->frame, slot); -} - /* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */ static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask) { -- cgit From 745e0311306507ddbe1727ac798c8f956812b810 Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Sun, 10 Dec 2023 17:51:50 -0500 Subject: bpf: Comment on check_mem_size_reg This patch adds a comment to check_mem_size_reg -- a function whose meaning is not very transparent. The function implicitly deals with two registers connected by convention, which is not obvious. Signed-off-by: Andrei Matei Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231210225149.67639-1-andreimatei1@gmail.com --- kernel/bpf/verifier.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ef27820a24e3..1863826a4ac3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7256,6 +7256,12 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } } +/* verify arguments to helpers or kfuncs consisting of a pointer and an access + * size. + * + * @regno is the register containing the access size. regno-1 is the register + * containing the pointer. + */ static int check_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, bool zero_size_allowed, -- cgit From 750e785796bb72423b97cac21ecd0fa3b3b65610 Mon Sep 17 00:00:00 2001 From: Jie Jiang Date: Tue, 12 Dec 2023 09:39:23 +0000 Subject: bpf: Support uid and gid when mounting bpffs Parse uid and gid in bpf_parse_param() so that they can be passed in as the `data` parameter when mount() bpffs. This will be useful when we want to control which user/group has the control to the mounted bpffs, otherwise a separate chown() call will be needed. Signed-off-by: Jie Jiang Signed-off-by: Andrii Nakryiko Acked-by: Mike Frysinger Acked-by: Christian Brauner Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231212093923.497838-1-jiejiang@chromium.org --- kernel/bpf/inode.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5359a0929c35..0a8e1188ea46 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -601,9 +601,16 @@ EXPORT_SYMBOL(bpf_prog_get_type_path); static int bpf_show_options(struct seq_file *m, struct dentry *root) { struct bpf_mount_opts *opts = root->d_sb->s_fs_info; - umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX; + struct inode *inode = d_inode(root); + umode_t mode = inode->i_mode & S_IALLUGO & ~S_ISVTX; u64 mask; + if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID)) + seq_printf(m, ",uid=%u", + from_kuid_munged(&init_user_ns, inode->i_uid)); + if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID)) + seq_printf(m, ",gid=%u", + from_kgid_munged(&init_user_ns, inode->i_gid)); if (mode != S_IRWXUGO) seq_printf(m, ",mode=%o", mode); @@ -652,6 +659,8 @@ const struct super_operations bpf_super_ops = { }; enum { + OPT_UID, + OPT_GID, OPT_MODE, OPT_DELEGATE_CMDS, OPT_DELEGATE_MAPS, @@ -660,6 +669,8 @@ enum { }; static const struct fs_parameter_spec bpf_fs_parameters[] = { + fsparam_u32 ("uid", OPT_UID), + fsparam_u32 ("gid", OPT_GID), fsparam_u32oct ("mode", OPT_MODE), fsparam_string ("delegate_cmds", OPT_DELEGATE_CMDS), fsparam_string ("delegate_maps", OPT_DELEGATE_MAPS), @@ -672,6 +683,8 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct bpf_mount_opts *opts = fc->s_fs_info; struct fs_parse_result result; + kuid_t uid; + kgid_t gid; int opt, err; u64 msk; @@ -694,6 +707,34 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) } switch (opt) { + case OPT_UID: + uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(uid)) + goto bad_value; + + /* + * The requested uid must be representable in the + * filesystem's idmapping. + */ + if (!kuid_has_mapping(fc->user_ns, uid)) + goto bad_value; + + opts->uid = uid; + break; + case OPT_GID: + gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(gid)) + goto bad_value; + + /* + * The requested gid must be representable in the + * filesystem's idmapping. + */ + if (!kgid_has_mapping(fc->user_ns, gid)) + goto bad_value; + + opts->gid = gid; + break; case OPT_MODE: opts->mode = result.uint_32 & S_IALLUGO; break; @@ -722,6 +763,9 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) } return 0; + +bad_value: + return invalfc(fc, "Bad value for '%s'", param->key); } struct bpf_preload_ops *bpf_preload_ops; @@ -808,6 +852,8 @@ static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_op = &bpf_super_ops; inode = sb->s_root->d_inode; + inode->i_uid = opts->uid; + inode->i_gid = opts->gid; inode->i_op = &bpf_dir_iops; inode->i_mode &= ~S_IALLUGO; populate_bpffs(sb->s_root); @@ -843,6 +889,8 @@ static int bpf_init_fs_context(struct fs_context *fc) return -ENOMEM; opts->mode = S_IRWXUGO; + opts->uid = current_fsuid(); + opts->gid = current_fsgid(); /* start out with no BPF token delegation enabled */ opts->delegate_cmds = 0; -- cgit From f5fdb51fb980077a4c6c78f3f775821f611fb38b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 13 Dec 2023 11:08:33 -0800 Subject: bpf: fail BPF_TOKEN_CREATE if no delegation option was set on BPF FS It's quite confusing in practice when it's possible to successfully create a BPF token from BPF FS that didn't have any of delegate_xxx mount options set up. While it's not wrong, it's actually more meaningful to reject BPF_TOKEN_CREATE with specific error code (-ENOENT) to let user-space know that no token delegation is setup up. So, instead of creating empty BPF token that will be always ignored because it doesn't have any of the allow_xxx bits set, reject it with -ENOENT. If we ever need empty BPF token to be possible, we can support that with extra flag passed into BPF_TOKEN_CREATE. Acked-by: Christian Brauner Acked-by: John Fastabend Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231213190842.3844987-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/token.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index 17212efcde60..a86fccd57e2d 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -152,6 +152,15 @@ int bpf_token_create(union bpf_attr *attr) goto out_path; } + mnt_opts = path.dentry->d_sb->s_fs_info; + if (mnt_opts->delegate_cmds == 0 && + mnt_opts->delegate_maps == 0 && + mnt_opts->delegate_progs == 0 && + mnt_opts->delegate_attachs == 0) { + err = -ENOENT; /* no BPF token delegation is set up */ + goto out_path; + } + mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); inode = bpf_get_inode(path.mnt->mnt_sb, NULL, mode); if (IS_ERR(inode)) { @@ -181,7 +190,6 @@ int bpf_token_create(union bpf_attr *attr) /* remember bpffs owning userns for future ns_capable() checks */ token->userns = get_user_ns(userns); - mnt_opts = path.dentry->d_sb->s_fs_info; token->allowed_cmds = mnt_opts->delegate_cmds; token->allowed_maps = mnt_opts->delegate_maps; token->allowed_progs = mnt_opts->delegate_progs; -- cgit From b13cddf633562b9b2c34fd63471d377019704ebe Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Fri, 8 Dec 2023 15:32:48 +0000 Subject: bpf: add small subset of SECURITY_PATH hooks to BPF sleepable_lsm_hooks list security_path_* based LSM hooks appear to be generally missing from the sleepable_lsm_hooks list. Initially add a small subset of them to the preexisting sleepable_lsm_hooks list so that sleepable BPF helpers like bpf_d_path() can be used from sleepable BPF LSM based programs. The security_path_* hooks added in this patch are similar to the security_inode_* counterparts that already exist in the sleepable_lsm_hooks list, and are called in roughly similar points and contexts. Presumably, making them OK to be also annotated as sleepable. Building a kernel with DEBUG_ATOMIC_SLEEP options enabled and running reasonable workloads stimulating activity that would be intercepted by such security hooks didn't show any splats. Notably, I haven't added all the security_path_* LSM hooks that are available as I don't need them at this point in time. Signed-off-by: Matt Bobrowski Acked-by: KP Singh Link: https://lore.kernel.org/r/ZXM3IHHXpNY9y82a@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_lsm.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 7d2f96413a57..63b4dc495125 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -304,6 +304,18 @@ BTF_ID(func, bpf_lsm_kernel_module_request) BTF_ID(func, bpf_lsm_kernel_read_file) BTF_ID(func, bpf_lsm_kernfs_init_security) +#ifdef CONFIG_SECURITY_PATH +BTF_ID(func, bpf_lsm_path_unlink) +BTF_ID(func, bpf_lsm_path_mkdir) +BTF_ID(func, bpf_lsm_path_rmdir) +BTF_ID(func, bpf_lsm_path_truncate) +BTF_ID(func, bpf_lsm_path_symlink) +BTF_ID(func, bpf_lsm_path_link) +BTF_ID(func, bpf_lsm_path_rename) +BTF_ID(func, bpf_lsm_path_chmod) +BTF_ID(func, bpf_lsm_path_chown) +#endif /* CONFIG_SECURITY_PATH */ + #ifdef CONFIG_KEYS BTF_ID(func, bpf_lsm_key_free) #endif /* CONFIG_KEYS */ -- cgit From 2a0c6b41eec90c2a138ea8b574836744783c67ff Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 11 Dec 2023 16:34:47 +0800 Subject: bpf: Update the comments in maybe_wait_bpf_programs() Since commit 638e4b825d52 ("bpf: Allows per-cpu maps and map-in-map in sleepable programs"), sleepable BPF program can also use map-in-map, but maybe_wait_bpf_programs() doesn't handle it accordingly. The main reason is that using synchronize_rcu_tasks_trace() to wait for the completions of these sleepable BPF programs may incur a very long delay and userspace may think it is hung, so the wait for sleepable BPF programs is skipped. Update the comments in maybe_wait_bpf_programs() to reflect the reason. Signed-off-by: Hou Tao Acked-by: Yonghong Song Acked-by: John Fastabend Link: https://lore.kernel.org/r/20231211083447.1921178-1-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 06320d9abf33..d63c1ed42412 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -142,9 +142,13 @@ static u32 bpf_map_value_size(const struct bpf_map *map) static void maybe_wait_bpf_programs(struct bpf_map *map) { - /* Wait for any running BPF programs to complete so that - * userspace, when we return to it, knows that all programs - * that could be running use the new map value. + /* Wait for any running non-sleepable BPF programs to complete so that + * userspace, when we return to it, knows that all non-sleepable + * programs that could be running use the new map value. For sleepable + * BPF programs, synchronize_rcu_tasks_trace() should be used to wait + * for the completions of these programs, but considering the waiting + * time can be very long and userspace may think it will hang forever, + * so don't handle sleepable BPF programs now. */ if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) -- cgit From 8f82583f9527b3be9d70d9a5d1f33435e29d0480 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Thu, 14 Dec 2023 12:30:09 +0800 Subject: bpf: Reduce the scope of rcu_read_lock when updating fd map There is no rcu-read-lock requirement for ops->map_fd_get_ptr() or ops->map_fd_put_ptr(), so doesn't use rcu-read-lock for these two callbacks. For bpf_fd_array_map_update_elem(), accessing array->ptrs doesn't need rcu-read-lock because array->ptrs must still be allocated. For bpf_fd_htab_map_update_elem(), htab_map_update_elem() only requires rcu-read-lock to be held to avoid the WARN_ON_ONCE(), so only use rcu_read_lock() during the invocation of htab_map_update_elem(). Acked-by: Yonghong Song Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231214043010.3458072-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 6 ++++++ kernel/bpf/syscall.c | 4 ---- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 5b9146fa825f..ec3bdcc6a3cf 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -2523,7 +2523,13 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, if (IS_ERR(ptr)) return PTR_ERR(ptr); + /* The htab bucket lock is always held during update operations in fd + * htab map, and the following rcu_read_lock() is only used to avoid + * the WARN_ON_ONCE in htab_map_update_elem(). + */ + rcu_read_lock(); ret = htab_map_update_elem(map, key, &ptr, map_flags); + rcu_read_unlock(); if (ret) map->ops->map_fd_put_ptr(map, ptr, false); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d63c1ed42412..3fcf7741146a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -184,15 +184,11 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, err = bpf_percpu_cgroup_storage_update(map, key, value, flags); } else if (IS_FD_ARRAY(map)) { - rcu_read_lock(); err = bpf_fd_array_map_update_elem(map, map_file, key, value, flags); - rcu_read_unlock(); } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { - rcu_read_lock(); err = bpf_fd_htab_map_update_elem(map, map_file, key, value, flags); - rcu_read_unlock(); } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { /* rcu_read_lock() is not needed */ err = bpf_fd_reuseport_array_update_elem(map, key, value, -- cgit From dc68540913ac523b46ebda3843cec179362c7a72 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Thu, 14 Dec 2023 12:30:10 +0800 Subject: bpf: Use GFP_KERNEL in bpf_event_entry_gen() rcu_read_lock() is no longer held when invoking bpf_event_entry_gen() which is called by perf_event_fd_array_get_ptr(), so using GFP_KERNEL instead of GFP_ATOMIC to reduce the possibility of failures due to out-of-memory. Acked-by: Yonghong Song Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231214043010.3458072-3-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 8d365bda9a8b..b5ec24b3563e 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -1195,7 +1195,7 @@ static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, { struct bpf_event_entry *ee; - ee = kzalloc(sizeof(*ee), GFP_ATOMIC); + ee = kzalloc(sizeof(*ee), GFP_KERNEL); if (ee) { ee->event = perf_file->private_data; ee->perf_file = perf_file; -- cgit From 59e5791f59dd83e8aa72a4e74217eabb6e8cfd90 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 14 Dec 2023 12:38:15 -0800 Subject: bpf: Fix a race condition between btf_put() and map_free() When running `./test_progs -j` in my local vm with latest kernel, I once hit a kasan error like below: [ 1887.184724] BUG: KASAN: slab-use-after-free in bpf_rb_root_free+0x1f8/0x2b0 [ 1887.185599] Read of size 4 at addr ffff888106806910 by task kworker/u12:2/2830 [ 1887.186498] [ 1887.186712] CPU: 3 PID: 2830 Comm: kworker/u12:2 Tainted: G OEL 6.7.0-rc3-00699-g90679706d486-dirty #494 [ 1887.188034] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 1887.189618] Workqueue: events_unbound bpf_map_free_deferred [ 1887.190341] Call Trace: [ 1887.190666] [ 1887.190949] dump_stack_lvl+0xac/0xe0 [ 1887.191423] ? nf_tcp_handle_invalid+0x1b0/0x1b0 [ 1887.192019] ? panic+0x3c0/0x3c0 [ 1887.192449] print_report+0x14f/0x720 [ 1887.192930] ? preempt_count_sub+0x1c/0xd0 [ 1887.193459] ? __virt_addr_valid+0xac/0x120 [ 1887.194004] ? bpf_rb_root_free+0x1f8/0x2b0 [ 1887.194572] kasan_report+0xc3/0x100 [ 1887.195085] ? bpf_rb_root_free+0x1f8/0x2b0 [ 1887.195668] bpf_rb_root_free+0x1f8/0x2b0 [ 1887.196183] ? __bpf_obj_drop_impl+0xb0/0xb0 [ 1887.196736] ? preempt_count_sub+0x1c/0xd0 [ 1887.197270] ? preempt_count_sub+0x1c/0xd0 [ 1887.197802] ? _raw_spin_unlock+0x1f/0x40 [ 1887.198319] bpf_obj_free_fields+0x1d4/0x260 [ 1887.198883] array_map_free+0x1a3/0x260 [ 1887.199380] bpf_map_free_deferred+0x7b/0xe0 [ 1887.199943] process_scheduled_works+0x3a2/0x6c0 [ 1887.200549] worker_thread+0x633/0x890 [ 1887.201047] ? __kthread_parkme+0xd7/0xf0 [ 1887.201574] ? kthread+0x102/0x1d0 [ 1887.202020] kthread+0x1ab/0x1d0 [ 1887.202447] ? pr_cont_work+0x270/0x270 [ 1887.202954] ? kthread_blkcg+0x50/0x50 [ 1887.203444] ret_from_fork+0x34/0x50 [ 1887.203914] ? kthread_blkcg+0x50/0x50 [ 1887.204397] ret_from_fork_asm+0x11/0x20 [ 1887.204913] [ 1887.204913] [ 1887.205209] [ 1887.205416] Allocated by task 2197: [ 1887.205881] kasan_set_track+0x3f/0x60 [ 1887.206366] __kasan_kmalloc+0x6e/0x80 [ 1887.206856] __kmalloc+0xac/0x1a0 [ 1887.207293] btf_parse_fields+0xa15/0x1480 [ 1887.207836] btf_parse_struct_metas+0x566/0x670 [ 1887.208387] btf_new_fd+0x294/0x4d0 [ 1887.208851] __sys_bpf+0x4ba/0x600 [ 1887.209292] __x64_sys_bpf+0x41/0x50 [ 1887.209762] do_syscall_64+0x4c/0xf0 [ 1887.210222] entry_SYSCALL_64_after_hwframe+0x63/0x6b [ 1887.210868] [ 1887.211074] Freed by task 36: [ 1887.211460] kasan_set_track+0x3f/0x60 [ 1887.211951] kasan_save_free_info+0x28/0x40 [ 1887.212485] ____kasan_slab_free+0x101/0x180 [ 1887.213027] __kmem_cache_free+0xe4/0x210 [ 1887.213514] btf_free+0x5b/0x130 [ 1887.213918] rcu_core+0x638/0xcc0 [ 1887.214347] __do_softirq+0x114/0x37e The error happens at bpf_rb_root_free+0x1f8/0x2b0: 00000000000034c0 : ; { 34c0: f3 0f 1e fa endbr64 34c4: e8 00 00 00 00 callq 0x34c9 34c9: 55 pushq %rbp 34ca: 48 89 e5 movq %rsp, %rbp ... ; if (rec && rec->refcount_off >= 0 && 36aa: 4d 85 ed testq %r13, %r13 36ad: 74 a9 je 0x3658 36af: 49 8d 7d 10 leaq 0x10(%r13), %rdi 36b3: e8 00 00 00 00 callq 0x36b8 <==== kasan function 36b8: 45 8b 7d 10 movl 0x10(%r13), %r15d <==== use-after-free load 36bc: 45 85 ff testl %r15d, %r15d 36bf: 78 8c js 0x364d So the problem is at rec->refcount_off in the above. I did some source code analysis and find the reason. CPU A CPU B bpf_map_put: ... btf_put with rcu callback ... bpf_map_free_deferred with system_unbound_wq ... ... ... ... btf_free_rcu: ... ... ... bpf_map_free_deferred: ... ... ... ---------> btf_struct_metas_free() ... | race condition ... ... ---------> map->ops->map_free() ... ... btf->struct_meta_tab = NULL In the above, map_free() corresponds to array_map_free() and eventually calling bpf_rb_root_free() which calls: ... __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); ... Here, 'value_rec' is assigned in btf_check_and_fixup_fields() with following code: meta = btf_find_struct_meta(btf, btf_id); if (!meta) return -EFAULT; rec->fields[i].graph_root.value_rec = meta->record; So basically, 'value_rec' is a pointer to the record in struct_metas_tab. And it is possible that that particular record has been freed by btf_struct_metas_free() and hence we have a kasan error here. Actually it is very hard to reproduce the failure with current bpf/bpf-next code, I only got the above error once. To increase reproducibility, I added a delay in bpf_map_free_deferred() to delay map->ops->map_free(), which significantly increased reproducibility. diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5e43ddd1b83f..aae5b5213e93 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -695,6 +695,7 @@ static void bpf_map_free_deferred(struct work_struct *work) struct bpf_map *map = container_of(work, struct bpf_map, work); struct btf_record *rec = map->record; + mdelay(100); security_bpf_map_free(map); bpf_map_release_memcg(map); /* implementation dependent freeing */ Hao also provided test cases ([1]) for easily reproducing the above issue. There are two ways to fix the issue, the v1 of the patch ([2]) moving btf_put() after map_free callback, and the v5 of the patch ([3]) using a kptr style fix which tries to get a btf reference during map_check_btf(). Each approach has its pro and cons. The first approach delays freeing btf while the second approach needs to acquire reference depending on context which makes logic not very elegant and may complicate things with future new data structures. Alexei suggested in [4] going back to v1 which is what this patch tries to do. Rerun './test_progs -j' with the above mdelay() hack for a couple of times and didn't observe the error for the above rb_root test cases. Running Hou's test ([1]) is also successful. [1] https://lore.kernel.org/bpf/20231207141500.917136-1-houtao@huaweicloud.com/ [2] v1: https://lore.kernel.org/bpf/20231204173946.3066377-1-yonghong.song@linux.dev/ [3] v5: https://lore.kernel.org/bpf/20231208041621.2968241-1-yonghong.song@linux.dev/ [4] v4: https://lore.kernel.org/bpf/CAADnVQJ3FiXUhZJwX_81sjZvSYYKCFB3BT6P8D59RS2Gu+0Z7g@mail.gmail.com/ Cc: Hou Tao Fixes: 958cf2e273f0 ("bpf: Introduce bpf_obj_new") Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20231214203815.1469107-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3fcf7741146a..8faa1a20edf8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -692,6 +692,7 @@ static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); struct btf_record *rec = map->record; + struct btf *btf = map->btf; security_bpf_map_free(map); bpf_map_release_memcg(map); @@ -707,6 +708,10 @@ static void bpf_map_free_deferred(struct work_struct *work) * template bpf_map struct used during verification. */ btf_record_free(rec); + /* Delay freeing of btf for maps, as map_free callback may need + * struct_meta info which will be freed with btf_put(). + */ + btf_put(btf); } static void bpf_map_put_uref(struct bpf_map *map) @@ -747,7 +752,6 @@ void bpf_map_put(struct bpf_map *map) if (atomic64_dec_and_test(&map->refcnt)) { /* bpf_map_free_id() must be called first */ bpf_map_free_id(map); - btf_put(map->btf); WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt)); if (READ_ONCE(map->free_after_mult_rcu_gp)) -- cgit From c5707b2146d229691e193d5158ea70b21b8ba180 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Dec 2023 14:50:15 -0800 Subject: bpf: support symbolic BPF FS delegation mount options Besides already supported special "any" value and hex bit mask, support string-based parsing of delegation masks based on exact enumerator names. Utilize BTF information of `enum bpf_cmd`, `enum bpf_map_type`, `enum bpf_prog_type`, and `enum bpf_attach_type` types to find supported symbolic names (ignoring __MAX_xxx guard values and stripping repetitive prefixes like BPF_ for cmd and attach types, BPF_MAP_TYPE_ for maps, and BPF_PROG_TYPE_ for prog types). The case doesn't matter, but it is normalized to lower case in mount option output. So "PROG_LOAD", "prog_load", and "MAP_create" are all valid values to specify for delegate_cmds options, "array" is among supported for map types, etc. Besides supporting string values, we also support multiple values specified at the same time, using colon (':') separator. There are corresponding changes on bpf_show_options side to use known values to print them in human-readable format, falling back to hex mask printing, if there are any unrecognized bits. This shouldn't be necessary when enum BTF information is present, but in general we should always be able to fall back to this even if kernel was built without BTF. As mentioned, emitted symbolic names are normalized to be all lower case. Example below shows various ways to specify delegate_cmds options through mount command and how mount options are printed back: 12/14 14:39:07.604 vmuser@archvm:~/local/linux/tools/testing/selftests/bpf $ mount | rg token $ sudo mkdir -p /sys/fs/bpf/token $ sudo mount -t bpf bpffs /sys/fs/bpf/token \ -o delegate_cmds=prog_load:MAP_CREATE \ -o delegate_progs=kprobe \ -o delegate_attachs=xdp $ mount | grep token bpffs on /sys/fs/bpf/token type bpf (rw,relatime,delegate_cmds=map_create:prog_load,delegate_progs=kprobe,delegate_attachs=xdp) Acked-by: John Fastabend Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231214225016.1209867-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/inode.c | 249 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 211 insertions(+), 38 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 0a8e1188ea46..4383b3d13a55 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -595,6 +595,136 @@ struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type typ } EXPORT_SYMBOL(bpf_prog_get_type_path); +struct bpffs_btf_enums { + const struct btf *btf; + const struct btf_type *cmd_t; + const struct btf_type *map_t; + const struct btf_type *prog_t; + const struct btf_type *attach_t; +}; + +static int find_bpffs_btf_enums(struct bpffs_btf_enums *info) +{ + const struct btf *btf; + const struct btf_type *t; + const char *name; + int i, n; + + memset(info, 0, sizeof(*info)); + + btf = bpf_get_btf_vmlinux(); + if (IS_ERR(btf)) + return PTR_ERR(btf); + if (!btf) + return -ENOENT; + + info->btf = btf; + + for (i = 1, n = btf_nr_types(btf); i < n; i++) { + t = btf_type_by_id(btf, i); + if (!btf_type_is_enum(t)) + continue; + + name = btf_name_by_offset(btf, t->name_off); + if (!name) + continue; + + if (strcmp(name, "bpf_cmd") == 0) + info->cmd_t = t; + else if (strcmp(name, "bpf_map_type") == 0) + info->map_t = t; + else if (strcmp(name, "bpf_prog_type") == 0) + info->prog_t = t; + else if (strcmp(name, "bpf_attach_type") == 0) + info->attach_t = t; + else + continue; + + if (info->cmd_t && info->map_t && info->prog_t && info->attach_t) + return 0; + } + + return -ESRCH; +} + +static bool find_btf_enum_const(const struct btf *btf, const struct btf_type *enum_t, + const char *prefix, const char *str, int *value) +{ + const struct btf_enum *e; + const char *name; + int i, n, pfx_len = strlen(prefix); + + *value = 0; + + if (!btf || !enum_t) + return false; + + for (i = 0, n = btf_vlen(enum_t); i < n; i++) { + e = &btf_enum(enum_t)[i]; + + name = btf_name_by_offset(btf, e->name_off); + if (!name || strncasecmp(name, prefix, pfx_len) != 0) + continue; + + /* match symbolic name case insensitive and ignoring prefix */ + if (strcasecmp(name + pfx_len, str) == 0) { + *value = e->val; + return true; + } + } + + return false; +} + +static void seq_print_delegate_opts(struct seq_file *m, + const char *opt_name, + const struct btf *btf, + const struct btf_type *enum_t, + const char *prefix, + u64 delegate_msk, u64 any_msk) +{ + const struct btf_enum *e; + bool first = true; + const char *name; + u64 msk; + int i, n, pfx_len = strlen(prefix); + + delegate_msk &= any_msk; /* clear unknown bits */ + + if (delegate_msk == 0) + return; + + seq_printf(m, ",%s", opt_name); + if (delegate_msk == any_msk) { + seq_printf(m, "=any"); + return; + } + + if (btf && enum_t) { + for (i = 0, n = btf_vlen(enum_t); i < n; i++) { + e = &btf_enum(enum_t)[i]; + name = btf_name_by_offset(btf, e->name_off); + if (!name || strncasecmp(name, prefix, pfx_len) != 0) + continue; + msk = 1ULL << e->val; + if (delegate_msk & msk) { + /* emit lower-case name without prefix */ + seq_printf(m, "%c", first ? '=' : ':'); + name += pfx_len; + while (*name) { + seq_printf(m, "%c", tolower(*name)); + name++; + } + + delegate_msk &= ~msk; + first = false; + } + } + } + if (delegate_msk) + seq_printf(m, "%c0x%llx", first ? '=' : ':', delegate_msk); +} + /* * Display the mount options in /proc/mounts. */ @@ -614,29 +744,34 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) if (mode != S_IRWXUGO) seq_printf(m, ",mode=%o", mode); - mask = (1ULL << __MAX_BPF_CMD) - 1; - if ((opts->delegate_cmds & mask) == mask) - seq_printf(m, ",delegate_cmds=any"); - else if (opts->delegate_cmds) - seq_printf(m, ",delegate_cmds=0x%llx", opts->delegate_cmds); - - mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1; - if ((opts->delegate_maps & mask) == mask) - seq_printf(m, ",delegate_maps=any"); - else if (opts->delegate_maps) - seq_printf(m, ",delegate_maps=0x%llx", opts->delegate_maps); - - mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1; - if ((opts->delegate_progs & mask) == mask) - seq_printf(m, ",delegate_progs=any"); - else if (opts->delegate_progs) - seq_printf(m, ",delegate_progs=0x%llx", opts->delegate_progs); - - mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1; - if ((opts->delegate_attachs & mask) == mask) - seq_printf(m, ",delegate_attachs=any"); - else if (opts->delegate_attachs) - seq_printf(m, ",delegate_attachs=0x%llx", opts->delegate_attachs); + if (opts->delegate_cmds || opts->delegate_maps || + opts->delegate_progs || opts->delegate_attachs) { + struct bpffs_btf_enums info; + + /* ignore errors, fallback to hex */ + (void)find_bpffs_btf_enums(&info); + + mask = (1ULL << __MAX_BPF_CMD) - 1; + seq_print_delegate_opts(m, "delegate_cmds", + info.btf, info.cmd_t, "BPF_", + opts->delegate_cmds, mask); + + mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1; + seq_print_delegate_opts(m, "delegate_maps", + info.btf, info.map_t, "BPF_MAP_TYPE_", + opts->delegate_maps, mask); + + mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1; + seq_print_delegate_opts(m, "delegate_progs", + info.btf, info.prog_t, "BPF_PROG_TYPE_", + opts->delegate_progs, mask); + + mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1; + seq_print_delegate_opts(m, "delegate_attachs", + info.btf, info.attach_t, "BPF_", + opts->delegate_attachs, mask); + } + return 0; } @@ -686,7 +821,6 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) kuid_t uid; kgid_t gid; int opt, err; - u64 msk; opt = fs_parse(fc, bpf_fs_parameters, param, &result); if (opt < 0) { @@ -741,24 +875,63 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) case OPT_DELEGATE_CMDS: case OPT_DELEGATE_MAPS: case OPT_DELEGATE_PROGS: - case OPT_DELEGATE_ATTACHS: - if (strcmp(param->string, "any") == 0) { - msk = ~0ULL; - } else { - err = kstrtou64(param->string, 0, &msk); - if (err) - return err; + case OPT_DELEGATE_ATTACHS: { + struct bpffs_btf_enums info; + const struct btf_type *enum_t; + const char *enum_pfx; + u64 *delegate_msk, msk = 0; + char *p; + int val; + + /* ignore errors, fallback to hex */ + (void)find_bpffs_btf_enums(&info); + + switch (opt) { + case OPT_DELEGATE_CMDS: + delegate_msk = &opts->delegate_cmds; + enum_t = info.cmd_t; + enum_pfx = "BPF_"; + break; + case OPT_DELEGATE_MAPS: + delegate_msk = &opts->delegate_maps; + enum_t = info.map_t; + enum_pfx = "BPF_MAP_TYPE_"; + break; + case OPT_DELEGATE_PROGS: + delegate_msk = &opts->delegate_progs; + enum_t = info.prog_t; + enum_pfx = "BPF_PROG_TYPE_"; + break; + case OPT_DELEGATE_ATTACHS: + delegate_msk = &opts->delegate_attachs; + enum_t = info.attach_t; + enum_pfx = "BPF_"; + break; + default: + return -EINVAL; } + + while ((p = strsep(¶m->string, ":"))) { + if (strcmp(p, "any") == 0) { + msk |= ~0ULL; + } else if (find_btf_enum_const(info.btf, enum_t, enum_pfx, p, &val)) { + msk |= 1ULL << val; + } else { + err = kstrtou64(p, 0, &msk); + if (err) + return err; + } + } + /* Setting delegation mount options requires privileges */ if (msk && !capable(CAP_SYS_ADMIN)) return -EPERM; - switch (opt) { - case OPT_DELEGATE_CMDS: opts->delegate_cmds |= msk; break; - case OPT_DELEGATE_MAPS: opts->delegate_maps |= msk; break; - case OPT_DELEGATE_PROGS: opts->delegate_progs |= msk; break; - case OPT_DELEGATE_ATTACHS: opts->delegate_attachs |= msk; break; - default: return -EINVAL; - } + + *delegate_msk |= msk; + break; + } + default: + /* ignore unknown mount options */ break; } -- cgit From 7489723c2e26504573dbb49b66bbc59092840008 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Thu, 14 Dec 2023 15:56:25 -0700 Subject: bpf: xdp: Register generic_kfunc_set with XDP programs Registering generic_kfunc_set with XDP programs enables some of the newer BPF features inside XDP -- namely tree based data structures and BPF exceptions. The current motivation for this commit is to enable assertions inside XDP bpf progs. Assertions are a standard and useful tool to encode intent. Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/d07d4614b81ca6aada44fcb89bb6b618fb66e4ca.1702594357.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b3be5742d6f1..b0b485126a76 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2630,6 +2630,7 @@ static int __init kfunc_init(void) ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set); ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors, ARRAY_SIZE(generic_dtors), -- cgit From 4f9087f16651aca4a5f32da840a53f6660f0579a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Dec 2023 10:12:18 +0100 Subject: x86/cfi,bpf: Fix BPF JIT call The current BPF call convention is __nocfi, except when it calls !JIT things, then it calls regular C functions. It so happens that with FineIBT the __nocfi and C calling conventions are incompatible. Specifically __nocfi will call at func+0, while FineIBT will have endbr-poison there, which is not a valid indirect target. Causing #CP. Notably this only triggers on IBT enabled hardware, which is probably why this hasn't been reported (also, most people will have JIT on anyway). Implement proper CFI prologues for the BPF JIT codegen and drop __nocfi for x86. Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20231215092707.345270396@infradead.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c34513d645c4..5aa6863ac33b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -121,6 +121,9 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag #endif INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); +#ifdef CONFIG_FINEIBT + INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode); +#endif mutex_init(&fp->aux->used_maps_mutex); mutex_init(&fp->aux->dst_mutex); @@ -683,6 +686,23 @@ void bpf_prog_kallsyms_add(struct bpf_prog *fp) fp->aux->ksym.prog = true; bpf_ksym_add(&fp->aux->ksym); + +#ifdef CONFIG_FINEIBT + /* + * When FineIBT, code in the __cfi_foo() symbols can get executed + * and hence unwinder needs help. + */ + if (cfi_mode != CFI_FINEIBT) + return; + + snprintf(fp->aux->ksym_prefix.name, KSYM_NAME_LEN, + "__cfi_%s", fp->aux->ksym.name); + + fp->aux->ksym_prefix.start = (unsigned long) fp->bpf_func - 16; + fp->aux->ksym_prefix.end = (unsigned long) fp->bpf_func; + + bpf_ksym_add(&fp->aux->ksym_prefix); +#endif } void bpf_prog_kallsyms_del(struct bpf_prog *fp) @@ -691,6 +711,11 @@ void bpf_prog_kallsyms_del(struct bpf_prog *fp) return; bpf_ksym_del(&fp->aux->ksym); +#ifdef CONFIG_FINEIBT + if (cfi_mode != CFI_FINEIBT) + return; + bpf_ksym_del(&fp->aux->ksym_prefix); +#endif } static struct bpf_ksym *bpf_ksym_find(unsigned long addr) -- cgit From 2cd3e3772e41377f32d6eea643e0590774e9187c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Dec 2023 10:12:20 +0100 Subject: x86/cfi,bpf: Fix bpf_struct_ops CFI BPF struct_ops uses __arch_prepare_bpf_trampoline() to write trampolines for indirect function calls. These tramplines much have matching CFI. In order to obtain the correct CFI hash for the various methods, add a matching structure that contains stub functions, the compiler will generate correct CFI which we can pilfer for the trampolines. Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20231215092707.566977112@infradead.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_struct_ops.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 4d53c53fc5aa..02068bd0e4d9 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -352,17 +352,16 @@ const struct bpf_link_ops bpf_struct_ops_link_lops = { int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, struct bpf_tramp_link *link, const struct btf_func_model *model, - void *image, void *image_end) + void *stub_func, void *image, void *image_end) { - u32 flags; + u32 flags = BPF_TRAMP_F_INDIRECT; int size; tlinks[BPF_TRAMP_FENTRY].links[0] = link; tlinks[BPF_TRAMP_FENTRY].nr_links = 1; - /* BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops, - * and it must be used alone. - */ - flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0; + + if (model->ret_size > 0) + flags |= BPF_TRAMP_F_RET_FENTRY_RET; size = arch_bpf_trampoline_size(model, flags, tlinks, NULL); if (size < 0) @@ -370,7 +369,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, if (size > (unsigned long)image_end - (unsigned long)image) return -E2BIG; return arch_prepare_bpf_trampoline(NULL, image, image_end, - model, flags, tlinks, NULL); + model, flags, tlinks, stub_func); } static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, @@ -504,11 +503,12 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, err = bpf_struct_ops_prepare_trampoline(tlinks, link, &st_ops->func_models[i], + *(void **)(st_ops->cfi_stubs + moff), image, image_end); if (err < 0) goto reset_unlock; - *(void **)(kdata + moff) = image; + *(void **)(kdata + moff) = image + cfi_get_offset(); image += err; /* put prog_id to udata */ -- cgit From e4c00339891c074c76f626ac82981963cbba5332 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Dec 2023 10:12:22 +0100 Subject: bpf: Fix dtor CFI Ensure the various dtor functions match their prototype and retain their CFI signatures, since they don't have their address taken, they are prone to not getting CFI, making them impossible to call indirectly. Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20231215092707.799451071@infradead.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumask.c | 8 +++++++- kernel/bpf/helpers.c | 16 ++++++++++++++-- 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index 7499b7d8c06f..2e73533a3811 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -96,6 +96,12 @@ __bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask) migrate_enable(); } +__bpf_kfunc void bpf_cpumask_release_dtor(void *cpumask) +{ + bpf_cpumask_release(cpumask); +} +CFI_NOSEAL(bpf_cpumask_release_dtor); + /** * bpf_cpumask_first() - Get the index of the first nonzero bit in the cpumask. * @cpumask: The cpumask being queried. @@ -453,7 +459,7 @@ static const struct btf_kfunc_id_set cpumask_kfunc_set = { BTF_ID_LIST(cpumask_dtor_ids) BTF_ID(struct, bpf_cpumask) -BTF_ID(func, bpf_cpumask_release) +BTF_ID(func, bpf_cpumask_release_dtor) static int __init cpumask_kfunc_init(void) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b0b485126a76..e0c0e3676df8 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2150,6 +2150,12 @@ __bpf_kfunc void bpf_task_release(struct task_struct *p) put_task_struct_rcu_user(p); } +__bpf_kfunc void bpf_task_release_dtor(void *p) +{ + put_task_struct_rcu_user(p); +} +CFI_NOSEAL(bpf_task_release_dtor); + #ifdef CONFIG_CGROUPS /** * bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by @@ -2174,6 +2180,12 @@ __bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp) cgroup_put(cgrp); } +__bpf_kfunc void bpf_cgroup_release_dtor(void *cgrp) +{ + cgroup_put(cgrp); +} +CFI_NOSEAL(bpf_cgroup_release_dtor); + /** * bpf_cgroup_ancestor - Perform a lookup on an entry in a cgroup's ancestor * array. A cgroup returned by this kfunc which is not subsequently stored in a @@ -2570,10 +2582,10 @@ static const struct btf_kfunc_id_set generic_kfunc_set = { BTF_ID_LIST(generic_dtor_ids) BTF_ID(struct, task_struct) -BTF_ID(func, bpf_task_release) +BTF_ID(func, bpf_task_release_dtor) #ifdef CONFIG_CGROUPS BTF_ID(struct, cgroup) -BTF_ID(func, bpf_cgroup_release) +BTF_ID(func, bpf_cgroup_release_dtor) #endif BTF_SET8_START(common_btf_ids) -- cgit From 852486b35f344887786d63250946dd921a05d7e8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 15 Dec 2023 10:12:23 +0100 Subject: x86/cfi,bpf: Fix bpf_exception_cb() signature As per the earlier patches, BPF sub-programs have bpf_callback_t signature and CFI expects callers to have matching signature. This is violated by bpf_prog_aux::bpf_exception_cb(). [peterz: Changelog] Reported-by: Peter Zijlstra Signed-off-by: Alexei Starovoitov Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/CAADnVQ+Z7UcXXBBhMubhcMM=R-dExk-uHtfOLtoLxQ1XxEpqEA@mail.gmail.com Link: https://lore.kernel.org/r/20231215092707.910319166@infradead.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index e0c0e3676df8..07fd4b5704f3 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2537,7 +2537,7 @@ __bpf_kfunc void bpf_throw(u64 cookie) * which skips compiler generated instrumentation to do the same. */ kasan_unpoison_task_stack_below((void *)(long)ctx.sp); - ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp); + ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp, 0, 0); WARN(1, "A call to BPF exception callback should never return\n"); } -- cgit From 8e432e6197cef6250dfd6fdffd41c06613c874ca Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 18 Dec 2023 09:36:01 -0800 Subject: bpf: Ensure precise is reset to false in __mark_reg_const_zero() It is safe to always start with imprecise SCALAR_VALUE register. Previously __mark_reg_const_zero() relied on caller to reset precise mark, but it's very error prone and we already missed it in a few places. So instead make __mark_reg_const_zero() reset precision always, as it's a safe default for SCALAR_VALUE. Explanation is basically the same as for why we are resetting (or rather not setting) precision in current state. If necessary, precision propagation will set it to precise correctly. As such, also remove a big comment about forward precision propagation in mark_reg_stack_read() and avoid unnecessarily setting precision to true after reading from STACK_ZERO stack. Again, precision propagation will correctly handle this, if that SCALAR_VALUE register will ever be needed to be precise. Reported-by: Maxim Mikityanskiy Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Maxim Mikityanskiy Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20231218173601.53047-1-andrii@kernel.org --- kernel/bpf/verifier.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1863826a4ac3..9456ee0ad129 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1777,10 +1777,14 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg) __mark_reg_known(reg, 0); } -static void __mark_reg_const_zero(struct bpf_reg_state *reg) +static void __mark_reg_const_zero(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) { __mark_reg_known(reg, 0); reg->type = SCALAR_VALUE; + /* all scalars are assumed imprecise initially (unless unprivileged, + * in which case everything is forced to be precise) + */ + reg->precise = !env->bpf_capable; } static void mark_reg_known_zero(struct bpf_verifier_env *env, @@ -4706,21 +4710,10 @@ static void mark_reg_stack_read(struct bpf_verifier_env *env, zeros++; } if (zeros == max_off - min_off) { - /* any access_size read into register is zero extended, - * so the whole register == const_zero - */ - __mark_reg_const_zero(&state->regs[dst_regno]); - /* backtracking doesn't support STACK_ZERO yet, - * so mark it precise here, so that later - * backtracking can stop here. - * Backtracking may not need this if this register - * doesn't participate in pointer adjustment. - * Forward propagation of precise flag is not - * necessary either. This mark is only to stop - * backtracking. Any register that contributed - * to const 0 was marked precise before spill. + /* Any access_size read into register is zero extended, + * so the whole register == const_zero. */ - state->regs[dst_regno].precise = true; + __mark_reg_const_zero(env, &state->regs[dst_regno]); } else { /* have read misc data from the stack */ mark_reg_unknown(env, state->regs, dst_regno); @@ -4803,11 +4796,11 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, if (spill_cnt == size && tnum_is_const(reg->var_off) && reg->var_off.value == 0) { - __mark_reg_const_zero(&state->regs[dst_regno]); + __mark_reg_const_zero(env, &state->regs[dst_regno]); /* this IS register fill, so keep insn_flags */ } else if (zero_cnt == size) { /* similarly to mark_reg_stack_read(), preserve zeroes */ - __mark_reg_const_zero(&state->regs[dst_regno]); + __mark_reg_const_zero(env, &state->regs[dst_regno]); insn_flags = 0; /* not restoring original register state */ } else { mark_reg_unknown(env, state->regs, dst_regno); @@ -7963,7 +7956,7 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, /* switch to DRAINED state, but keep the depth unchanged */ /* mark current iter state as drained and assume returned NULL */ cur_iter->iter.state = BPF_ITER_STATE_DRAINED; - __mark_reg_const_zero(&cur_fr->regs[BPF_REG_0]); + __mark_reg_const_zero(env, &cur_fr->regs[BPF_REG_0]); return 0; } -- cgit