summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
author Kumar Kartikeya Dwivedi <memxor@gmail.com> 2023-09-13 01:32:01 +0200
committer Alexei Starovoitov <ast@kernel.org> 2023-09-16 09:34:21 -0700
commit f18b03fabaa9b7c80e80b72a621f481f0d706ae0 (patch)
tree 7410a260abf0aa01f1c21389e9c89c16089ded8e /kernel
parent 335d1c5b545284d75ef96ee42e461eacefe865bb (diff)
bpf: Implement BPF exceptions
This patch implements BPF exceptions, and introduces a bpf_throw kfunc to allow programs to throw exceptions during their execution at runtime. A bpf_throw invocation is treated as an immediate termination of the program, returning back to its caller within the kernel, unwinding all stack frames. This allows the program to simplify its implementation, by testing for runtime conditions which the verifier has no visibility into, and asserting that they are true. In case they are not, the program can simply throw an exception from the other branch. BPF exceptions are explicitly *NOT* an unlikely slowpath error handling primitive, and this objective has guided the design choices of their implementation within the kernel (with the bulk of the cost for unwinding the stack offloaded to the bpf_throw kfunc). The implementation of this mechanism requires use of the add_hidden_subprog mechanism introduced in the previous patch, which generates a couple of instructions to move R1 to R0 and exit. The JIT then rewrites the prologue of this subprog to take the stack pointer and frame pointer as inputs and reset the stack frame, popping all callee-saved registers saved by the main subprog. The bpf_throw function then walks the stack at runtime, and invokes this exception subprog with the stack and frame pointers as parameters. Reviewers must take note that currently the main program is made to save all callee-saved registers on x86_64 during entry into the program. This is because we must do an equivalent of a lightweight context switch when unwinding the stack, therefore we need the callee-saved registers of the caller of the BPF program to be able to return with a sane state. Note that we have to additionally handle r12, even though it is not used by the program, because when throwing the exception the program makes an entry into the kernel which could clobber r12 after saving it on the stack. 
To be able to preserve the value we received on program entry, we push r12 and restore it from the generated subprogram when unwinding the stack. For now, bpf_throw invocation fails when lingering resources or locks exist in that path of the program. In a future followup, bpf_throw will be extended to perform frame-by-frame unwinding to release lingering resources for each stack frame, removing this limitation. Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com> Link: https://lore.kernel.org/r/20230912233214.1518551-5-memxor@gmail.com Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/bpf/core.c 2
-rw-r--r-- kernel/bpf/helpers.c 38
-rw-r--r-- kernel/bpf/verifier.c 116
3 files changed, 141 insertions, 15 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 840ba952702d..7849b9cca749 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -733,7 +733,7 @@ bool is_bpf_text_address(unsigned long addr)
return ret;
}
-static struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
+struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
{
struct bpf_ksym *ksym = bpf_ksym_find(addr);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index b0a9834f1051..78e8f4de6750 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2449,6 +2449,43 @@ __bpf_kfunc void bpf_rcu_read_unlock(void)
rcu_read_unlock();
}
+struct bpf_throw_ctx { /* state that bpf_stack_walker fills in on behalf of bpf_throw */
+ struct bpf_prog_aux *aux; /* aux of the first BPF program frame that is not a subprog */
+ u64 sp; /* sp value the stack walker reported for that frame */
+ u64 bp; /* bp value the stack walker reported for that frame */
+ int cnt; /* number of BPF text frames visited so far */
+};
+
+static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp) /* per-frame callback that bpf_throw passes to arch_bpf_stack_walk */
+{
+ struct bpf_throw_ctx *ctx = cookie; /* cookie is the bpf_throw_ctx supplied by bpf_throw */
+ struct bpf_prog *prog;
+
+ if (!is_bpf_text_address(ip)) /* this frame is not BPF program text */
+ return !ctx->cnt; /* true until a BPF frame has been seen, false afterwards; presumably true means keep walking - confirm against arch_bpf_stack_walk */
+ prog = bpf_prog_ksym_find(ip); /* NOTE(review): result is used below without a NULL check - confirm it cannot fail here */
+ ctx->cnt++;
+ if (bpf_is_subprog(prog)) /* subprog frames are counted but not recorded */
+ return true;
+ ctx->aux = prog->aux; /* first non-subprog program: record it together with this frame's sp/bp */
+ ctx->sp = sp;
+ ctx->bp = bp;
+ return false;
+}
+
+__bpf_kfunc void bpf_throw(u64 cookie) /* kfunc: walk the stack, then invoke the found program's exception callback with cookie */
+{
+ struct bpf_throw_ctx ctx = {};
+
+ arch_bpf_stack_walk(bpf_stack_walker, &ctx); /* ctx is populated by bpf_stack_walker */
+ WARN_ON_ONCE(!ctx.aux); /* the walker recorded no non-subprog BPF program */
+ if (ctx.aux)
+ WARN_ON_ONCE(!ctx.aux->exception_boundary); /* the recorded program was not flagged as an exception boundary */
+ WARN_ON_ONCE(!ctx.bp);
+ WARN_ON_ONCE(!ctx.cnt); /* no BPF text frame was visited at all */
+ ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp); /* NOTE(review): ctx.aux is dereferenced here even when the first WARN above fired for a NULL aux */
+}
+
__diag_pop();
BTF_SET8_START(generic_btf_ids)
@@ -2478,6 +2515,7 @@ BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU)
#endif
BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_throw)
BTF_SET8_END(generic_btf_ids)
static const struct btf_kfunc_id_set generic_kfunc_set = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 39548e326d53..9baa6f187b38 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -543,6 +543,7 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id)
}
static bool is_callback_calling_kfunc(u32 btf_id);
+static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
static bool is_callback_calling_function(enum bpf_func_id func_id)
{
@@ -1748,7 +1749,9 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
return -ENOMEM;
dst_state->jmp_history_cnt = src->jmp_history_cnt;
- /* if dst has more stack frames then src frame, free them */
+ /* if dst has more stack frames then src frame, free them, this is also
+ * necessary in case of exceptional exits using bpf_throw.
+ */
for (i = src->curframe + 1; i <= dst_state->curframe; i++) {
free_func_state(dst_state->frame[i]);
dst_state->frame[i] = NULL;
@@ -2868,7 +2871,7 @@ next:
if (i == subprog_end - 1) {
/* to avoid fall-through from one subprog into another
* the last insn of the subprog should be either exit
- * or unconditional jump back
+ * or unconditional jump back or bpf_throw call
*/
if (code != (BPF_JMP | BPF_EXIT) &&
code != (BPF_JMP32 | BPF_JA) &&
@@ -5661,6 +5664,27 @@ continue_func:
for (; i < subprog_end; i++) {
int next_insn, sidx;
+ if (bpf_pseudo_kfunc_call(insn + i) && !insn[i].off) {
+ bool err = false;
+
+ if (!is_bpf_throw_kfunc(insn + i))
+ continue;
+ if (subprog[idx].is_cb)
+ err = true;
+ for (int c = 0; c < frame && !err; c++) {
+ if (subprog[ret_prog[c]].is_cb) {
+ err = true;
+ break;
+ }
+ }
+ if (!err)
+ continue;
+ verbose(env,
+ "bpf_throw kfunc (insn %d) cannot be called from callback subprog %d\n",
+ i, idx);
+ return -EINVAL;
+ }
+
if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
continue;
/* remember insn and function to return to */
@@ -8919,6 +8943,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
* callbacks
*/
if (set_callee_state_cb != set_callee_state) {
+ env->subprog_info[subprog].is_cb = true;
if (bpf_pseudo_kfunc_call(insn) &&
!is_callback_calling_kfunc(insn->imm)) {
verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
@@ -9308,7 +9333,8 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
verbose(env, "to caller at %d:\n", *insn_idx);
print_verifier_state(env, caller, true);
}
- /* clear everything in the callee */
+ /* clear everything in the callee. In case of exceptional exits using
+ * bpf_throw, this will be done by copy_verifier_state for extra frames. */
free_func_state(callee);
state->frame[state->curframe--] = NULL;
return 0;
@@ -9432,17 +9458,17 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
return 0;
}
-static int check_reference_leak(struct bpf_verifier_env *env)
+static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exit)
{
struct bpf_func_state *state = cur_func(env);
bool refs_lingering = false;
int i;
- if (state->frameno && !state->in_callback_fn)
+ if (!exception_exit && state->frameno && !state->in_callback_fn)
return 0;
for (i = 0; i < state->acquired_refs; i++) {
- if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
+ if (!exception_exit && state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
continue;
verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
state->refs[i].id, state->refs[i].insn_idx);
@@ -9697,7 +9723,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
switch (func_id) {
case BPF_FUNC_tail_call:
- err = check_reference_leak(env);
+ err = check_reference_leak(env, false);
if (err) {
verbose(env, "tail_call would lead to reference leak\n");
return err;
@@ -10332,6 +10358,7 @@ enum special_kfunc_type {
KF_bpf_dynptr_clone,
KF_bpf_percpu_obj_new_impl,
KF_bpf_percpu_obj_drop_impl,
+ KF_bpf_throw,
};
BTF_SET_START(special_kfunc_set)
@@ -10354,6 +10381,7 @@ BTF_ID(func, bpf_dynptr_slice_rdwr)
BTF_ID(func, bpf_dynptr_clone)
BTF_ID(func, bpf_percpu_obj_new_impl)
BTF_ID(func, bpf_percpu_obj_drop_impl)
+BTF_ID(func, bpf_throw)
BTF_SET_END(special_kfunc_set)
BTF_ID_LIST(special_kfunc_list)
@@ -10378,6 +10406,7 @@ BTF_ID(func, bpf_dynptr_slice_rdwr)
BTF_ID(func, bpf_dynptr_clone)
BTF_ID(func, bpf_percpu_obj_new_impl)
BTF_ID(func, bpf_percpu_obj_drop_impl)
+BTF_ID(func, bpf_throw)
static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -10695,6 +10724,12 @@ static bool is_callback_calling_kfunc(u32 btf_id)
return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
}
+static bool is_bpf_throw_kfunc(struct bpf_insn *insn) /* true when insn is a kfunc call whose imm matches the bpf_throw BTF id */
+{
+ return bpf_pseudo_kfunc_call(insn) && insn->off == 0 && /* NOTE(review): off == 0 presumably restricts the match to vmlinux BTF kfuncs - confirm */
+ insn->imm == special_kfunc_list[KF_bpf_throw];
+}
+
static bool is_rbtree_lock_required_kfunc(u32 btf_id)
{
return is_bpf_rbtree_api_kfunc(btf_id);
@@ -11480,6 +11515,15 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
}
+ if (meta.func_id == special_kfunc_list[KF_bpf_throw]) {
+ if (!bpf_jit_supports_exceptions()) {
+ verbose(env, "JIT does not support calling kfunc %s#%d\n",
+ func_name, meta.func_id);
+ return -ENOTSUPP;
+ }
+ env->seen_exception = true;
+ }
+
for (i = 0; i < CALLER_SAVED_REGS; i++)
mark_reg_not_init(env, regs, caller_saved[i]);
@@ -14525,7 +14569,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
* gen_ld_abs() may terminate the program at runtime, leading to
* reference leak.
*/
- err = check_reference_leak(env);
+ err = check_reference_leak(env, false);
if (err) {
verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n");
return err;
@@ -16539,6 +16583,7 @@ static int do_check(struct bpf_verifier_env *env)
int prev_insn_idx = -1;
for (;;) {
+ bool exception_exit = false;
struct bpf_insn *insn;
u8 class;
int err;
@@ -16753,12 +16798,17 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
}
- if (insn->src_reg == BPF_PSEUDO_CALL)
+ if (insn->src_reg == BPF_PSEUDO_CALL) {
err = check_func_call(env, insn, &env->insn_idx);
- else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
+ } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
err = check_kfunc_call(env, insn, &env->insn_idx);
- else
+ if (!err && is_bpf_throw_kfunc(insn)) {
+ exception_exit = true;
+ goto process_bpf_exit_full;
+ }
+ } else {
err = check_helper_call(env, insn, &env->insn_idx);
+ }
if (err)
return err;
@@ -16788,7 +16838,7 @@ static int do_check(struct bpf_verifier_env *env)
verbose(env, "BPF_EXIT uses reserved fields\n");
return -EINVAL;
}
-
+process_bpf_exit_full:
if (env->cur_state->active_lock.ptr &&
!in_rbtree_lock_required_cb(env)) {
verbose(env, "bpf_spin_unlock is missing\n");
@@ -16807,10 +16857,23 @@ static int do_check(struct bpf_verifier_env *env)
* function, for which reference_state must
* match caller reference state when it exits.
*/
- err = check_reference_leak(env);
+ err = check_reference_leak(env, exception_exit);
if (err)
return err;
+ /* The side effect of the prepare_func_exit
+ * which is being skipped is that it frees
+ * bpf_func_state. Typically, process_bpf_exit
+ * will only be hit with outermost exit.
+ * copy_verifier_state in pop_stack will handle
+ * freeing of any extra bpf_func_state left over
+ * from not processing all nested function
+ * exits. We also skip return code checks as
+ * they are not needed for exceptional exits.
+ */
+ if (exception_exit)
+ goto process_bpf_exit;
+
if (state->curframe) {
/* exit from nested function */
err = prepare_func_exit(env, &env->insn_idx);
@@ -18113,6 +18176,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
}
func[i]->aux->num_exentries = num_exentries;
func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
+ func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb;
+ if (!i)
+ func[i]->aux->exception_boundary = env->seen_exception;
func[i] = bpf_int_jit_compile(func[i]);
if (!func[i]->jited) {
err = -ENOTSUPP;
@@ -18201,6 +18267,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
prog->aux->func = func;
prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
prog->aux->real_func_cnt = env->subprog_cnt;
+ prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func;
+ prog->aux->exception_boundary = func[0]->aux->exception_boundary;
bpf_prog_jit_attempt_done(prog);
return 0;
out_free:
@@ -18437,7 +18505,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
/* The function requires that first instruction in 'patch' is insnsi[prog->len - 1] */
-static __maybe_unused int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len)
+static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len)
{
struct bpf_subprog_info *info = env->subprog_info;
int cnt = env->subprog_cnt;
@@ -18481,6 +18549,26 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
struct bpf_map *map_ptr;
int i, ret, cnt, delta = 0;
+ if (env->seen_exception && !env->exception_callback_subprog) {
+ struct bpf_insn patch[] = {
+ env->prog->insnsi[insn_cnt - 1],
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ };
+
+ ret = add_hidden_subprog(env, patch, ARRAY_SIZE(patch));
+ if (ret < 0)
+ return ret;
+ prog = env->prog;
+ insn = prog->insnsi;
+
+ env->exception_callback_subprog = env->subprog_cnt - 1;
+ /* Don't update insn_cnt, as add_hidden_subprog always appends insns */
+ env->subprog_info[env->exception_callback_subprog].is_cb = true;
+ env->subprog_info[env->exception_callback_subprog].is_async_cb = true;
+ env->subprog_info[env->exception_callback_subprog].is_exception_cb = true;
+ }
+
for (i = 0; i < insn_cnt; i++, insn++) {
/* Make divide-by-zero exceptions impossible. */
if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||