author     Alexei Starovoitov <ast@kernel.org>    2025-03-03 17:40:14 -0800
committer  Alexei Starovoitov <ast@kernel.org>    2025-03-15 11:48:28 -0700
commit     3a6fa573c50f31d6ab8c8c3318a68d511c79f8fb (patch)
tree       b109c094280af113cda230fce761407b94cd9127 /kernel
parent     2941e215376399d5e71eddcd720f185e28ba2dbb (diff)
parent     2fb761823eadf2fdfb6fdf146c4b94807b4bc3ba (diff)
Merge branch 'timed-may_goto'
Kumar Kartikeya Dwivedi says:
====================
Timed may_goto
This series replaces the current implementation of cond_break, which
uses the may_goto instruction and counts 8 million iterations per stack
frame, with an implementation that samples time locally on the CPU.
This permits a loop to run for a longer time within a single program
invocation. The accounting is still done per stack frame, but the count
is now used to amortize the cost of sampling and checking the time
elapsed since the loop started.
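From the BPF program's point of view nothing changes; a waiting loop
keeps using cond_break. A minimal sketch (assuming the cond_break macro
from the selftests' bpf_experimental.h; the flag variable, section name
and program are illustrative, not part of this series):

    /* Sketch only: polls a flag until it is set or the loop budget expires. */
    #include <vmlinux.h>
    #include <bpf/bpf_helpers.h>
    #include "bpf_experimental.h"

    volatile int flag;

    SEC("syscall")
    int wait_for_flag(void *ctx)
    {
            while (!flag) {
                    /* Emits a may_goto check; on expiration it breaks out
                     * of the enclosing loop. */
                    cond_break;
            }
            return 0;
    }

    char _license[] SEC("license") = "GPL";

With this series, the verifier rewrites the underlying may_goto into the
timed form, so the loop above keeps polling until flag is set or roughly
250 ms of CPU time have elapsed in that stack frame, rather than a fixed
8 million iterations.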
This is needed to express more complicated algorithms (spin locks,
waiting loops, etc.) in BPF programs without false-positive expiration
of the loop. For instance, the plan is to use this to implement spin
locks for BPF arena [0].
For the following loop:
for (int i = 0;; i++) {}
Testing on a bare-metal Intel Sapphire Rapids server yields the following
numbers (averaged over 25 runs).
+-----------------------------+--------------+--------------+------------------+
| Loop type | Iterations | Time (ms) | Time/iter (ns) |
+-----------------------------+--------------+--------------+------------------+
| may_goto | 8388608 | 3 | 0.36 |
| timed_may_goto (count=65535)| 589674932 | 250 | 0.42 |
| bpf_for | 8388608 | 10 | 1.19 |
+-----------------------------+--------------+--------------+------------------+
Here, count is used to amortize the time sampling and checking logic.
Obviously, these numbers are the limit for an empty loop; depending on
the complexity of the loop body, the time spent in the loop can be
longer. Cancellations will address the task of imposing an upper bound
on program runtime. For now, the implementation only supports x86.
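(A rough consistency check on the amortization, using the numbers above:
589674932 iterations * 0.42 ns/iter ~= 248 ms, i.e. the ~250 ms budget,
and 589674932 / 65535 ~= 9000 calls into the time-sampling path, or
about one clock read every 28 us of looping.)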
[0]: https://lore.kernel.org/bpf/20250118162238.2621311-1-memxor@gmail.com
Changelog:
----------
v1 -> v2
v1: https://lore.kernel.org/bpf/20250302201348.940234-1-memxor@gmail.com
* Address comments from Alexei
  * Use kernel comment style for new code.
  * Remove p->count == 0 check in bpf_check_timed_may_goto.
  * Add comments on AX as argument/retval calling convention.
  * Add comments describing how the counting logic works.
  * Use BPF_EMIT_CALL instead of open-coding instruction encoding.
  * Change the 'if ax != 1 goto pc+X' condition to 'if ax != 0 goto pc+X'.
====================
Link: https://patch.msgid.link/20250304003239.2390751-1-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/core.c      26
-rw-r--r--  kernel/bpf/verifier.c  69
2 files changed, 87 insertions(+), 8 deletions(-)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index a0200fbbace9..e583c19a0291 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -3069,6 +3069,32 @@ void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp,
 {
 }
 
+bool __weak bpf_jit_supports_timed_may_goto(void)
+{
+	return false;
+}
+
+u64 __weak arch_bpf_timed_may_goto(void)
+{
+	return 0;
+}
+
+u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p)
+{
+	u64 time = ktime_get_mono_fast_ns();
+
+	/* Populate the timestamp for this stack frame, and refresh count. */
+	if (!p->timestamp) {
+		p->timestamp = time;
+		return BPF_MAX_TIMED_LOOPS;
+	}
+	/* Check if we've exhausted our time slice, and zero count. */
+	if (time - p->timestamp >= (NSEC_PER_SEC / 4))
+		return 0;
+	/* Refresh the count for the stack frame. */
+	return BPF_MAX_TIMED_LOOPS;
+}
+
 /* for configs without MMU or 32-bit */
 __weak const struct bpf_map_ops arena_map_ops;
 __weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 22c4edc8695c..4ec1d1aa25ea 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -21572,7 +21572,50 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			goto next_insn;
 		}
 
-		if (is_may_goto_insn(insn)) {
+		if (is_may_goto_insn(insn) && bpf_jit_supports_timed_may_goto()) {
+			int stack_off_cnt = -stack_depth - 16;
+
+			/*
+			 * Two 8 byte slots, depth-16 stores the count, and
+			 * depth-8 stores the start timestamp of the loop.
+			 *
+			 * The starting value of count is BPF_MAX_TIMED_LOOPS
+			 * (0xffff). Every iteration loads it and subs it by 1,
+			 * until the value becomes 0 in AX (thus, 1 in stack),
+			 * after which we call arch_bpf_timed_may_goto, which
+			 * either sets AX to 0xffff to keep looping, or to 0
+			 * upon timeout. AX is then stored into the stack. In
+			 * the next iteration, we either see 0 and break out, or
+			 * continue iterating until the next time value is 0
+			 * after subtraction, rinse and repeat.
+			 */
+			stack_depth_extra = 16;
+			insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off_cnt);
+			if (insn->off >= 0)
+				insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 5);
+			else
+				insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1);
+			insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
+			insn_buf[3] = BPF_JMP_IMM(BPF_JNE, BPF_REG_AX, 0, 2);
+			/*
+			 * AX is used as an argument to pass in stack_off_cnt
+			 * (to add to r10/fp), and also as the return value of
+			 * the call to arch_bpf_timed_may_goto.
+			 */
+			insn_buf[4] = BPF_MOV64_IMM(BPF_REG_AX, stack_off_cnt);
+			insn_buf[5] = BPF_EMIT_CALL(arch_bpf_timed_may_goto);
+			insn_buf[6] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off_cnt);
+			cnt = 7;
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta += cnt - 1;
+			env->prog = prog = new_prog;
+			insn = new_prog->insnsi + i + delta;
+			goto next_insn;
+		} else if (is_may_goto_insn(insn)) {
 			int stack_off = -stack_depth - 8;
 
 			stack_depth_extra = 8;
@@ -22113,23 +22156,33 @@ next_insn:
 	env->prog->aux->stack_depth = subprogs[0].stack_depth;
 	for (i = 0; i < env->subprog_cnt; i++) {
+		int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1;
 		int subprog_start = subprogs[i].start;
 		int stack_slots = subprogs[i].stack_extra / 8;
+		int slots = delta, cnt = 0;
 
 		if (!stack_slots)
 			continue;
-		if (stack_slots > 1) {
+		/* We need two slots in case timed may_goto is supported. */
+		if (stack_slots > slots) {
 			verbose(env, "verifier bug: stack_slots supports may_goto only\n");
 			return -EFAULT;
 		}
 
-		/* Add ST insn to subprog prologue to init extra stack */
-		insn_buf[0] = BPF_ST_MEM(BPF_DW, BPF_REG_FP,
-					 -subprogs[i].stack_depth, BPF_MAX_LOOPS);
+		stack_depth = subprogs[i].stack_depth;
+		if (bpf_jit_supports_timed_may_goto()) {
+			insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth,
+						     BPF_MAX_TIMED_LOOPS);
+			insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth + 8, 0);
+		} else {
+			/* Add ST insn to subprog prologue to init extra stack */
+			insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth,
+						     BPF_MAX_LOOPS);
+		}
 		/* Copy first actual insn to preserve it */
-		insn_buf[1] = env->prog->insnsi[subprog_start];
+		insn_buf[cnt++] = env->prog->insnsi[subprog_start];
 
-		new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, 2);
+		new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, cnt);
 		if (!new_prog)
 			return -ENOMEM;
 
 		env->prog = prog = new_prog;
@@ -22139,7 +22192,7 @@ next_insn:
 		 * to insn after BPF_ST that inits may_goto count.
 		 * Adjustment will succeed because bpf_patch_insn_data() didn't fail.
 		 */
-		WARN_ON(adjust_jmp_off(env->prog, subprog_start, 1));
+		WARN_ON(adjust_jmp_off(env->prog, subprog_start, delta));
 	}
 
 	/* Since poke tab is now finalized, publish aux to tracker. */
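Read together, the patched instruction sequence and bpf_check_timed_may_goto()
amount to roughly the following per-check logic. This is an illustrative C
rendering only: the function and parameter names below are made up, and on
x86 arch_bpf_timed_may_goto is a small stub that forms the struct
bpf_timed_may_goto pointer from the frame pointer plus the offset passed in
AX before calling bpf_check_timed_may_goto().

    /* Returns true to run another loop iteration, false to break out.
     * 'count' and 'timestamp' stand for the two 8-byte stack slots. */
    static bool timed_may_goto_step(u64 *count, u64 *timestamp)
    {
            u64 cnt = *count;

            if (cnt == 0)                   /* previous check timed out: exit the loop */
                    return false;
            if (--cnt == 0) {               /* budget used up: sample the clock (amortized) */
                    u64 time = ktime_get_mono_fast_ns();

                    if (!*timestamp) {
                            *timestamp = time;              /* first expiry: record loop start */
                            cnt = BPF_MAX_TIMED_LOOPS;      /* 0xffff: refresh the budget */
                    } else if (time - *timestamp >= NSEC_PER_SEC / 4) {
                            cnt = 0;                        /* ~250ms spent: next check breaks out */
                    } else {
                            cnt = BPF_MAX_TIMED_LOOPS;      /* still within the slice: keep looping */
                    }
            }
            *count = cnt;
            return true;
    }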