summaryrefslogtreecommitdiff
path: root/arch/x86/net/bpf_jit_comp.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/net/bpf_jit_comp.c')
-rw-r--r--arch/x86/net/bpf_jit_comp.c387
1 files changed, 320 insertions, 67 deletions
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 438adb695daa..a5930042139d 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -701,6 +701,38 @@ static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg)
*pprog = prog;
}
+static void emit_movsx_reg(u8 **pprog, int num_bits, bool is64, u32 dst_reg,
+ u32 src_reg)
+{
+ u8 *prog = *pprog;
+
+ if (is64) {
+ /* movs[b,w,l]q dst, src */
+ if (num_bits == 8)
+ EMIT4(add_2mod(0x48, src_reg, dst_reg), 0x0f, 0xbe,
+ add_2reg(0xC0, src_reg, dst_reg));
+ else if (num_bits == 16)
+ EMIT4(add_2mod(0x48, src_reg, dst_reg), 0x0f, 0xbf,
+ add_2reg(0xC0, src_reg, dst_reg));
+ else if (num_bits == 32)
+ EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x63,
+ add_2reg(0xC0, src_reg, dst_reg));
+ } else {
+ /* movs[b,w]l dst, src */
+ if (num_bits == 8) {
+ EMIT4(add_2mod(0x40, src_reg, dst_reg), 0x0f, 0xbe,
+ add_2reg(0xC0, src_reg, dst_reg));
+ } else if (num_bits == 16) {
+ if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT1(add_2mod(0x40, src_reg, dst_reg));
+ EMIT3(add_2mod(0x0f, src_reg, dst_reg), 0xbf,
+ add_2reg(0xC0, src_reg, dst_reg));
+ }
+ }
+
+ *pprog = prog;
+}
+
/* Emit the suffix (ModR/M etc) for addressing *(ptr_reg + off) and val_reg */
static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off)
{
@@ -779,6 +811,29 @@ static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
*pprog = prog;
}
+/* LDSX: dst_reg = *(s8*)(src_reg + off) */
+static void emit_ldsx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
+{
+ u8 *prog = *pprog;
+
+ switch (size) {
+ case BPF_B:
+ /* Emit 'movsx rax, byte ptr [rax + off]' */
+ EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xBE);
+ break;
+ case BPF_H:
+ /* Emit 'movsx rax, word ptr [rax + off]' */
+ EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xBF);
+ break;
+ case BPF_W:
+ /* Emit 'movsx rax, dword ptr [rax+0x14]' */
+ EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x63);
+ break;
+ }
+ emit_insn_suffix(&prog, src_reg, dst_reg, off);
+ *pprog = prog;
+}
+
/* STX: *(u8*)(dst_reg + off) = src_reg */
static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{
@@ -1028,9 +1083,14 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
case BPF_ALU64 | BPF_MOV | BPF_X:
case BPF_ALU | BPF_MOV | BPF_X:
- emit_mov_reg(&prog,
- BPF_CLASS(insn->code) == BPF_ALU64,
- dst_reg, src_reg);
+ if (insn->off == 0)
+ emit_mov_reg(&prog,
+ BPF_CLASS(insn->code) == BPF_ALU64,
+ dst_reg, src_reg);
+ else
+ emit_movsx_reg(&prog, insn->off,
+ BPF_CLASS(insn->code) == BPF_ALU64,
+ dst_reg, src_reg);
break;
/* neg dst */
@@ -1134,15 +1194,26 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
/* mov rax, dst_reg */
emit_mov_reg(&prog, is64, BPF_REG_0, dst_reg);
- /*
- * xor edx, edx
- * equivalent to 'xor rdx, rdx', but one byte less
- */
- EMIT2(0x31, 0xd2);
+ if (insn->off == 0) {
+ /*
+ * xor edx, edx
+ * equivalent to 'xor rdx, rdx', but one byte less
+ */
+ EMIT2(0x31, 0xd2);
- /* div src_reg */
- maybe_emit_1mod(&prog, src_reg, is64);
- EMIT2(0xF7, add_1reg(0xF0, src_reg));
+ /* div src_reg */
+ maybe_emit_1mod(&prog, src_reg, is64);
+ EMIT2(0xF7, add_1reg(0xF0, src_reg));
+ } else {
+ if (BPF_CLASS(insn->code) == BPF_ALU)
+ EMIT1(0x99); /* cdq */
+ else
+ EMIT2(0x48, 0x99); /* cqo */
+
+ /* idiv src_reg */
+ maybe_emit_1mod(&prog, src_reg, is64);
+ EMIT2(0xF7, add_1reg(0xF8, src_reg));
+ }
if (BPF_OP(insn->code) == BPF_MOD &&
dst_reg != BPF_REG_3)
@@ -1262,6 +1333,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
break;
case BPF_ALU | BPF_END | BPF_FROM_BE:
+ case BPF_ALU64 | BPF_END | BPF_FROM_LE:
switch (imm32) {
case 16:
/* Emit 'ror %ax, 8' to swap lower 2 bytes */
@@ -1370,9 +1442,17 @@ st: if (is_imm8(insn->off))
case BPF_LDX | BPF_PROBE_MEM | BPF_W:
case BPF_LDX | BPF_MEM | BPF_DW:
case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
+ /* LDXS: dst_reg = *(s8*)(src_reg + off) */
+ case BPF_LDX | BPF_MEMSX | BPF_B:
+ case BPF_LDX | BPF_MEMSX | BPF_H:
+ case BPF_LDX | BPF_MEMSX | BPF_W:
+ case BPF_LDX | BPF_PROBE_MEMSX | BPF_B:
+ case BPF_LDX | BPF_PROBE_MEMSX | BPF_H:
+ case BPF_LDX | BPF_PROBE_MEMSX | BPF_W:
insn_off = insn->off;
- if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
+ if (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEMSX) {
/* Conservatively check that src_reg + insn->off is a kernel address:
* src_reg + insn->off >= TASK_SIZE_MAX + PAGE_SIZE
* src_reg is used as scratch for src_reg += insn->off and restored
@@ -1415,8 +1495,13 @@ st: if (is_imm8(insn->off))
start_of_ldx = prog;
end_of_jmp[-1] = start_of_ldx - end_of_jmp;
}
- emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn_off);
- if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
+ if (BPF_MODE(insn->code) == BPF_PROBE_MEMSX ||
+ BPF_MODE(insn->code) == BPF_MEMSX)
+ emit_ldsx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn_off);
+ else
+ emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn_off);
+ if (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEMSX) {
struct exception_table_entry *ex;
u8 *_insn = image + proglen + (start_of_ldx - temp);
s64 delta;
@@ -1730,16 +1815,24 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */
break;
case BPF_JMP | BPF_JA:
- if (insn->off == -1)
- /* -1 jmp instructions will always jump
- * backwards two bytes. Explicitly handling
- * this case avoids wasting too many passes
- * when there are long sequences of replaced
- * dead code.
- */
- jmp_offset = -2;
- else
- jmp_offset = addrs[i + insn->off] - addrs[i];
+ case BPF_JMP32 | BPF_JA:
+ if (BPF_CLASS(insn->code) == BPF_JMP) {
+ if (insn->off == -1)
+ /* -1 jmp instructions will always jump
+ * backwards two bytes. Explicitly handling
+ * this case avoids wasting too many passes
+ * when there are long sequences of replaced
+ * dead code.
+ */
+ jmp_offset = -2;
+ else
+ jmp_offset = addrs[i + insn->off] - addrs[i];
+ } else {
+ if (insn->imm == -1)
+ jmp_offset = -2;
+ else
+ jmp_offset = addrs[i + insn->imm] - addrs[i];
+ }
if (!jmp_offset) {
/*
@@ -1857,59 +1950,177 @@ emit_jmp:
return proglen;
}
-static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_regs,
- int stack_size)
+static void clean_stack_garbage(const struct btf_func_model *m,
+ u8 **pprog, int nr_stack_slots,
+ int stack_size)
+{
+ int arg_size, off;
+ u8 *prog;
+
+ /* Generally speaking, the compiler will pass the arguments
+ * on-stack with "push" instruction, which will take 8-byte
+ * on the stack. In this case, there won't be garbage values
+ * while we copy the arguments from origin stack frame to current
+ * in BPF_DW.
+ *
+ * However, sometimes the compiler will only allocate 4-byte on
+ * the stack for the arguments. For now, this case will only
+ * happen if there is only one argument on-stack and its size
+ * not more than 4 byte. In this case, there will be garbage
+ * values on the upper 4-byte where we store the argument on
+ * current stack frame.
+ *
+ * arguments on origin stack:
+ *
+ * stack_arg_1(4-byte) xxx(4-byte)
+ *
+ * what we copy:
+ *
+ * stack_arg_1(8-byte): stack_arg_1(origin) xxx
+ *
+ * and the xxx is the garbage values which we should clean here.
+ */
+ if (nr_stack_slots != 1)
+ return;
+
+ /* the size of the last argument */
+ arg_size = m->arg_size[m->nr_args - 1];
+ if (arg_size <= 4) {
+ off = -(stack_size - 4);
+ prog = *pprog;
+ /* mov DWORD PTR [rbp + off], 0 */
+ if (!is_imm8(off))
+ EMIT2_off32(0xC7, 0x85, off);
+ else
+ EMIT3(0xC7, 0x45, off);
+ EMIT(0, 4);
+ *pprog = prog;
+ }
+}
+
+/* get the count of the regs that are used to pass arguments */
+static int get_nr_used_regs(const struct btf_func_model *m)
{
- int i, j, arg_size;
- bool next_same_struct = false;
+ int i, arg_regs, nr_used_regs = 0;
+
+ for (i = 0; i < min_t(int, m->nr_args, MAX_BPF_FUNC_ARGS); i++) {
+ arg_regs = (m->arg_size[i] + 7) / 8;
+ if (nr_used_regs + arg_regs <= 6)
+ nr_used_regs += arg_regs;
+
+ if (nr_used_regs >= 6)
+ break;
+ }
+
+ return nr_used_regs;
+}
+
+static void save_args(const struct btf_func_model *m, u8 **prog,
+ int stack_size, bool for_call_origin)
+{
+ int arg_regs, first_off = 0, nr_regs = 0, nr_stack_slots = 0;
+ int i, j;
/* Store function arguments to stack.
* For a function that accepts two pointers the sequence will be:
* mov QWORD PTR [rbp-0x10],rdi
* mov QWORD PTR [rbp-0x8],rsi
*/
- for (i = 0, j = 0; i < min(nr_regs, 6); i++) {
- /* The arg_size is at most 16 bytes, enforced by the verifier. */
- arg_size = m->arg_size[j];
- if (arg_size > 8) {
- arg_size = 8;
- next_same_struct = !next_same_struct;
- }
+ for (i = 0; i < min_t(int, m->nr_args, MAX_BPF_FUNC_ARGS); i++) {
+ arg_regs = (m->arg_size[i] + 7) / 8;
- emit_stx(prog, bytes_to_bpf_size(arg_size),
- BPF_REG_FP,
- i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
- -(stack_size - i * 8));
+ /* According to the research of Yonghong, struct members
+ * should be all in register or all on the stack.
+ * Meanwhile, the compiler will pass the argument on regs
+ * if the remaining regs can hold the argument.
+ *
+ * Disorder of the args can happen. For example:
+ *
+ * struct foo_struct {
+ * long a;
+ * int b;
+ * };
+ * int foo(char, char, char, char, char, struct foo_struct,
+ * char);
+ *
+ * the arg1-5,arg7 will be passed by regs, and arg6 will
+ * by stack.
+ */
+ if (nr_regs + arg_regs > 6) {
+ /* copy function arguments from origin stack frame
+ * into current stack frame.
+ *
+ * The starting address of the arguments on-stack
+ * is:
+ * rbp + 8(push rbp) +
+ * 8(return addr of origin call) +
+ * 8(return addr of the caller)
+ * which means: rbp + 24
+ */
+ for (j = 0; j < arg_regs; j++) {
+ emit_ldx(prog, BPF_DW, BPF_REG_0, BPF_REG_FP,
+ nr_stack_slots * 8 + 0x18);
+ emit_stx(prog, BPF_DW, BPF_REG_FP, BPF_REG_0,
+ -stack_size);
+
+ if (!nr_stack_slots)
+ first_off = stack_size;
+ stack_size -= 8;
+ nr_stack_slots++;
+ }
+ } else {
+ /* Only copy the arguments on-stack to current
+ * 'stack_size' and ignore the regs, used to
+ * prepare the arguments on-stack for orign call.
+ */
+ if (for_call_origin) {
+ nr_regs += arg_regs;
+ continue;
+ }
- j = next_same_struct ? j : j + 1;
+ /* copy the arguments from regs into stack */
+ for (j = 0; j < arg_regs; j++) {
+ emit_stx(prog, BPF_DW, BPF_REG_FP,
+ nr_regs == 5 ? X86_REG_R9 : BPF_REG_1 + nr_regs,
+ -stack_size);
+ stack_size -= 8;
+ nr_regs++;
+ }
+ }
}
+
+ clean_stack_garbage(m, prog, nr_stack_slots, first_off);
}
-static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_regs,
+static void restore_regs(const struct btf_func_model *m, u8 **prog,
int stack_size)
{
- int i, j, arg_size;
- bool next_same_struct = false;
+ int i, j, arg_regs, nr_regs = 0;
/* Restore function arguments from stack.
* For a function that accepts two pointers the sequence will be:
* EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10]
* EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8]
+ *
+ * The logic here is similar to what we do in save_args()
*/
- for (i = 0, j = 0; i < min(nr_regs, 6); i++) {
- /* The arg_size is at most 16 bytes, enforced by the verifier. */
- arg_size = m->arg_size[j];
- if (arg_size > 8) {
- arg_size = 8;
- next_same_struct = !next_same_struct;
+ for (i = 0; i < min_t(int, m->nr_args, MAX_BPF_FUNC_ARGS); i++) {
+ arg_regs = (m->arg_size[i] + 7) / 8;
+ if (nr_regs + arg_regs <= 6) {
+ for (j = 0; j < arg_regs; j++) {
+ emit_ldx(prog, BPF_DW,
+ nr_regs == 5 ? X86_REG_R9 : BPF_REG_1 + nr_regs,
+ BPF_REG_FP,
+ -stack_size);
+ stack_size -= 8;
+ nr_regs++;
+ }
+ } else {
+ stack_size -= 8 * arg_regs;
}
- emit_ldx(prog, bytes_to_bpf_size(arg_size),
- i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
- BPF_REG_FP,
- -(stack_size - i * 8));
-
- j = next_same_struct ? j : j + 1;
+ if (nr_regs >= 6)
+ break;
}
}
@@ -1938,7 +2149,10 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
/* arg1: mov rdi, progs[i] */
emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
/* arg2: lea rsi, [rbp - ctx_cookie_off] */
- EMIT4(0x48, 0x8D, 0x75, -run_ctx_off);
+ if (!is_imm8(-run_ctx_off))
+ EMIT3_off32(0x48, 0x8D, 0xB5, -run_ctx_off);
+ else
+ EMIT4(0x48, 0x8D, 0x75, -run_ctx_off);
if (emit_rsb_call(&prog, bpf_trampoline_enter(p), prog))
return -EINVAL;
@@ -1954,7 +2168,10 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
emit_nops(&prog, 2);
/* arg1: lea rdi, [rbp - stack_size] */
- EMIT4(0x48, 0x8D, 0x7D, -stack_size);
+ if (!is_imm8(-stack_size))
+ EMIT3_off32(0x48, 0x8D, 0xBD, -stack_size);
+ else
+ EMIT4(0x48, 0x8D, 0x7D, -stack_size);
/* arg2: progs[i]->insnsi for interpreter */
if (!p->jited)
emit_mov_imm64(&prog, BPF_REG_2,
@@ -1984,7 +2201,10 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
/* arg2: mov rsi, rbx <- start time in nsec */
emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
/* arg3: lea rdx, [rbp - run_ctx_off] */
- EMIT4(0x48, 0x8D, 0x55, -run_ctx_off);
+ if (!is_imm8(-run_ctx_off))
+ EMIT3_off32(0x48, 0x8D, 0x95, -run_ctx_off);
+ else
+ EMIT4(0x48, 0x8D, 0x55, -run_ctx_off);
if (emit_rsb_call(&prog, bpf_trampoline_exit(p), prog))
return -EINVAL;
@@ -2136,7 +2356,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
void *func_addr)
{
int i, ret, nr_regs = m->nr_args, stack_size = 0;
- int regs_off, nregs_off, ip_off, run_ctx_off;
+ int regs_off, nregs_off, ip_off, run_ctx_off, arg_stack_off, rbx_off;
struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
@@ -2150,8 +2370,10 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
nr_regs += (m->arg_size[i] + 7) / 8 - 1;
- /* x86-64 supports up to 6 arguments. 7+ can be added in the future */
- if (nr_regs > 6)
+ /* x86-64 supports up to MAX_BPF_FUNC_ARGS arguments. 1-6
+ * are passed through regs, the remains are through stack.
+ */
+ if (nr_regs > MAX_BPF_FUNC_ARGS)
return -ENOTSUPP;
/* Generated trampoline stack layout:
@@ -2170,7 +2392,14 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
*
* RBP - ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag
*
+ * RBP - rbx_off [ rbx value ] always
+ *
* RBP - run_ctx_off [ bpf_tramp_run_ctx ]
+ *
+ * [ stack_argN ] BPF_TRAMP_F_CALL_ORIG
+ * [ ... ]
+ * [ stack_arg2 ]
+ * RBP - arg_stack_off [ stack_arg1 ]
*/
/* room for return value of orig_call or fentry prog */
@@ -2190,9 +2419,26 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
ip_off = stack_size;
+ stack_size += 8;
+ rbx_off = stack_size;
+
stack_size += (sizeof(struct bpf_tramp_run_ctx) + 7) & ~0x7;
run_ctx_off = stack_size;
+ if (nr_regs > 6 && (flags & BPF_TRAMP_F_CALL_ORIG)) {
+ /* the space that used to pass arguments on-stack */
+ stack_size += (nr_regs - get_nr_used_regs(m)) * 8;
+ /* make sure the stack pointer is 16-byte aligned if we
+ * need pass arguments on stack, which means
+ * [stack_size + 8(rbp) + 8(rip) + 8(origin rip)]
+ * should be 16-byte aligned. Following code depend on
+ * that stack_size is already 8-byte aligned.
+ */
+ stack_size += (stack_size % 16) ? 0 : 8;
+ }
+
+ arg_stack_off = stack_size;
+
if (flags & BPF_TRAMP_F_SKIP_FRAME) {
/* skip patched call instruction and point orig_call to actual
* body of the kernel function.
@@ -2212,8 +2458,14 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
x86_call_depth_emit_accounting(&prog, NULL);
EMIT1(0x55); /* push rbp */
EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
- EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */
- EMIT1(0x53); /* push rbx */
+ if (!is_imm8(stack_size))
+ /* sub rsp, stack_size */
+ EMIT3_off32(0x48, 0x81, 0xEC, stack_size);
+ else
+ /* sub rsp, stack_size */
+ EMIT4(0x48, 0x83, 0xEC, stack_size);
+ /* mov QWORD PTR [rbp - rbx_off], rbx */
+ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_6, -rbx_off);
/* Store number of argument registers of the traced function:
* mov rax, nr_regs
@@ -2231,7 +2483,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off);
}
- save_regs(m, &prog, nr_regs, regs_off);
+ save_args(m, &prog, regs_off, false);
if (flags & BPF_TRAMP_F_CALL_ORIG) {
/* arg1: mov rdi, im */
@@ -2261,7 +2513,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
}
if (flags & BPF_TRAMP_F_CALL_ORIG) {
- restore_regs(m, &prog, nr_regs, regs_off);
+ restore_regs(m, &prog, regs_off);
+ save_args(m, &prog, arg_stack_off, true);
if (flags & BPF_TRAMP_F_ORIG_STACK) {
emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8);
@@ -2302,7 +2555,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
}
if (flags & BPF_TRAMP_F_RESTORE_REGS)
- restore_regs(m, &prog, nr_regs, regs_off);
+ restore_regs(m, &prog, regs_off);
/* This needs to be done regardless. If there were fmod_ret programs,
* the return value is only updated on the stack and still needs to be
@@ -2321,7 +2574,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
if (save_ret)
emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8);
- EMIT1(0x5B); /* pop rbx */
+ emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, -rbx_off);
EMIT1(0xC9); /* leave */
if (flags & BPF_TRAMP_F_SKIP_FRAME)
/* skip our return address and return to parent */