summaryrefslogtreecommitdiff
path: root/arch/arm64/kernel/ftrace.c
diff options
context:
space:
mode:
authorMark Rutland <mark.rutland@arm.com>2023-01-23 13:46:03 +0000
committerCatalin Marinas <catalin.marinas@arm.com>2023-01-24 11:49:43 +0000
commitbaaf553d3bc330697c68a00f96cf11f4edfeac7e (patch)
treee1ceac10497d693527717908067e5b53fa5afcdd /arch/arm64/kernel/ftrace.c
parent90955d778ad7873964a271852b1f24d31e00248b (diff)
arm64: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
This patch enables support for DYNAMIC_FTRACE_WITH_CALL_OPS on arm64. This allows each ftrace callsite to provide an ftrace_ops to the common ftrace trampoline, allowing each callsite to invoke distinct tracer functions without the need to fall back to list processing or to allocate custom trampolines for each callsite. This significantly speeds up cases where multiple distinct trace functions are used and callsites are mostly traced by a single tracer. The main idea is to place a pointer to the ftrace_ops as a literal at a fixed offset from the function entry point, which can be recovered by the common ftrace trampoline. Using a 64-bit literal avoids branch range limitations, and permits the ops to be swapped atomically without special considerations that apply to code-patching. In future this will also allow for the implementation of DYNAMIC_FTRACE_WITH_DIRECT_CALLS without branch range limitations by using additional fields in struct ftrace_ops. As noted in the core patch adding support for DYNAMIC_FTRACE_WITH_CALL_OPS, this approach allows for directly invoking ftrace_ops::func even for ftrace_ops which are dynamically-allocated (or part of a module), without going via ftrace_ops_list_func. Currently, this approach is not compatible with CLANG_CFI, as the presence/absence of pre-function NOPs changes the offset of the pre-function type hash, and there's no existing mechanism to ensure a consistent offset for instrumented and uninstrumented functions. When CLANG_CFI is enabled, the existing scheme with a global ops->func pointer is used, and there should be no functional change. I am currently working with others to allow the two to work together in future (though this will liekly require updated compiler support). I've benchamrked this with the ftrace_ops sample module [1], which is not currently upstream, but available at: https://lore.kernel.org/lkml/20230103124912.2948963-1-mark.rutland@arm.com git://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git ftrace-ops-sample-20230109 Using that module I measured the total time taken for 100,000 calls to a trivial instrumented function, with a number of tracers enabled with relevant filters (which would apply to the instrumented function) and a number of tracers enabled with irrelevant filters (which would not apply to the instrumented function). I tested on an M1 MacBook Pro, running under a HVF-accelerated QEMU VM (i.e. on real hardware). Before this patch: Number of tracers || Total time | Per-call average time (ns) Relevant | Irrelevant || (ns) | Total | Overhead =========+============++=============+==============+============ 0 | 0 || 94,583 | 0.95 | - 0 | 1 || 93,709 | 0.94 | - 0 | 2 || 93,666 | 0.94 | - 0 | 10 || 93,709 | 0.94 | - 0 | 100 || 93,792 | 0.94 | - ---------+------------++-------------+--------------+------------ 1 | 1 || 6,467,833 | 64.68 | 63.73 1 | 2 || 7,509,708 | 75.10 | 74.15 1 | 10 || 23,786,792 | 237.87 | 236.92 1 | 100 || 106,432,500 | 1,064.43 | 1063.38 ---------+------------++-------------+--------------+------------ 1 | 0 || 1,431,875 | 14.32 | 13.37 2 | 0 || 6,456,334 | 64.56 | 63.62 10 | 0 || 22,717,000 | 227.17 | 226.22 100 | 0 || 103,293,667 | 1032.94 | 1031.99 ---------+------------++-------------+--------------+-------------- Note: per-call overhead is estimated relative to the baseline case with 0 relevant tracers and 0 irrelevant tracers. After this patch Number of tracers || Total time | Per-call average time (ns) Relevant | Irrelevant || (ns) | Total | Overhead =========+============++=============+==============+============ 0 | 0 || 94,541 | 0.95 | - 0 | 1 || 93,666 | 0.94 | - 0 | 2 || 93,709 | 0.94 | - 0 | 10 || 93,667 | 0.94 | - 0 | 100 || 93,792 | 0.94 | - ---------+------------++-------------+--------------+------------ 1 | 1 || 281,000 | 2.81 | 1.86 1 | 2 || 281,042 | 2.81 | 1.87 1 | 10 || 280,958 | 2.81 | 1.86 1 | 100 || 281,250 | 2.81 | 1.87 ---------+------------++-------------+--------------+------------ 1 | 0 || 280,959 | 2.81 | 1.86 2 | 0 || 6,502,708 | 65.03 | 64.08 10 | 0 || 18,681,209 | 186.81 | 185.87 100 | 0 || 103,550,458 | 1,035.50 | 1034.56 ---------+------------++-------------+--------------+------------ Note: per-call overhead is estimated relative to the baseline case with 0 relevant tracers and 0 irrelevant tracers. As can be seen from the above: a) Whenever there is a single relevant tracer function associated with a tracee, the overhead of invoking the tracer is constant, and does not scale with the number of tracers which are *not* associated with that tracee. b) The overhead for a single relevant tracer has dropped to ~1/7 of the overhead prior to this series (from 13.37ns to 1.86ns). This is largely due to permitting calls to dynamically-allocated ftrace_ops without going through ftrace_ops_list_func. I've run the ftrace selftests from v6.2-rc3, which reports: | # of passed: 110 | # of failed: 0 | # of unresolved: 3 | # of untested: 0 | # of unsupported: 0 | # of xfailed: 1 | # of undefined(test bug): 0 ... where the unresolved entries were the tests for DIRECT functions (which are not supported), and the checkbashisms selftest (which is irrelevant here): | [8] Test ftrace direct functions against tracers [UNRESOLVED] | [9] Test ftrace direct functions against kprobes [UNRESOLVED] | [62] Meta-selftest: Checkbashisms [UNRESOLVED] ... with all other tests passing (or failing as expected). Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Florent Revest <revest@chromium.org> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Will Deacon <will@kernel.org> Link: https://lore.kernel.org/r/20230123134603.1064407-9-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Diffstat (limited to 'arch/arm64/kernel/ftrace.c')
-rw-r--r--arch/arm64/kernel/ftrace.c156
1 files changed, 156 insertions, 0 deletions
diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
index 38ebdf063255..5545fe1a9012 100644
--- a/arch/arm64/kernel/ftrace.c
+++ b/arch/arm64/kernel/ftrace.c
@@ -60,6 +60,89 @@ int ftrace_regs_query_register_offset(const char *name)
}
#endif
+unsigned long ftrace_call_adjust(unsigned long addr)
+{
+ /*
+ * When using mcount, addr is the address of the mcount call
+ * instruction, and no adjustment is necessary.
+ */
+ if (!IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_ARGS))
+ return addr;
+
+ /*
+ * When using patchable-function-entry without pre-function NOPS, addr
+ * is the address of the first NOP after the function entry point.
+ *
+ * The compiler has either generated:
+ *
+ * addr+00: func: NOP // To be patched to MOV X9, LR
+ * addr+04: NOP // To be patched to BL <caller>
+ *
+ * Or:
+ *
+ * addr-04: BTI C
+ * addr+00: func: NOP // To be patched to MOV X9, LR
+ * addr+04: NOP // To be patched to BL <caller>
+ *
+ * We must adjust addr to the address of the NOP which will be patched
+ * to `BL <caller>`, which is at `addr + 4` bytes in either case.
+ *
+ */
+ if (!IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS))
+ return addr + AARCH64_INSN_SIZE;
+
+ /*
+ * When using patchable-function-entry with pre-function NOPs, addr is
+ * the address of the first pre-function NOP.
+ *
+ * Starting from an 8-byte aligned base, the compiler has either
+ * generated:
+ *
+ * addr+00: NOP // Literal (first 32 bits)
+ * addr+04: NOP // Literal (last 32 bits)
+ * addr+08: func: NOP // To be patched to MOV X9, LR
+ * addr+12: NOP // To be patched to BL <caller>
+ *
+ * Or:
+ *
+ * addr+00: NOP // Literal (first 32 bits)
+ * addr+04: NOP // Literal (last 32 bits)
+ * addr+08: func: BTI C
+ * addr+12: NOP // To be patched to MOV X9, LR
+ * addr+16: NOP // To be patched to BL <caller>
+ *
+ * We must adjust addr to the address of the NOP which will be patched
+ * to `BL <caller>`, which is at either addr+12 or addr+16 depending on
+ * whether there is a BTI.
+ */
+
+ if (!IS_ALIGNED(addr, sizeof(unsigned long))) {
+ WARN_RATELIMIT(1, "Misaligned patch-site %pS\n",
+ (void *)(addr + 8));
+ return 0;
+ }
+
+ /* Skip the NOPs placed before the function entry point */
+ addr += 2 * AARCH64_INSN_SIZE;
+
+ /* Skip any BTI */
+ if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)) {
+ u32 insn = le32_to_cpu(*(__le32 *)addr);
+
+ if (aarch64_insn_is_bti(insn)) {
+ addr += AARCH64_INSN_SIZE;
+ } else if (insn != aarch64_insn_gen_nop()) {
+ WARN_RATELIMIT(1, "unexpected insn in patch-site %pS: 0x%08x\n",
+ (void *)addr, insn);
+ }
+ }
+
+ /* Skip the first NOP after function entry */
+ addr += AARCH64_INSN_SIZE;
+
+ return addr;
+}
+
/*
* Replace a single instruction, which may be a branch or NOP.
* If @validate == true, a replaced instruction is checked against 'old'.
@@ -98,6 +181,13 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
unsigned long pc;
u32 new;
+ /*
+ * When using CALL_OPS, the function to call is associated with the
+ * call site, and we don't have a global function pointer to update.
+ */
+ if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS))
+ return 0;
+
pc = (unsigned long)ftrace_call;
new = aarch64_insn_gen_branch_imm(pc, (unsigned long)func,
AARCH64_INSN_BRANCH_LINK);
@@ -176,6 +266,44 @@ static bool ftrace_find_callable_addr(struct dyn_ftrace *rec,
return true;
}
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
+static const struct ftrace_ops *arm64_rec_get_ops(struct dyn_ftrace *rec)
+{
+ const struct ftrace_ops *ops = NULL;
+
+ if (rec->flags & FTRACE_FL_CALL_OPS_EN) {
+ ops = ftrace_find_unique_ops(rec);
+ WARN_ON_ONCE(!ops);
+ }
+
+ if (!ops)
+ ops = &ftrace_list_ops;
+
+ return ops;
+}
+
+static int ftrace_rec_set_ops(const struct dyn_ftrace *rec,
+ const struct ftrace_ops *ops)
+{
+ unsigned long literal = ALIGN_DOWN(rec->ip - 12, 8);
+ return aarch64_insn_write_literal_u64((void *)literal,
+ (unsigned long)ops);
+}
+
+static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec)
+{
+ return ftrace_rec_set_ops(rec, &ftrace_nop_ops);
+}
+
+static int ftrace_rec_update_ops(struct dyn_ftrace *rec)
+{
+ return ftrace_rec_set_ops(rec, arm64_rec_get_ops(rec));
+}
+#else
+static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec) { return 0; }
+static int ftrace_rec_update_ops(struct dyn_ftrace *rec) { return 0; }
+#endif
+
/*
* Turn on the call to ftrace_caller() in instrumented function
*/
@@ -183,6 +311,11 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
{
unsigned long pc = rec->ip;
u32 old, new;
+ int ret;
+
+ ret = ftrace_rec_update_ops(rec);
+ if (ret)
+ return ret;
if (!ftrace_find_callable_addr(rec, NULL, &addr))
return -EINVAL;
@@ -193,6 +326,19 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
return ftrace_modify_code(pc, old, new, true);
}
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
+int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
+ unsigned long addr)
+{
+ if (WARN_ON_ONCE(old_addr != (unsigned long)ftrace_caller))
+ return -EINVAL;
+ if (WARN_ON_ONCE(addr != (unsigned long)ftrace_caller))
+ return -EINVAL;
+
+ return ftrace_rec_update_ops(rec);
+}
+#endif
+
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
/*
* The compiler has inserted two NOPs before the regular function prologue.
@@ -220,6 +366,11 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec)
{
unsigned long pc = rec->ip - AARCH64_INSN_SIZE;
u32 old, new;
+ int ret;
+
+ ret = ftrace_rec_set_nop_ops(rec);
+ if (ret)
+ return ret;
old = aarch64_insn_gen_nop();
new = aarch64_insn_gen_move_reg(AARCH64_INSN_REG_9,
@@ -237,9 +388,14 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
{
unsigned long pc = rec->ip;
u32 old = 0, new;
+ int ret;
new = aarch64_insn_gen_nop();
+ ret = ftrace_rec_set_nop_ops(rec);
+ if (ret)
+ return ret;
+
/*
* When using mcount, callsites in modules may have been initalized to
* call an arbitrary module PLT (which redirects to the _mcount stub)