Diffstat (limited to 'arch/x86/kernel/uprobes.c')
-rw-r--r--  arch/x86/kernel/uprobes.c  |  925
1 file changed, 877 insertions, 48 deletions
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 495c776de4b4..7be8e361ca55 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* User-space Probes (UProbes) for x86
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
* Copyright (C) IBM Corporation, 2008-2011
* Authors:
* Srikar Dronamraju
@@ -25,11 +12,14 @@
#include <linux/ptrace.h>
#include <linux/uprobes.h>
#include <linux/uaccess.h>
+#include <linux/syscalls.h>
#include <linux/kdebug.h>
#include <asm/processor.h>
#include <asm/insn.h>
+#include <asm/insn-eval.h>
#include <asm/mmu_context.h>
+#include <asm/nops.h>
/* Post-execution fixups. */
@@ -268,15 +258,18 @@ static volatile u32 good_2byte_insns[256 / 32] = {
static bool is_prefix_bad(struct insn *insn)
{
- int i;
-
- for (i = 0; i < insn->prefixes.nbytes; i++) {
- switch (insn->prefixes.bytes[i]) {
- case 0x26: /* INAT_PFX_ES */
- case 0x2E: /* INAT_PFX_CS */
- case 0x36: /* INAT_PFX_DS */
- case 0x3E: /* INAT_PFX_SS */
- case 0xF0: /* INAT_PFX_LOCK */
+ insn_byte_t p;
+
+ for_each_insn_prefix(insn, p) {
+ insn_attr_t attr;
+
+ attr = inat_get_opcode_attribute(p);
+ switch (attr) {
+ case INAT_MAKE_PREFIX(INAT_PFX_ES):
+ case INAT_MAKE_PREFIX(INAT_PFX_CS):
+ case INAT_MAKE_PREFIX(INAT_PFX_DS):
+ case INAT_MAKE_PREFIX(INAT_PFX_SS):
+ case INAT_MAKE_PREFIX(INAT_PFX_LOCK):
return true;
}
}
@@ -285,17 +278,21 @@ static bool is_prefix_bad(struct insn *insn)
static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool x86_64)
{
+ enum insn_mode m = x86_64 ? INSN_MODE_64 : INSN_MODE_32;
u32 volatile *good_insns;
+ int ret;
- insn_init(insn, auprobe->insn, sizeof(auprobe->insn), x86_64);
- /* has the side-effect of processing the entire instruction */
- insn_get_length(insn);
- if (WARN_ON_ONCE(!insn_complete(insn)))
+ ret = insn_decode(insn, auprobe->insn, sizeof(auprobe->insn), m);
+ if (ret < 0)
return -ENOEXEC;
if (is_prefix_bad(insn))
return -ENOTSUPP;
+ /* We should not singlestep on the exception masking instructions */
+ if (insn_masking_exception(insn))
+ return -ENOTSUPP;
+
if (x86_64)
good_insns = good_insns_64;
else
@@ -313,6 +310,134 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool
}
#ifdef CONFIG_X86_64
+
+struct uretprobe_syscall_args {
+ unsigned long r11;
+ unsigned long cx;
+ unsigned long ax;
+};
+
+asm (
+ ".pushsection .rodata\n"
+ ".global uretprobe_trampoline_entry\n"
+ "uretprobe_trampoline_entry:\n"
+ "push %rax\n"
+ "push %rcx\n"
+ "push %r11\n"
+ "mov $" __stringify(__NR_uretprobe) ", %rax\n"
+ "syscall\n"
+ ".global uretprobe_syscall_check\n"
+ "uretprobe_syscall_check:\n"
+ "pop %r11\n"
+ "pop %rcx\n"
+ /*
+ * The uretprobe syscall replaces stored %rax value with final
+ * return address, so we don't restore %rax in here and just
+ * call ret.
+ */
+ "ret\n"
+ "int3\n"
+ ".global uretprobe_trampoline_end\n"
+ "uretprobe_trampoline_end:\n"
+ ".popsection\n"
+);
+
+extern u8 uretprobe_trampoline_entry[];
+extern u8 uretprobe_trampoline_end[];
+extern u8 uretprobe_syscall_check[];
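
Aside (editorial illustration, not part of the patch): the trampoline pushes %rax, %rcx and %r11 in that order, so %r11 ends up at the lowest address and therefore at offset 0 of struct uretprobe_syscall_args when sys_uretprobe reads the frame at regs->sp. A minimal userspace sketch of that layout, assuming an LP64 target:

/* Userspace view of the stack at regs->sp on entry to sys_uretprobe. */
#include <assert.h>
#include <stddef.h>

struct uretprobe_syscall_args {
	unsigned long r11;	/* sp + 0,  pushed last by the trampoline  */
	unsigned long cx;	/* sp + 8                                   */
	unsigned long ax;	/* sp + 16, pushed first by the trampoline  */
};

static_assert(offsetof(struct uretprobe_syscall_args, r11) == 0,  "");
static_assert(offsetof(struct uretprobe_syscall_args, cx)  == 8,  "");
static_assert(offsetof(struct uretprobe_syscall_args, ax)  == 16, "");

int main(void) { return 0; }
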
+
+void *arch_uretprobe_trampoline(unsigned long *psize)
+{
+ static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+ struct pt_regs *regs = task_pt_regs(current);
+
+ /*
+ * At the moment the uretprobe syscall trampoline is supported
+ * only for native 64-bit processes; compat processes still use
+ * the standard breakpoint.
+ */
+ if (user_64bit_mode(regs)) {
+ *psize = uretprobe_trampoline_end - uretprobe_trampoline_entry;
+ return uretprobe_trampoline_entry;
+ }
+
+ *psize = UPROBE_SWBP_INSN_SIZE;
+ return &insn;
+}
+
+static unsigned long trampoline_check_ip(unsigned long tramp)
+{
+ return tramp + (uretprobe_syscall_check - uretprobe_trampoline_entry);
+}
+
+SYSCALL_DEFINE0(uretprobe)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ struct uretprobe_syscall_args args;
+ unsigned long err, ip, sp, tramp;
+
+ /* If there's no trampoline, we are called from wrong place. */
+ tramp = uprobe_get_trampoline_vaddr();
+ if (unlikely(tramp == UPROBE_NO_TRAMPOLINE_VADDR))
+ goto sigill;
+
+ /* Make sure the ip matches the only allowed sys_uretprobe caller. */
+ if (unlikely(regs->ip != trampoline_check_ip(tramp)))
+ goto sigill;
+
+ err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
+ if (err)
+ goto sigill;
+
+ /* expose the "right" values of r11/cx/ax/sp to uprobe_consumer/s */
+ regs->r11 = args.r11;
+ regs->cx = args.cx;
+ regs->ax = args.ax;
+ regs->sp += sizeof(args);
+ regs->orig_ax = -1;
+
+ ip = regs->ip;
+ sp = regs->sp;
+
+ uprobe_handle_trampoline(regs);
+
+ /*
+ * Some of the uprobe consumers have changed sp; there is nothing
+ * we can do, just return via iret.
+ * The same applies if shadow stack is enabled, in which case we
+ * need to skip the return through the user space stack address.
+ */
+ if (regs->sp != sp || shstk_is_enabled())
+ return regs->ax;
+ regs->sp -= sizeof(args);
+
+ /* for the case uprobe_consumer has changed r11/cx */
+ args.r11 = regs->r11;
+ args.cx = regs->cx;
+
+ /*
+ * The ax register is passed through as the return value, so we can
+ * use its slot on the stack for the ip value and jump to it through
+ * the trampoline's ret instruction.
+ */
+ args.ax = regs->ip;
+ regs->ip = ip;
+
+ err = copy_to_user((void __user *)regs->sp, &args, sizeof(args));
+ if (err)
+ goto sigill;
+
+ /* ensure sysret, see do_syscall_64() */
+ regs->r11 = regs->flags;
+ regs->cx = regs->ip;
+
+ return regs->ax;
+
+sigill:
+ force_sig(SIGILL);
+ return -1;
+}
+
/*
* If arch_uprobe->insn doesn't use rip-relative addressing, return
* immediately. Otherwise, rewrite the instruction so that it accesses
@@ -492,6 +617,558 @@ static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
*sr = utask->autask.saved_scratch_register;
}
}
+
+static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
+{
+ return -EPERM;
+}
+
+static struct page *tramp_mapping_pages[2] __ro_after_init;
+
+static struct vm_special_mapping tramp_mapping = {
+ .name = "[uprobes-trampoline]",
+ .mremap = tramp_mremap,
+ .pages = tramp_mapping_pages,
+};
+
+struct uprobe_trampoline {
+ struct hlist_node node;
+ unsigned long vaddr;
+};
+
+static bool is_reachable_by_call(unsigned long vtramp, unsigned long vaddr)
+{
+ long delta = (long)(vaddr + 5 - vtramp);
+
+ return delta >= INT_MIN && delta <= INT_MAX;
+}
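
Aside (not part of the patch): the check mirrors the reach of a call rel32, whose signed 32-bit displacement is measured from the end of the 5-byte instruction (vaddr + 5). A standalone sketch with two sample distances:

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

/* Same test as is_reachable_by_call(), in plain userspace C. */
static bool reachable_by_call(unsigned long vtramp, unsigned long vaddr)
{
	long delta = (long)(vaddr + 5 - vtramp);

	return delta >= INT_MIN && delta <= INT_MAX;
}

int main(void)
{
	unsigned long vaddr = 0x401000;

	printf("%d\n", reachable_by_call(vaddr + (1UL << 20), vaddr));	/* 1: 1 MiB away */
	printf("%d\n", reachable_by_call(vaddr + (3UL << 30), vaddr));	/* 0: 3 GiB away */
	return 0;
}
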
+
+static unsigned long find_nearest_trampoline(unsigned long vaddr)
+{
+ struct vm_unmapped_area_info info = {
+ .length = PAGE_SIZE,
+ .align_mask = ~PAGE_MASK,
+ };
+ unsigned long low_limit, high_limit;
+ unsigned long low_tramp, high_tramp;
+ unsigned long call_end = vaddr + 5;
+
+ if (check_add_overflow(call_end, INT_MIN, &low_limit))
+ low_limit = PAGE_SIZE;
+
+ high_limit = call_end + INT_MAX;
+
+ /* Search up from the caller address. */
+ info.low_limit = call_end;
+ info.high_limit = min(high_limit, TASK_SIZE);
+ high_tramp = vm_unmapped_area(&info);
+
+ /* Search down from the caller address. */
+ info.low_limit = max(low_limit, PAGE_SIZE);
+ info.high_limit = call_end;
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ low_tramp = vm_unmapped_area(&info);
+
+ if (IS_ERR_VALUE(high_tramp) && IS_ERR_VALUE(low_tramp))
+ return -ENOMEM;
+ if (IS_ERR_VALUE(high_tramp))
+ return low_tramp;
+ if (IS_ERR_VALUE(low_tramp))
+ return high_tramp;
+
+ /* Return address that's closest to the caller address. */
+ if (call_end - low_tramp < high_tramp - call_end)
+ return low_tramp;
+ return high_tramp;
+}
+
+static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ struct mm_struct *mm = current->mm;
+ struct uprobe_trampoline *tramp;
+ struct vm_area_struct *vma;
+
+ if (!user_64bit_mode(regs))
+ return NULL;
+
+ vaddr = find_nearest_trampoline(vaddr);
+ if (IS_ERR_VALUE(vaddr))
+ return NULL;
+
+ tramp = kzalloc(sizeof(*tramp), GFP_KERNEL);
+ if (unlikely(!tramp))
+ return NULL;
+
+ tramp->vaddr = vaddr;
+ vma = _install_special_mapping(mm, tramp->vaddr, PAGE_SIZE,
+ VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO,
+ &tramp_mapping);
+ if (IS_ERR(vma)) {
+ kfree(tramp);
+ return NULL;
+ }
+ return tramp;
+}
+
+static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new)
+{
+ struct uprobes_state *state = &current->mm->uprobes_state;
+ struct uprobe_trampoline *tramp = NULL;
+
+ if (vaddr > TASK_SIZE || vaddr < PAGE_SIZE)
+ return NULL;
+
+ hlist_for_each_entry(tramp, &state->head_tramps, node) {
+ if (is_reachable_by_call(tramp->vaddr, vaddr)) {
+ *new = false;
+ return tramp;
+ }
+ }
+
+ tramp = create_uprobe_trampoline(vaddr);
+ if (!tramp)
+ return NULL;
+
+ *new = true;
+ hlist_add_head(&tramp->node, &state->head_tramps);
+ return tramp;
+}
+
+static void destroy_uprobe_trampoline(struct uprobe_trampoline *tramp)
+{
+ /*
+ * We do not unmap and release uprobe trampoline page itself,
+ * because there's no easy way to make sure none of the threads
+ * is still inside the trampoline.
+ */
+ hlist_del(&tramp->node);
+ kfree(tramp);
+}
+
+void arch_uprobe_init_state(struct mm_struct *mm)
+{
+ INIT_HLIST_HEAD(&mm->uprobes_state.head_tramps);
+}
+
+void arch_uprobe_clear_state(struct mm_struct *mm)
+{
+ struct uprobes_state *state = &mm->uprobes_state;
+ struct uprobe_trampoline *tramp;
+ struct hlist_node *n;
+
+ hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node)
+ destroy_uprobe_trampoline(tramp);
+}
+
+static bool __in_uprobe_trampoline(unsigned long ip)
+{
+ struct vm_area_struct *vma = vma_lookup(current->mm, ip);
+
+ return vma && vma_is_special_mapping(vma, &tramp_mapping);
+}
+
+static bool in_uprobe_trampoline(unsigned long ip)
+{
+ struct mm_struct *mm = current->mm;
+ bool found, retry = true;
+ unsigned int seq;
+
+ rcu_read_lock();
+ if (mmap_lock_speculate_try_begin(mm, &seq)) {
+ found = __in_uprobe_trampoline(ip);
+ retry = mmap_lock_speculate_retry(mm, seq);
+ }
+ rcu_read_unlock();
+
+ if (retry) {
+ mmap_read_lock(mm);
+ found = __in_uprobe_trampoline(ip);
+ mmap_read_unlock(mm);
+ }
+ return found;
+}
+
+/*
+ * See the uprobe syscall trampoline: the call to the trampoline pushes
+ * the return address on the stack, then the trampoline itself pushes
+ * cx, r11 and ax.
+ */
+struct uprobe_syscall_args {
+ unsigned long ax;
+ unsigned long r11;
+ unsigned long cx;
+ unsigned long retaddr;
+};
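
Aside (not part of the patch): a sketch of the resulting 32-byte frame and of why sys_uprobe below recovers the probed address as retaddr - 5 (the emitted call is 5 bytes, so its return address points just past the probe site). probed_vaddr() is an illustrative helper, not a kernel function:

#include <assert.h>
#include <stddef.h>

/* Frame at regs->sp on entry to sys_uprobe, lowest address first. */
struct uprobe_syscall_args {
	unsigned long ax;	/* + 0, pushed last by the trampoline        */
	unsigned long r11;	/* + 8                                       */
	unsigned long cx;	/* +16                                       */
	unsigned long retaddr;	/* +24, pushed by the call at the probe site */
};

static_assert(sizeof(struct uprobe_syscall_args) == 32, "");

/* The call is 5 bytes long, so the probe lives 5 bytes before retaddr. */
static unsigned long probed_vaddr(unsigned long retaddr)
{
	return retaddr - 5;
}

int main(void)
{
	return probed_vaddr(0x401005) == 0x401000 ? 0 : 1;
}
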
+
+SYSCALL_DEFINE0(uprobe)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ struct uprobe_syscall_args args;
+ unsigned long ip, sp, sret;
+ int err;
+
+ /* Allow execution only from uprobe trampolines. */
+ if (!in_uprobe_trampoline(regs->ip))
+ return -ENXIO;
+
+ err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
+ if (err)
+ goto sigill;
+
+ ip = regs->ip;
+
+ /*
+ * Expose the "right" values of ax/r11/cx/ip/sp to uprobe_consumer/s, plus:
+ * - adjust ip to the probe address (the call saved the next instruction's address)
+ * - adjust sp to the probe's stack frame (see the trampoline code)
+ */
+ regs->ax = args.ax;
+ regs->r11 = args.r11;
+ regs->cx = args.cx;
+ regs->ip = args.retaddr - 5;
+ regs->sp += sizeof(args);
+ regs->orig_ax = -1;
+
+ sp = regs->sp;
+
+ err = shstk_pop((u64 *)&sret);
+ if (err == -EFAULT || (!err && sret != args.retaddr))
+ goto sigill;
+
+ handle_syscall_uprobe(regs, regs->ip);
+
+ /*
+ * Some of the uprobe consumers have changed sp; there is nothing
+ * we can do, just return via iret.
+ */
+ if (regs->sp != sp) {
+ /* skip the trampoline call */
+ if (args.retaddr - 5 == regs->ip)
+ regs->ip += 5;
+ return regs->ax;
+ }
+
+ regs->sp -= sizeof(args);
+
+ /* for the case uprobe_consumer has changed ax/r11/cx */
+ args.ax = regs->ax;
+ args.r11 = regs->r11;
+ args.cx = regs->cx;
+
+ /* keep return address unless we are instructed otherwise */
+ if (args.retaddr - 5 != regs->ip)
+ args.retaddr = regs->ip;
+
+ if (shstk_push(args.retaddr) == -EFAULT)
+ goto sigill;
+
+ regs->ip = ip;
+
+ err = copy_to_user((void __user *)regs->sp, &args, sizeof(args));
+ if (err)
+ goto sigill;
+
+ /* ensure sysret, see do_syscall_64() */
+ regs->r11 = regs->flags;
+ regs->cx = regs->ip;
+ return 0;
+
+sigill:
+ force_sig(SIGILL);
+ return -1;
+}
+
+asm (
+ ".pushsection .rodata\n"
+ ".balign " __stringify(PAGE_SIZE) "\n"
+ "uprobe_trampoline_entry:\n"
+ "push %rcx\n"
+ "push %r11\n"
+ "push %rax\n"
+ "mov $" __stringify(__NR_uprobe) ", %rax\n"
+ "syscall\n"
+ "pop %rax\n"
+ "pop %r11\n"
+ "pop %rcx\n"
+ "ret\n"
+ "int3\n"
+ ".balign " __stringify(PAGE_SIZE) "\n"
+ ".popsection\n"
+);
+
+extern u8 uprobe_trampoline_entry[];
+
+static int __init arch_uprobes_init(void)
+{
+ tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry);
+ return 0;
+}
+
+late_initcall(arch_uprobes_init);
+
+enum {
+ EXPECT_SWBP,
+ EXPECT_CALL,
+};
+
+struct write_opcode_ctx {
+ unsigned long base;
+ int expect;
+};
+
+static int is_call_insn(uprobe_opcode_t *insn)
+{
+ return *insn == CALL_INSN_OPCODE;
+}
+
+/*
+ * Verification callback used by int3_update uprobe_write calls to make sure
+ * the underlying instruction is as expected - either int3 or call.
+ */
+static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode,
+ int nbytes, void *data)
+{
+ struct write_opcode_ctx *ctx = data;
+ uprobe_opcode_t old_opcode[5];
+
+ uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5);
+
+ switch (ctx->expect) {
+ case EXPECT_SWBP:
+ if (is_swbp_insn(&old_opcode[0]))
+ return 1;
+ break;
+ case EXPECT_CALL:
+ if (is_call_insn(&old_opcode[0]))
+ return 1;
+ break;
+ }
+
+ return -1;
+}
+
+/*
+ * Modify multi-byte instructions by using INT3 breakpoints on SMP.
+ * We completely avoid using stop_machine() here, and achieve the
+ * synchronization using INT3 breakpoints and SMP cross-calls.
+ * (borrowed comment from smp_text_poke_batch_finish)
+ *
+ * The way it is done:
+ * - Add an INT3 trap to the address that will be patched
+ * - SMP sync all CPUs
+ * - Update all but the first byte of the patched range
+ * - SMP sync all CPUs
+ * - Replace the first byte (INT3) by the first byte of the replacing opcode
+ * - SMP sync all CPUs
+ */
+static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr, char *insn, bool optimize)
+{
+ uprobe_opcode_t int3 = UPROBE_SWBP_INSN;
+ struct write_opcode_ctx ctx = {
+ .base = vaddr,
+ };
+ int err;
+
+ /*
+ * Write int3 trap.
+ *
+ * The swbp_optimize path comes with breakpoint already installed,
+ * so we can skip this step for optimize == true.
+ */
+ if (!optimize) {
+ ctx.expect = EXPECT_CALL;
+ err = uprobe_write(auprobe, vma, vaddr, &int3, 1, verify_insn,
+ true /* is_register */, false /* do_update_ref_ctr */,
+ &ctx);
+ if (err)
+ return err;
+ }
+
+ smp_text_poke_sync_each_cpu();
+
+ /* Write all but the first byte of the patched range. */
+ ctx.expect = EXPECT_SWBP;
+ err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1, 4, verify_insn,
+ true /* is_register */, false /* do_update_ref_ctr */,
+ &ctx);
+ if (err)
+ return err;
+
+ smp_text_poke_sync_each_cpu();
+
+ /*
+ * Write first byte.
+ *
+ * The swbp_unoptimize needs to finish uprobe removal together
+ * with ref_ctr update, using uprobe_write with proper flags.
+ */
+ err = uprobe_write(auprobe, vma, vaddr, insn, 1, verify_insn,
+ optimize /* is_register */, !optimize /* do_update_ref_ctr */,
+ &ctx);
+ if (err)
+ return err;
+
+ smp_text_poke_sync_each_cpu();
+ return 0;
+}
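
Aside (not part of the patch): a byte-level view of the three states the 5-byte probe site passes through when optimizing, assuming the original instruction was the conventional 5-byte NOP (0F 1F 44 00 00) and using made-up displacement bytes:

#include <stdint.h>

/* Made-up rel32 displacement bytes of the eventual call instruction. */
#define D0 0x12
#define D1 0x34
#define D2 0x56
#define D3 0x78

/* The 5-byte window after each uprobe_write() + SMP sync in int3_update(). */
static const uint8_t step0[5] = { 0xcc, 0x1f, 0x44, 0x00, 0x00 }; /* int3 from set_swbp(), NOP tail */
static const uint8_t step1[5] = { 0xcc, D0, D1, D2, D3 };	  /* bytes 1-4 now hold the rel32   */
static const uint8_t step2[5] = { 0xe8, D0, D1, D2, D3 };	  /* first byte flipped to call     */

int main(void)
{
	/* A CPU only ever sees the int3 or the complete call, never a mix. */
	return (step0[0] == 0xcc && step1[0] == 0xcc && step2[0] == 0xe8) ? 0 : 1;
}
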
+
+static int swbp_optimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr, unsigned long tramp)
+{
+ u8 call[5];
+
+ __text_gen_insn(call, CALL_INSN_OPCODE, (const void *) vaddr,
+ (const void *) tramp, CALL_INSN_SIZE);
+ return int3_update(auprobe, vma, vaddr, call, true /* optimize */);
+}
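
Aside (not part of the patch): the arithmetic behind the generated instruction, sketched in userspace C. gen_call() is an illustrative stand-in for what __text_gen_insn() produces here: opcode 0xE8 followed by the little-endian rel32 measured from the end of the 5-byte call, valid only because find_nearest_trampoline() keeps the target within ±2 GiB:

#include <stdint.h>
#include <string.h>

/* Build the 5-byte "call rel32" that swbp_optimize() installs at vaddr. */
static void gen_call(uint8_t buf[5], unsigned long vaddr, unsigned long tramp)
{
	int32_t rel = (int32_t)(tramp - (vaddr + 5));	/* from the end of the call */

	buf[0] = 0xe8;				/* CALL_INSN_OPCODE */
	memcpy(&buf[1], &rel, sizeof(rel));	/* rel32, little-endian on x86 */
}

int main(void)
{
	uint8_t call[5];

	gen_call(call, 0x1000, 0x2000);		/* rel32 = 0x2000 - 0x1005 = 0xffb */
	return (call[0] == 0xe8 && call[1] == 0xfb && call[2] == 0x0f) ? 0 : 1;
}
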
+
+static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
+{
+ return int3_update(auprobe, vma, vaddr, auprobe->insn, false /* optimize */);
+}
+
+static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len)
+{
+ unsigned int gup_flags = FOLL_FORCE|FOLL_SPLIT_PMD;
+ struct vm_area_struct *vma;
+ struct page *page;
+
+ page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+ uprobe_copy_from_page(page, vaddr, dst, len);
+ put_page(page);
+ return 0;
+}
+
+static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr)
+{
+ struct __packed __arch_relative_insn {
+ u8 op;
+ s32 raddr;
+ } *call = (struct __arch_relative_insn *) insn;
+
+ if (!is_call_insn(insn))
+ return false;
+ return __in_uprobe_trampoline(vaddr + 5 + call->raddr);
+}
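
Aside (not part of the patch): the inverse of the previous sketch; decoding the call target as vaddr + 5 + rel32 is what lets __is_optimized() ask whether the call lands in the trampoline vma. call_target() is illustrative only; memcpy() sidesteps the unaligned read the kernel handles with a packed struct:

#include <stdint.h>
#include <string.h>

/* Decode the target of a 5-byte "call rel32" located at vaddr. */
static unsigned long call_target(const uint8_t insn[5], unsigned long vaddr)
{
	int32_t rel;

	memcpy(&rel, &insn[1], sizeof(rel));
	return vaddr + 5 + (long)rel;
}

int main(void)
{
	const uint8_t insn[5] = { 0xe8, 0xfb, 0x0f, 0x00, 0x00 };

	return call_target(insn, 0x1000) == 0x2000 ? 0 : 1;
}
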
+
+static int is_optimized(struct mm_struct *mm, unsigned long vaddr)
+{
+ uprobe_opcode_t insn[5];
+ int err;
+
+ err = copy_from_vaddr(mm, vaddr, &insn, 5);
+ if (err)
+ return err;
+ return __is_optimized((uprobe_opcode_t *)&insn, vaddr);
+}
+
+static bool should_optimize(struct arch_uprobe *auprobe)
+{
+ return !test_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags) &&
+ test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
+}
+
+int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
+{
+ if (should_optimize(auprobe)) {
+ /*
+ * We could race with another thread that already optimized the probe,
+ * so let's not overwrite it with int3 again in this case.
+ */
+ int ret = is_optimized(vma->vm_mm, vaddr);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ return 0;
+ }
+ return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN,
+ true /* is_register */);
+}
+
+int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
+{
+ if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) {
+ int ret = is_optimized(vma->vm_mm, vaddr);
+ if (ret < 0)
+ return ret;
+ if (ret) {
+ ret = swbp_unoptimize(auprobe, vma, vaddr);
+ WARN_ON_ONCE(ret);
+ return ret;
+ }
+ }
+ return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn,
+ false /* is_register */);
+}
+
+static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm,
+ unsigned long vaddr)
+{
+ struct uprobe_trampoline *tramp;
+ struct vm_area_struct *vma;
+ bool new = false;
+ int err = 0;
+
+ vma = find_vma(mm, vaddr);
+ if (!vma)
+ return -EINVAL;
+ tramp = get_uprobe_trampoline(vaddr, &new);
+ if (!tramp)
+ return -EINVAL;
+ err = swbp_optimize(auprobe, vma, vaddr, tramp->vaddr);
+ if (WARN_ON_ONCE(err) && new)
+ destroy_uprobe_trampoline(tramp);
+ return err;
+}
+
+void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+ struct mm_struct *mm = current->mm;
+ uprobe_opcode_t insn[5];
+
+ if (!should_optimize(auprobe))
+ return;
+
+ mmap_write_lock(mm);
+
+ /*
+ * Check if some other thread already optimized the uprobe for us;
+ * if so, just go away silently.
+ */
+ if (copy_from_vaddr(mm, vaddr, &insn, 5))
+ goto unlock;
+ if (!is_swbp_insn((uprobe_opcode_t*) &insn))
+ goto unlock;
+
+ /*
+ * If we fail to optimize the uprobe, set the fail bit so the
+ * should_optimize() check above will reject it from now on.
+ */
+ if (__arch_uprobe_optimize(auprobe, mm, vaddr))
+ set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags);
+
+unlock:
+ mmap_write_unlock(mm);
+}
+
+static bool can_optimize(struct insn *insn, unsigned long vaddr)
+{
+ if (!insn->x86_64 || insn->length != 5)
+ return false;
+
+ if (!insn_is_nop(insn))
+ return false;
+
+ /* We can't do cross page atomic writes yet. */
+ return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5;
+}
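
Aside (not part of the patch): the two checks in userspace form, assuming 4 KiB pages and the conventional 5-byte NOP encoding. fits_in_page() is an illustrative helper mirroring the cross-page test (the kernel can't do cross-page atomic writes yet):

#include <stdbool.h>
#include <stdint.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))

/* Conventional 5-byte NOP: nopl 0x0(%rax,%rax,1). */
static const uint8_t nop5[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 };

/* Mirror of the cross-page test: the whole 5-byte call must sit in one page. */
static bool fits_in_page(unsigned long vaddr)
{
	return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5;
}

int main(void)
{
	/* 0x1ffe leaves only 2 bytes before the page boundary. */
	return (nop5[0] == 0x0f && fits_in_page(0x1000) && !fits_in_page(0x1ffe)) ? 0 : 1;
}
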
#else /* 32-bit: */
/*
* No RIP-relative addressing on 32-bit
@@ -505,6 +1182,10 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
}
+static bool can_optimize(struct insn *insn, unsigned long vaddr)
+{
+ return false;
+}
#endif /* CONFIG_X86_64 */
struct uprobe_xol_ops {
@@ -514,9 +1195,12 @@ struct uprobe_xol_ops {
void (*abort)(struct arch_uprobe *, struct pt_regs *);
};
-static inline int sizeof_long(void)
+static inline int sizeof_long(struct pt_regs *regs)
{
- return in_ia32_syscall() ? 4 : 8;
+ /*
+ * Check registers for mode as in_xxx_syscall() does not apply here.
+ */
+ return user_64bit_mode(regs) ? 8 : 4;
}
static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
@@ -525,11 +1209,11 @@ static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
return 0;
}
-static int push_ret_address(struct pt_regs *regs, unsigned long ip)
+static int emulate_push_stack(struct pt_regs *regs, unsigned long val)
{
- unsigned long new_sp = regs->sp - sizeof_long();
+ unsigned long new_sp = regs->sp - sizeof_long(regs);
- if (copy_to_user((void __user *)new_sp, &ip, sizeof_long()))
+ if (copy_to_user((void __user *)new_sp, &val, sizeof_long(regs)))
return -EFAULT;
regs->sp = new_sp;
@@ -562,8 +1246,8 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs
long correction = utask->vaddr - utask->xol_vaddr;
regs->ip += correction;
} else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) {
- regs->sp += sizeof_long(); /* Pop incorrect return address */
- if (push_ret_address(regs, utask->vaddr + auprobe->defparam.ilen))
+ regs->sp += sizeof_long(regs); /* Pop incorrect return address */
+ if (emulate_push_stack(regs, utask->vaddr + auprobe->defparam.ilen))
return -ERESTART;
}
/* popf; tell the caller to not touch TF */
@@ -652,7 +1336,7 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
*
* But there is corner case, see the comment in ->post_xol().
*/
- if (push_ret_address(regs, new_ip))
+ if (emulate_push_stack(regs, new_ip))
return false;
} else if (!check_jmp_cond(auprobe, regs)) {
offs = 0;
@@ -662,6 +1346,16 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
return true;
}
+static bool push_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ unsigned long *src_ptr = (void *)regs + auprobe->push.reg_offset;
+
+ if (emulate_push_stack(regs, *src_ptr))
+ return false;
+ regs->ip += auprobe->push.ilen;
+ return true;
+}
+
static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
BUG_ON(!branch_is_call(auprobe));
@@ -671,7 +1365,7 @@ static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
* "call" insn was executed out-of-line. Just restore ->sp and restart.
* We could also restore ->ip and try to call branch_emulate_op() again.
*/
- regs->sp += sizeof_long();
+ regs->sp += sizeof_long(regs);
return -ERESTART;
}
@@ -700,16 +1394,22 @@ static const struct uprobe_xol_ops branch_xol_ops = {
.post_xol = branch_post_xol_op,
};
+static const struct uprobe_xol_ops push_xol_ops = {
+ .emulate = push_emulate_op,
+};
+
/* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */
static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
{
u8 opc1 = OPCODE1(insn);
- int i;
+ insn_byte_t p;
+
+ if (insn_is_nop(insn))
+ goto setup;
switch (opc1) {
case 0xeb: /* jmp 8 */
case 0xe9: /* jmp 32 */
- case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */
break;
case 0xe8: /* call relative */
@@ -724,6 +1424,7 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
* OPCODE1() of the "short" jmp which checks the same condition.
*/
opc1 = OPCODE2(insn) - 0x10;
+ fallthrough;
default:
if (!is_cond_jmp_opcode(opc1))
return -ENOSYS;
@@ -734,11 +1435,12 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
* Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix.
* No one uses these insns, reject any branch insns with such prefix.
*/
- for (i = 0; i < insn->prefixes.nbytes; i++) {
- if (insn->prefixes.bytes[i] == 0x66)
+ for_each_insn_prefix(insn, p) {
+ if (p == 0x66)
return -ENOTSUPP;
}
+setup:
auprobe->branch.opc1 = opc1;
auprobe->branch.ilen = insn->length;
auprobe->branch.offs = insn->immediate.value;
@@ -747,27 +1449,115 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
return 0;
}
+/* Returns -ENOSYS if push_xol_ops doesn't handle this insn */
+static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
+{
+ u8 opc1 = OPCODE1(insn), reg_offset = 0;
+
+ if (opc1 < 0x50 || opc1 > 0x57)
+ return -ENOSYS;
+
+ if (insn->length > 2)
+ return -ENOSYS;
+ if (insn->length == 2) {
+ /* only support rex_prefix 0x41 (x64 only) */
+#ifdef CONFIG_X86_64
+ if (insn->rex_prefix.nbytes != 1 ||
+ insn->rex_prefix.bytes[0] != 0x41)
+ return -ENOSYS;
+
+ switch (opc1) {
+ case 0x50:
+ reg_offset = offsetof(struct pt_regs, r8);
+ break;
+ case 0x51:
+ reg_offset = offsetof(struct pt_regs, r9);
+ break;
+ case 0x52:
+ reg_offset = offsetof(struct pt_regs, r10);
+ break;
+ case 0x53:
+ reg_offset = offsetof(struct pt_regs, r11);
+ break;
+ case 0x54:
+ reg_offset = offsetof(struct pt_regs, r12);
+ break;
+ case 0x55:
+ reg_offset = offsetof(struct pt_regs, r13);
+ break;
+ case 0x56:
+ reg_offset = offsetof(struct pt_regs, r14);
+ break;
+ case 0x57:
+ reg_offset = offsetof(struct pt_regs, r15);
+ break;
+ }
+#else
+ return -ENOSYS;
+#endif
+ } else {
+ switch (opc1) {
+ case 0x50:
+ reg_offset = offsetof(struct pt_regs, ax);
+ break;
+ case 0x51:
+ reg_offset = offsetof(struct pt_regs, cx);
+ break;
+ case 0x52:
+ reg_offset = offsetof(struct pt_regs, dx);
+ break;
+ case 0x53:
+ reg_offset = offsetof(struct pt_regs, bx);
+ break;
+ case 0x54:
+ reg_offset = offsetof(struct pt_regs, sp);
+ break;
+ case 0x55:
+ reg_offset = offsetof(struct pt_regs, bp);
+ break;
+ case 0x56:
+ reg_offset = offsetof(struct pt_regs, si);
+ break;
+ case 0x57:
+ reg_offset = offsetof(struct pt_regs, di);
+ break;
+ }
+ }
+
+ auprobe->push.reg_offset = reg_offset;
+ auprobe->push.ilen = insn->length;
+ auprobe->ops = &push_xol_ops;
+ return 0;
+}
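
Aside (not part of the patch): the encoding that the table above decodes: a one-byte push is 0x50 + reg for the low eight GPRs, and a 0x41 REX.B prefix selects r8-r15, e.g. push %r12 is 41 54. decode_push() is an illustrative helper, not a kernel function:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Return the architectural register number 0-15, or -1 if not a plain push. */
static int decode_push(const uint8_t *insn, int len)
{
	bool rex_b = false;

	if (len == 2 && insn[0] == 0x41) {	/* REX.B only, as the code above allows */
		rex_b = true;
		insn++;
	} else if (len != 1) {
		return -1;
	}
	if (insn[0] < 0x50 || insn[0] > 0x57)
		return -1;
	return (insn[0] - 0x50) + (rex_b ? 8 : 0);
}

int main(void)
{
	const uint8_t push_r12[] = { 0x41, 0x54 };	/* push %r12 */
	const uint8_t push_ax[]  = { 0x50 };		/* push %rax */

	printf("%d %d\n", decode_push(push_r12, 2), decode_push(push_ax, 1));	/* 12 0 */
	return 0;
}
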
+
/**
* arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
+ * @auprobe: the probepoint information.
* @mm: the probed address space.
- * @arch_uprobe: the probepoint information.
* @addr: virtual address at which to install the probepoint
* Return 0 on success or a -ve number on error.
*/
int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
{
- struct insn insn;
u8 fix_ip_or_call = UPROBE_FIX_IP;
+ struct insn insn;
int ret;
ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm));
if (ret)
return ret;
+ if (can_optimize(&insn, addr))
+ set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
+
ret = branch_setup_xol_ops(auprobe, &insn);
if (ret != -ENOSYS)
return ret;
+ ret = push_setup_xol_ops(auprobe, &insn);
+ if (ret != -ENOSYS)
+ return ret;
+
/*
* Figure out which fixups default_post_xol_op() will need to perform,
* and annotate defparam->fixups accordingly.
@@ -795,7 +1585,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm,
fix_ip_or_call = 0;
break;
}
- /* fall through */
+ fallthrough;
default:
riprel_analyze(auprobe, &insn);
}
@@ -918,6 +1708,8 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val,
if (uprobe_post_sstep_notifier(regs))
ret = NOTIFY_STOP;
+ break;
+
default:
break;
}
@@ -962,7 +1754,7 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
unsigned long
arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs)
{
- int rasize = sizeof_long(), nleft;
+ int rasize = sizeof_long(regs), nleft;
unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */
if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize))
@@ -973,14 +1765,19 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs
return orig_ret_vaddr;
nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize);
- if (likely(!nleft))
+ if (likely(!nleft)) {
+ if (shstk_update_last_frame(trampoline_vaddr)) {
+ force_sig(SIGSEGV);
+ return -1;
+ }
return orig_ret_vaddr;
+ }
if (nleft != rasize) {
- pr_err("uprobe: return address clobbered: pid=%d, %%sp=%#lx, "
- "%%ip=%#lx\n", current->pid, regs->sp, regs->ip);
+ pr_err("return address clobbered: pid=%d, %%sp=%#lx, %%ip=%#lx\n",
+ current->pid, regs->sp, regs->ip);
- force_sig_info(SIGSEGV, SEND_SIG_FORCED, current);
+ force_sig(SIGSEGV);
}
return -1;
@@ -994,3 +1791,35 @@ bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
else
return regs->sp <= ret->stack;
}
+
+/*
+ * Heuristic-based check if uprobe is installed at the function entry.
+ *
+ * Under the assumption that user code is compiled with frame pointers,
+ * `push %rbp/%ebp` is a good indicator that we indeed are at the entry.
+ *
+ * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern.
+ * If we get this wrong, captured stack trace might have one extra bogus
+ * entry, but the rest of stack trace will still be meaningful.
+ */
+bool is_uprobe_at_func_entry(struct pt_regs *regs)
+{
+ struct arch_uprobe *auprobe;
+
+ if (!current->utask)
+ return false;
+
+ auprobe = current->utask->auprobe;
+ if (!auprobe)
+ return false;
+
+ /* push %rbp/%ebp */
+ if (auprobe->insn[0] == 0x55)
+ return true;
+
+ /* endbr64 (64-bit only) */
+ if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn))
+ return true;
+
+ return false;
+}
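
Aside (not part of the patch): the two byte patterns the heuristic matches, in a standalone sketch. push %rbp is the single byte 0x55 and endbr64 encodes as F3 0F 1E FA; looks_like_func_entry() is illustrative only:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* Userspace mirror of the heuristic: does this look like a function entry? */
static bool looks_like_func_entry(const uint8_t *insn)
{
	static const uint8_t endbr64[4] = { 0xf3, 0x0f, 0x1e, 0xfa };

	return insn[0] == 0x55 ||			/* push %rbp */
	       !memcmp(insn, endbr64, sizeof(endbr64));	/* endbr64   */
}

int main(void)
{
	const uint8_t prologue[] = { 0x55, 0x48, 0x89, 0xe5 };	/* push %rbp; mov %rsp,%rbp */

	return looks_like_func_entry(prologue) ? 0 : 1;
}
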