Diffstat (limited to 'kernel/events/uprobes.c')
| -rw-r--r-- | kernel/events/uprobes.c | 2447 |
1 file changed, 1745 insertions(+), 702 deletions(-)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f3569747d629..f11ceb8be8c4 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1,25 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * User-space Probes (UProbes) * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * Copyright (C) IBM Corporation, 2008-2012 * Authors: * Srikar Dronamraju * Jim Keniston - * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra */ #include <linux/kernel.h> @@ -27,14 +14,22 @@ #include <linux/pagemap.h> /* read_mapping_page */ #include <linux/slab.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/export.h> #include <linux/rmap.h> /* anon_vma_prepare */ -#include <linux/mmu_notifier.h> /* set_pte_at_notify */ -#include <linux/swap.h> /* try_to_free_swap */ +#include <linux/mmu_notifier.h> +#include <linux/swap.h> /* folio_free_swap */ #include <linux/ptrace.h> /* user_enable_single_step */ #include <linux/kdebug.h> /* notifier mechanism */ -#include "../../mm/internal.h" /* munlock_vma_page */ #include <linux/percpu-rwsem.h> +#include <linux/task_work.h> +#include <linux/shmem_fs.h> +#include <linux/khugepaged.h> +#include <linux/rcupdate_trace.h> +#include <linux/workqueue.h> +#include <linux/srcu.h> +#include <linux/oom.h> /* check_stable_address_space */ +#include <linux/pagewalk.h> #include <linux/uprobes.h> @@ -48,42 +43,87 @@ static struct rb_root uprobes_tree = RB_ROOT; */ #define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree) -static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ +static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */ +static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock); #define UPROBES_HASH_SZ 13 /* serialize uprobe->pending_list */ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) -static struct percpu_rw_semaphore dup_mmap_sem; +DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem); + +/* Covers return_instance's uprobe lifetime. 
*/ +DEFINE_STATIC_SRCU(uretprobes_srcu); /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 -/* Can skip singlestep */ -#define UPROBE_SKIP_SSTEP 1 struct uprobe { struct rb_node rb_node; /* node in the rb tree */ - atomic_t ref; + refcount_t ref; struct rw_semaphore register_rwsem; struct rw_semaphore consumer_rwsem; struct list_head pending_list; - struct uprobe_consumer *consumers; + struct list_head consumers; struct inode *inode; /* Also hold a ref to inode */ + union { + struct rcu_head rcu; + struct work_struct work; + }; loff_t offset; - unsigned long flags; + loff_t ref_ctr_offset; + unsigned long flags; /* "unsigned long" so bitops work */ + + /* + * The generic code assumes that it has two members of unknown type + * owned by the arch-specific code: + * + * insn - copy_insn() saves the original instruction here for + * arch_uprobe_analyze_insn(). + * + * ixol - potentially modified instruction to execute out of + * line, copied to xol_area by xol_get_insn_slot(). + */ struct arch_uprobe arch; }; -struct return_instance { - struct uprobe *uprobe; - unsigned long func; - unsigned long orig_ret_vaddr; /* original return address */ - bool chained; /* true, if instance is nested */ +struct delayed_uprobe { + struct list_head list; + struct uprobe *uprobe; + struct mm_struct *mm; +}; + +static DEFINE_MUTEX(delayed_uprobe_lock); +static LIST_HEAD(delayed_uprobe_list); - struct return_instance *next; /* keep as stack */ +/* + * Execute out of line area: anonymous executable mapping installed + * by the probed task to execute the copy of the original instruction + * mangled by set_swbp(). + * + * On a breakpoint hit, thread contests for a slot. It frees the + * slot after singlestep. Currently a fixed number of slots are + * allocated. + */ +struct xol_area { + wait_queue_head_t wq; /* if all slots are busy */ + unsigned long *bitmap; /* 0 = free slot */ + + struct page *page; + /* + * We keep the vma's vm_start rather than a pointer to the vma + * itself. The probed process or a naughty kernel module could make + * the vma go away, and we must handle that reasonably gracefully. + */ + unsigned long vaddr; /* Page(s) of instruction slots */ }; +static void uprobe_warn(struct task_struct *t, const char *msg) +{ + pr_warn("uprobe: %s:%d failed to %s\n", t->comm, t->pid, msg); +} + /* * valid_vma: Verify if the specified vma is an executable vma * Relax restrictions while unregistering: vm_flags might have @@ -94,7 +134,7 @@ struct return_instance { */ static bool valid_vma(struct vm_area_struct *vma, bool is_register) { - vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED; + vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE; if (is_register) flags |= VM_WRITE; @@ -113,65 +153,6 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) } /** - * __replace_page - replace page in vma by new page. - * based on replace_page in mm/ksm.c - * - * @vma: vma that holds the pte pointing to page - * @addr: address the old @page is mapped at - * @page: the cowed page we are replacing by kpage - * @kpage: the modified page we replace page by - * - * Returns 0 on success, -EFAULT on failure. 
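The vaddr_to_offset() helper above (and its offset_to_vaddr() counterpart used throughout the patch) is plain file-offset arithmetic against vm_start and vm_pgoff. A standalone toy of that math, assuming 4K pages and a minimal stand-in vma struct (not the kernel types):

#include <stdio.h>

#define PAGE_SHIFT 12	/* illustrative; 4K pages */

/* Toy model of the two vma fields the helpers use. */
struct vma { unsigned long vm_start; unsigned long vm_pgoff; };

/* Same arithmetic as the kernel's offset_to_vaddr()/vaddr_to_offset(). */
static unsigned long offset_to_vaddr(const struct vma *vma, long long offset)
{
	return vma->vm_start + offset - ((long long)vma->vm_pgoff << PAGE_SHIFT);
}

static long long vaddr_to_offset(const struct vma *vma, unsigned long vaddr)
{
	return ((long long)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
}

int main(void)
{
	struct vma text = { .vm_start = 0x400000, .vm_pgoff = 0 };

	printf("file offset 0x1234 maps at %#lx\n", offset_to_vaddr(&text, 0x1234));
	printf("round trip gives offset %#llx\n", vaddr_to_offset(&text, 0x401234));
	return 0;
}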
- */ -static int __replace_page(struct vm_area_struct *vma, unsigned long addr, - struct page *page, struct page *kpage) -{ - struct mm_struct *mm = vma->vm_mm; - spinlock_t *ptl; - pte_t *ptep; - int err; - /* For mmu_notifiers */ - const unsigned long mmun_start = addr; - const unsigned long mmun_end = addr + PAGE_SIZE; - - /* For try_to_free_swap() and munlock_vma_page() below */ - lock_page(page); - - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - err = -EAGAIN; - ptep = page_check_address(page, mm, addr, &ptl, 0); - if (!ptep) - goto unlock; - - get_page(kpage); - page_add_new_anon_rmap(kpage, vma, addr); - - if (!PageAnon(page)) { - dec_mm_counter(mm, MM_FILEPAGES); - inc_mm_counter(mm, MM_ANONPAGES); - } - - flush_cache_page(vma, addr, pte_pfn(*ptep)); - ptep_clear_flush(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); - - page_remove_rmap(page); - if (!page_mapped(page)) - try_to_free_swap(page); - pte_unmap_unlock(ptep, ptl); - - if (vma->vm_flags & VM_LOCKED) - munlock_vma_page(page); - put_page(page); - - err = 0; - unlock: - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - unlock_page(page); - return err; -} - -/** * is_swbp_insn - check if instruction is breakpoint instruction. * @insn: instruction to be checked. * Default implementation of is_swbp_insn @@ -196,7 +177,7 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn) return is_swbp_insn(insn); } -static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) +void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) { void *kaddr = kmap_atomic(page); memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); @@ -210,7 +191,8 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src kunmap_atomic(kaddr); } -static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode) +static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn, + int nbytes, void *data) { uprobe_opcode_t old_opcode; bool is_swbp; @@ -224,10 +206,10 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * is a trap variant; uprobes always wins over any other (gdb) * breakpoint. */ - copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); + uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); is_swbp = is_swbp_insn(&old_opcode); - if (is_swbp_insn(new_opcode)) { + if (is_swbp_insn(insn)) { if (is_swbp) /* register: already installed? 
*/ return 0; } else { @@ -238,235 +220,805 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t return 1; } +static struct delayed_uprobe * +delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm) +{ + struct delayed_uprobe *du; + + list_for_each_entry(du, &delayed_uprobe_list, list) + if (du->uprobe == uprobe && du->mm == mm) + return du; + return NULL; +} + +static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm) +{ + struct delayed_uprobe *du; + + if (delayed_uprobe_check(uprobe, mm)) + return 0; + + du = kzalloc(sizeof(*du), GFP_KERNEL); + if (!du) + return -ENOMEM; + + du->uprobe = uprobe; + du->mm = mm; + list_add(&du->list, &delayed_uprobe_list); + return 0; +} + +static void delayed_uprobe_delete(struct delayed_uprobe *du) +{ + if (WARN_ON(!du)) + return; + list_del(&du->list); + kfree(du); +} + +static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm) +{ + struct list_head *pos, *q; + struct delayed_uprobe *du; + + if (!uprobe && !mm) + return; + + list_for_each_safe(pos, q, &delayed_uprobe_list) { + du = list_entry(pos, struct delayed_uprobe, list); + + if (uprobe && du->uprobe != uprobe) + continue; + if (mm && du->mm != mm) + continue; + + delayed_uprobe_delete(du); + } +} + +static bool valid_ref_ctr_vma(struct uprobe *uprobe, + struct vm_area_struct *vma) +{ + unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset); + + return uprobe->ref_ctr_offset && + vma->vm_file && + file_inode(vma->vm_file) == uprobe->inode && + (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && + vma->vm_start <= vaddr && + vma->vm_end > vaddr; +} + +static struct vm_area_struct * +find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm) +{ + VMA_ITERATOR(vmi, mm, 0); + struct vm_area_struct *tmp; + + for_each_vma(vmi, tmp) + if (valid_ref_ctr_vma(uprobe, tmp)) + return tmp; + + return NULL; +} + +static int +__update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) +{ + void *kaddr; + struct page *page; + int ret; + short *ptr; + + if (!vaddr || !d) + return -EINVAL; + + ret = get_user_pages_remote(mm, vaddr, 1, + FOLL_WRITE, &page, NULL); + if (unlikely(ret <= 0)) { + /* + * We are asking for 1 page. If get_user_pages_remote() fails, + * it may return 0, in that case we have to return error. + */ + return ret == 0 ? -EBUSY : ret; + } + + kaddr = kmap_atomic(page); + ptr = kaddr + (vaddr & ~PAGE_MASK); + + if (unlikely(*ptr + d < 0)) { + pr_warn("ref_ctr going negative. vaddr: 0x%lx, " + "curr val: %d, delta: %d\n", vaddr, *ptr, d); + ret = -EINVAL; + goto out; + } + + *ptr += d; + ret = 0; +out: + kunmap_atomic(kaddr); + put_page(page); + return ret; +} + +static void update_ref_ctr_warn(struct uprobe *uprobe, + struct mm_struct *mm, short d) +{ + pr_warn("ref_ctr %s failed for inode: 0x%lx offset: " + "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n", + d > 0 ? 
"increment" : "decrement", uprobe->inode->i_ino, + (unsigned long long) uprobe->offset, + (unsigned long long) uprobe->ref_ctr_offset, mm); +} + +static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, + short d) +{ + struct vm_area_struct *rc_vma; + unsigned long rc_vaddr; + int ret = 0; + + rc_vma = find_ref_ctr_vma(uprobe, mm); + + if (rc_vma) { + rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset); + ret = __update_ref_ctr(mm, rc_vaddr, d); + if (ret) + update_ref_ctr_warn(uprobe, mm, d); + + if (d > 0) + return ret; + } + + mutex_lock(&delayed_uprobe_lock); + if (d > 0) + ret = delayed_uprobe_add(uprobe, mm); + else + delayed_uprobe_remove(uprobe, mm); + mutex_unlock(&delayed_uprobe_lock); + + return ret; +} + +static bool orig_page_is_identical(struct vm_area_struct *vma, + unsigned long vaddr, struct page *page, bool *pmd_mappable) +{ + const pgoff_t index = vaddr_to_offset(vma, vaddr) >> PAGE_SHIFT; + struct folio *orig_folio = filemap_get_folio(vma->vm_file->f_mapping, + index); + struct page *orig_page; + bool identical; + + if (IS_ERR(orig_folio)) + return false; + orig_page = folio_file_page(orig_folio, index); + + *pmd_mappable = folio_test_pmd_mappable(orig_folio); + identical = folio_test_uptodate(orig_folio) && + pages_identical(page, orig_page); + folio_put(orig_folio); + return identical; +} + +static int __uprobe_write(struct vm_area_struct *vma, + struct folio_walk *fw, struct folio *folio, + unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, + bool is_register) +{ + const unsigned long vaddr = insn_vaddr & PAGE_MASK; + bool pmd_mappable; + + /* For now, we'll only handle PTE-mapped folios. */ + if (fw->level != FW_LEVEL_PTE) + return -EFAULT; + + /* + * See can_follow_write_pte(): we'd actually prefer a writable PTE here, + * but the VMA might not be writable. + */ + if (!pte_write(fw->pte)) { + if (!PageAnonExclusive(fw->page)) + return -EFAULT; + if (unlikely(userfaultfd_pte_wp(vma, fw->pte))) + return -EFAULT; + /* SOFTDIRTY is handled via pte_mkdirty() below. */ + } + + /* + * We'll temporarily unmap the page and flush the TLB, such that we can + * modify the page atomically. + */ + flush_cache_page(vma, vaddr, pte_pfn(fw->pte)); + fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep); + copy_to_page(fw->page, insn_vaddr, insn, nbytes); + + /* + * When unregistering, we may only zap a PTE if uffd is disabled and + * there are no unexpected folio references ... + */ + if (is_register || userfaultfd_missing(vma) || + (folio_ref_count(folio) != folio_expected_ref_count(folio) + 1)) + goto remap; + + /* + * ... and the mapped page is identical to the original page that + * would get faulted in on next access. + */ + if (!orig_page_is_identical(vma, vaddr, fw->page, &pmd_mappable)) + goto remap; + + dec_mm_counter(vma->vm_mm, MM_ANONPAGES); + folio_remove_rmap_pte(folio, fw->page, vma); + if (!folio_mapped(folio) && folio_test_swapcache(folio) && + folio_trylock(folio)) { + folio_free_swap(folio); + folio_unlock(folio); + } + folio_put(folio); + + return pmd_mappable; +remap: + /* + * Make sure that our copy_to_page() changes become visible before the + * set_pte_at() write. + */ + smp_wmb(); + /* We modified the page. Make sure to mark the PTE dirty. */ + set_pte_at(vma->vm_mm, vaddr, fw->ptep, pte_mkdirty(fw->pte)); + return 0; +} + /* * NOTE: * Expect the breakpoint instruction to be the smallest size instruction for * the architecture. 
If an arch has variable length instruction and the * breakpoint instruction is not of the smallest length instruction * supported by that architecture then we need to modify is_trap_at_addr and - * write_opcode accordingly. This would never be a problem for archs that - * have fixed length instructions. - */ - -/* - * write_opcode - write the opcode at a given virtual address. - * @mm: the probed process address space. - * @vaddr: the virtual address to store the opcode. - * @opcode: opcode to be written at @vaddr. + * uprobe_write_opcode accordingly. This would never be a problem for archs + * that have fixed length instructions. * - * Called with mm->mmap_sem held (for read and with a reference to - * mm). + * uprobe_write_opcode - write the opcode at a given virtual address. + * @auprobe: arch specific probepoint information. + * @vma: the probed virtual memory area. + * @opcode_vaddr: the virtual address to store the opcode. + * @opcode: opcode to be written at @opcode_vaddr. * - * For mm @mm, write the opcode at @vaddr. + * Called with mm->mmap_lock held for write. * Return 0 (success) or a negative errno. */ -static int write_opcode(struct mm_struct *mm, unsigned long vaddr, - uprobe_opcode_t opcode) +int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + const unsigned long opcode_vaddr, uprobe_opcode_t opcode, + bool is_register) { - struct page *old_page, *new_page; - struct vm_area_struct *vma; - int ret; + return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, + verify_opcode, is_register, true /* do_update_ref_ctr */, NULL); +} + +int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, + uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr, + void *data) +{ + const unsigned long vaddr = insn_vaddr & PAGE_MASK; + struct mm_struct *mm = vma->vm_mm; + struct uprobe *uprobe; + int ret, ref_ctr_updated = 0; + unsigned int gup_flags = FOLL_FORCE; + struct mmu_notifier_range range; + struct folio_walk fw; + struct folio *folio; + struct page *page; + + uprobe = container_of(auprobe, struct uprobe, arch); + + if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags))) + return -EINVAL; + + /* + * When registering, we have to break COW to get an exclusive anonymous + * page that we can safely modify. Use FOLL_WRITE to trigger a write + * fault if required. When unregistering, we might be lucky and the + * anon page is already gone. So defer write faults until really + * required. Use FOLL_SPLIT_PMD, because __uprobe_write() + * cannot deal with PMDs yet. + */ + if (is_register) + gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD; retry: - /* Read the page with vaddr into memory */ - ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); + ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &page, NULL); if (ret <= 0) - return ret; + goto out; + folio = page_folio(page); - ret = verify_opcode(old_page, vaddr, &opcode); - if (ret <= 0) - goto put_old; + ret = verify(page, insn_vaddr, insn, nbytes, data); + if (ret <= 0) { + folio_put(folio); + goto out; + } - ret = -ENOMEM; - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); - if (!new_page) - goto put_old; + /* We are going to replace instruction, update ref_ctr. */ + if (do_update_ref_ctr && !ref_ctr_updated && uprobe->ref_ctr_offset) { + ret = update_ref_ctr(uprobe, mm, is_register ? 
1 : -1); + if (ret) { + folio_put(folio); + goto out; + } - __SetPageUptodate(new_page); + ref_ctr_updated = 1; + } - copy_highpage(new_page, old_page); - copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + ret = 0; + if (unlikely(!folio_test_anon(folio) || folio_is_zone_device(folio))) { + VM_WARN_ON_ONCE(is_register); + folio_put(folio); + goto out; + } - ret = anon_vma_prepare(vma); - if (ret) - goto put_new; + if (!is_register) { + /* + * In the common case, we'll be able to zap the page when + * unregistering. So trigger MMU notifiers now, as we won't + * be able to do it under PTL. + */ + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, + vaddr, vaddr + PAGE_SIZE); + mmu_notifier_invalidate_range_start(&range); + } - ret = __replace_page(vma, vaddr, old_page, new_page); + ret = -EAGAIN; + /* Walk the page tables again, to perform the actual update. */ + if (folio_walk_start(&fw, vma, vaddr, 0)) { + if (fw.page == page) + ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes, is_register); + folio_walk_end(&fw, vma); + } -put_new: - page_cache_release(new_page); -put_old: - put_page(old_page); + if (!is_register) + mmu_notifier_invalidate_range_end(&range); - if (unlikely(ret == -EAGAIN)) + folio_put(folio); + switch (ret) { + case -EFAULT: + gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD; + fallthrough; + case -EAGAIN: goto retry; - return ret; + default: + break; + } + +out: + /* Revert back reference counter if instruction update failed. */ + if (do_update_ref_ctr && ret < 0 && ref_ctr_updated) + update_ref_ctr(uprobe, mm, is_register ? -1 : 1); + + /* try collapse pmd for compound page */ + if (ret > 0) + collapse_pte_mapped_thp(mm, vaddr, false); + + return ret < 0 ? ret : 0; } /** * set_swbp - store breakpoint at a given address. * @auprobe: arch specific probepoint information. - * @mm: the probed process address space. + * @vma: the probed virtual memory area. * @vaddr: the virtual address to insert the opcode. * * For mm @mm, store the breakpoint instruction at @vaddr. * Return 0 (success) or a negative errno. */ -int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) +int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) { - return write_opcode(mm, vaddr, UPROBE_SWBP_INSN); + return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, true); } /** * set_orig_insn - Restore the original instruction. - * @mm: the probed process address space. + * @vma: the probed virtual memory area. * @auprobe: arch specific probepoint information. * @vaddr: the virtual address to insert the opcode. * * For mm @mm, restore the original opcode (opcode) at @vaddr. * Return 0 (success) or a negative errno. */ -int __weak -set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) +int __weak set_orig_insn(struct arch_uprobe *auprobe, + struct vm_area_struct *vma, unsigned long vaddr) +{ + return uprobe_write_opcode(auprobe, vma, vaddr, + *(uprobe_opcode_t *)&auprobe->insn, false); +} + +/* uprobe should have guaranteed positive refcount */ +static struct uprobe *get_uprobe(struct uprobe *uprobe) +{ + refcount_inc(&uprobe->ref); + return uprobe; +} + +/* + * uprobe should have guaranteed lifetime, which can be either of: + * - caller already has refcount taken (and wants an extra one); + * - uprobe is RCU protected and won't be freed until after grace period; + * - we are holding uprobes_treelock (for read or write, doesn't matter). 
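The zap-vs-remap decision in __uprobe_write() earlier hinges on whether the anonymous COW page, after the write, is byte-identical to the file page a future fault would bring back in (orig_page_is_identical()). A toy illustration of that check, with memcmp() standing in for pages_identical() and can_zap() as a hypothetical helper name:

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

/* After restoring the original instruction, the anon page may only be
 * zapped if it matches the file page byte for byte. */
static int can_zap(const unsigned char *anon_page, const unsigned char *file_page)
{
	return memcmp(anon_page, file_page, PAGE_SIZE) == 0;
}

int main(void)
{
	static unsigned char anon[PAGE_SIZE], file[PAGE_SIZE];

	anon[100] = 0xcc;		/* breakpoint still present: keep the COW page */
	printf("zap after partial restore? %d\n", can_zap(anon, file));
	anon[100] = file[100];		/* original byte written back */
	printf("zap after full restore?    %d\n", can_zap(anon, file));
	return 0;
}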
+ */ +static struct uprobe *try_get_uprobe(struct uprobe *uprobe) +{ + if (refcount_inc_not_zero(&uprobe->ref)) + return uprobe; + return NULL; +} + +static inline bool uprobe_is_active(struct uprobe *uprobe) +{ + return !RB_EMPTY_NODE(&uprobe->rb_node); +} + +static void uprobe_free_rcu_tasks_trace(struct rcu_head *rcu) +{ + struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); + + kfree(uprobe); +} + +static void uprobe_free_srcu(struct rcu_head *rcu) +{ + struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); + + call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu_tasks_trace); +} + +static void uprobe_free_deferred(struct work_struct *work) +{ + struct uprobe *uprobe = container_of(work, struct uprobe, work); + + write_lock(&uprobes_treelock); + + if (uprobe_is_active(uprobe)) { + write_seqcount_begin(&uprobes_seqcount); + rb_erase(&uprobe->rb_node, &uprobes_tree); + write_seqcount_end(&uprobes_seqcount); + } + + write_unlock(&uprobes_treelock); + + /* + * If application munmap(exec_vma) before uprobe_unregister() + * gets called, we don't get a chance to remove uprobe from + * delayed_uprobe_list from remove_breakpoint(). Do it here. + */ + mutex_lock(&delayed_uprobe_lock); + delayed_uprobe_remove(uprobe, NULL); + mutex_unlock(&delayed_uprobe_lock); + + /* start srcu -> rcu_tasks_trace -> kfree chain */ + call_srcu(&uretprobes_srcu, &uprobe->rcu, uprobe_free_srcu); +} + +static void put_uprobe(struct uprobe *uprobe) +{ + if (!refcount_dec_and_test(&uprobe->ref)) + return; + + INIT_WORK(&uprobe->work, uprobe_free_deferred); + schedule_work(&uprobe->work); +} + +/* Initialize hprobe as SRCU-protected "leased" uprobe */ +static void hprobe_init_leased(struct hprobe *hprobe, struct uprobe *uprobe, int srcu_idx) +{ + WARN_ON(!uprobe); + hprobe->state = HPROBE_LEASED; + hprobe->uprobe = uprobe; + hprobe->srcu_idx = srcu_idx; +} + +/* Initialize hprobe as refcounted ("stable") uprobe (uprobe can be NULL). */ +static void hprobe_init_stable(struct hprobe *hprobe, struct uprobe *uprobe) +{ + hprobe->state = uprobe ? HPROBE_STABLE : HPROBE_GONE; + hprobe->uprobe = uprobe; + hprobe->srcu_idx = -1; +} + +/* + * hprobe_consume() fetches hprobe's underlying uprobe and detects whether + * uprobe is SRCU protected or is refcounted. hprobe_consume() can be + * used only once for a given hprobe. + * + * Caller has to call hprobe_finalize() and pass previous hprobe_state, so + * that hprobe_finalize() can perform SRCU unlock or put uprobe, whichever + * is appropriate. + */ +static inline struct uprobe *hprobe_consume(struct hprobe *hprobe, enum hprobe_state *hstate) +{ + *hstate = xchg(&hprobe->state, HPROBE_CONSUMED); + switch (*hstate) { + case HPROBE_LEASED: + case HPROBE_STABLE: + return hprobe->uprobe; + case HPROBE_GONE: /* uprobe is NULL, no SRCU */ + case HPROBE_CONSUMED: /* uprobe was finalized already, do nothing */ + return NULL; + default: + WARN(1, "hprobe invalid state %d", *hstate); + return NULL; + } +} + +/* + * Reset hprobe state and, if hprobe was LEASED, release SRCU lock. + * hprobe_finalize() can only be used from current context after + * hprobe_consume() call (which determines uprobe and hstate value). 
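Note that put_uprobe() above never frees inline: the final refcount_dec_and_test() only schedules uprobe_free_deferred(), which in turn starts the SRCU -> RCU-tasks-trace -> kfree chain. A userspace toy of the same "last reference hands off to deferred teardown" shape using C11 atomics (obj/obj_get/obj_put are illustrative names, the deferral is reduced to a printf):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	atomic_int ref;
};

static struct obj *obj_get(struct obj *o)
{
	atomic_fetch_add(&o->ref, 1);
	return o;
}

static void obj_put(struct obj *o)
{
	if (atomic_fetch_sub(&o->ref, 1) != 1)
		return;
	/* last reference: the kernel schedules uprobe_free_deferred() here */
	printf("last ref gone, deferring teardown\n");
	free(o);
}

int main(void)
{
	struct obj *o = calloc(1, sizeof(*o));

	atomic_init(&o->ref, 1);
	obj_get(o);	/* second user */
	obj_put(o);	/* not last */
	obj_put(o);	/* last: frees */
	return 0;
}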
+ */ +static void hprobe_finalize(struct hprobe *hprobe, enum hprobe_state hstate) +{ + switch (hstate) { + case HPROBE_LEASED: + __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx); + break; + case HPROBE_STABLE: + put_uprobe(hprobe->uprobe); + break; + case HPROBE_GONE: + case HPROBE_CONSUMED: + break; + default: + WARN(1, "hprobe invalid state %d", hstate); + break; + } +} + +/* + * Attempt to switch (atomically) uprobe from being SRCU protected (LEASED) + * to refcounted (STABLE) state. Competes with hprobe_consume(); only one of + * them can win the race to perform SRCU unlocking. Whoever wins must perform + * SRCU unlock. + * + * Returns underlying valid uprobe or NULL, if there was no underlying uprobe + * to begin with or we failed to bump its refcount and it's going away. + * + * Returned non-NULL uprobe can be still safely used within an ongoing SRCU + * locked region. If `get` is true, it's guaranteed that non-NULL uprobe has + * an extra refcount for caller to assume and use. Otherwise, it's not + * guaranteed that returned uprobe has a positive refcount, so caller has to + * attempt try_get_uprobe(), if it needs to preserve uprobe beyond current + * SRCU lock region. See dup_utask(). + */ +static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get) { - return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); + enum hprobe_state hstate; + + /* + * Caller should guarantee that return_instance is not going to be + * freed from under us. This can be achieved either through holding + * rcu_read_lock() or by owning return_instance in the first place. + * + * Underlying uprobe is itself protected from reuse by SRCU, so ensure + * SRCU lock is held properly. + */ + lockdep_assert(srcu_read_lock_held(&uretprobes_srcu)); + + hstate = READ_ONCE(hprobe->state); + switch (hstate) { + case HPROBE_STABLE: + /* uprobe has positive refcount, bump refcount, if necessary */ + return get ? get_uprobe(hprobe->uprobe) : hprobe->uprobe; + case HPROBE_GONE: + /* + * SRCU was unlocked earlier and we didn't manage to take + * uprobe refcnt, so it's effectively NULL + */ + return NULL; + case HPROBE_CONSUMED: + /* + * uprobe was consumed, so it's effectively NULL as far as + * uretprobe processing logic is concerned + */ + return NULL; + case HPROBE_LEASED: { + struct uprobe *uprobe = try_get_uprobe(hprobe->uprobe); + /* + * Try to switch hprobe state, guarding against + * hprobe_consume() or another hprobe_expire() racing with us. + * Note, if we failed to get uprobe refcount, we use special + * HPROBE_GONE state to signal that hprobe->uprobe shouldn't + * be used as it will be freed after SRCU is unlocked. + */ + if (try_cmpxchg(&hprobe->state, &hstate, uprobe ? HPROBE_STABLE : HPROBE_GONE)) { + /* We won the race, we are the ones to unlock SRCU */ + __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx); + return get ? get_uprobe(uprobe) : uprobe; + } + + /* + * We lost the race, undo refcount bump (if it ever happened), + * unless caller would like an extra refcount anyways. + */ + if (uprobe && !get) + put_uprobe(uprobe); + /* + * Even if hprobe_consume() or another hprobe_expire() wins + * the state update race and unlocks SRCU from under us, we + * still have a guarantee that underyling uprobe won't be + * freed due to ongoing caller's SRCU lock region, so we can + * return it regardless. Also, if `get` was true, we also have + * an extra ref for the caller to own. This is used in dup_utask(). 
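The heart of hprobe_expire() is a single try_cmpxchg() on hprobe->state: of the callers racing on a LEASED hprobe, exactly one wins the transition and therefore owns the SRCU unlock. A compilable toy of that one-winner transition (the enum values and expire() are illustrative, not the kernel API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum state { LEASED, STABLE, GONE, CONSUMED };

static _Atomic enum state st = LEASED;

/* Only one caller can move the state away from LEASED; the winner of
 * the compare-and-exchange owns the SRCU unlock in the real code. */
static bool expire(bool got_ref)
{
	enum state expected = LEASED;

	return atomic_compare_exchange_strong(&st, &expected,
					      got_ref ? STABLE : GONE);
}

int main(void)
{
	printf("first caller wins:   %d\n", expire(true));
	printf("second caller loses: %d\n", expire(true));
	return 0;
}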
+ */ + return uprobe; + } + default: + WARN(1, "unknown hprobe state %d", hstate); + return NULL; + } } -static int match_uprobe(struct uprobe *l, struct uprobe *r) +static __always_inline +int uprobe_cmp(const struct inode *l_inode, const loff_t l_offset, + const struct uprobe *r) { - if (l->inode < r->inode) + if (l_inode < r->inode) return -1; - if (l->inode > r->inode) + if (l_inode > r->inode) return 1; - if (l->offset < r->offset) + if (l_offset < r->offset) return -1; - if (l->offset > r->offset) + if (l_offset > r->offset) return 1; return 0; } -static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) -{ - struct uprobe u = { .inode = inode, .offset = offset }; - struct rb_node *n = uprobes_tree.rb_node; - struct uprobe *uprobe; - int match; +#define __node_2_uprobe(node) \ + rb_entry((node), struct uprobe, rb_node) - while (n) { - uprobe = rb_entry(n, struct uprobe, rb_node); - match = match_uprobe(&u, uprobe); - if (!match) { - atomic_inc(&uprobe->ref); - return uprobe; - } +struct __uprobe_key { + struct inode *inode; + loff_t offset; +}; - if (match < 0) - n = n->rb_left; - else - n = n->rb_right; - } - return NULL; +static inline int __uprobe_cmp_key(const void *key, const struct rb_node *b) +{ + const struct __uprobe_key *a = key; + return uprobe_cmp(a->inode, a->offset, __node_2_uprobe(b)); +} + +static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b) +{ + struct uprobe *u = __node_2_uprobe(a); + return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b)); } /* - * Find a uprobe corresponding to a given inode:offset - * Acquires uprobes_treelock + * Assumes being inside RCU protected region. + * No refcount is taken on returned uprobe. */ -static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) +static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset) { - struct uprobe *uprobe; + struct __uprobe_key key = { + .inode = inode, + .offset = offset, + }; + struct rb_node *node; + unsigned int seq; - spin_lock(&uprobes_treelock); - uprobe = __find_uprobe(inode, offset); - spin_unlock(&uprobes_treelock); + lockdep_assert(rcu_read_lock_trace_held()); - return uprobe; + do { + seq = read_seqcount_begin(&uprobes_seqcount); + node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key); + /* + * Lockless RB-tree lookups can result only in false negatives. + * If the element is found, it is correct and can be returned + * under RCU protection. If we find nothing, we need to + * validate that seqcount didn't change. If it did, we have to + * try again as we might have missed the element (false + * negative). If seqcount is unchanged, search truly failed. + */ + if (node) + return __node_2_uprobe(node); + } while (read_seqcount_retry(&uprobes_seqcount, seq)); + + return NULL; } +/* + * Attempt to insert a new uprobe into uprobes_tree. + * + * If uprobe already exists (for given inode+offset), we just increment + * refcount of previously existing uprobe. + * + * If not, a provided new instance of uprobe is inserted into the tree (with + * assumed initial refcount == 1). + * + * In any case, we return a uprobe instance that ends up being in uprobes_tree. + * Caller has to clean up new uprobe instance, if it ended up not being + * inserted into the tree. + * + * We assume that uprobes_treelock is held for writing. 
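find_uprobe_rcu() above turns a lockless rb_find_rcu() miss into a trustworthy "not found" by rechecking uprobes_seqcount: concurrent writers can only cause false negatives, so a miss is retried until the sequence is stable. A minimal userspace sketch of the same even/odd seqcount read-retry protocol, using C11 atomics rather than the kernel seqcount API:

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint seq;	/* odd while a writer is mid-update */
static int value;

static void writer_update(int v)
{
	atomic_fetch_add_explicit(&seq, 1, memory_order_acq_rel);	/* -> odd */
	value = v;
	atomic_fetch_add_explicit(&seq, 1, memory_order_acq_rel);	/* -> even */
}

/* Read-retry loop shaped like find_uprobe_rcu(): a value observed under
 * a stable, even sequence is trustworthy; anything else retries. */
static int reader_read(void)
{
	unsigned int s;
	int v;

	do {
		s = atomic_load_explicit(&seq, memory_order_acquire);
		v = value;
	} while ((s & 1) ||
		 atomic_load_explicit(&seq, memory_order_acquire) != s);
	return v;
}

int main(void)
{
	writer_update(42);
	printf("read %d\n", reader_read());
	return 0;
}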
+ */ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) { - struct rb_node **p = &uprobes_tree.rb_node; - struct rb_node *parent = NULL; - struct uprobe *u; - int match; - - while (*p) { - parent = *p; - u = rb_entry(parent, struct uprobe, rb_node); - match = match_uprobe(uprobe, u); - if (!match) { - atomic_inc(&u->ref); - return u; - } + struct rb_node *node; +again: + node = rb_find_add_rcu(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp); + if (node) { + struct uprobe *u = __node_2_uprobe(node); - if (match < 0) - p = &parent->rb_left; - else - p = &parent->rb_right; + if (!try_get_uprobe(u)) { + rb_erase(node, &uprobes_tree); + RB_CLEAR_NODE(&u->rb_node); + goto again; + } + return u; } - u = NULL; - rb_link_node(&uprobe->rb_node, parent, p); - rb_insert_color(&uprobe->rb_node, &uprobes_tree); - /* get access + creation ref */ - atomic_set(&uprobe->ref, 2); - - return u; + return uprobe; } /* - * Acquire uprobes_treelock. - * Matching uprobe already exists in rbtree; - * increment (access refcount) and return the matching uprobe. - * - * No matching uprobe; insert the uprobe in rb_tree; - * get a double refcount (access + creation) and return NULL. + * Acquire uprobes_treelock and insert uprobe into uprobes_tree + * (or reuse existing one, see __insert_uprobe() comments above). */ static struct uprobe *insert_uprobe(struct uprobe *uprobe) { struct uprobe *u; - spin_lock(&uprobes_treelock); + write_lock(&uprobes_treelock); + write_seqcount_begin(&uprobes_seqcount); u = __insert_uprobe(uprobe); - spin_unlock(&uprobes_treelock); + write_seqcount_end(&uprobes_seqcount); + write_unlock(&uprobes_treelock); return u; } -static void put_uprobe(struct uprobe *uprobe) +static void +ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe) { - if (atomic_dec_and_test(&uprobe->ref)) - kfree(uprobe); + pr_warn("ref_ctr_offset mismatch. 
inode: 0x%lx offset: 0x%llx " + "ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n", + uprobe->inode->i_ino, (unsigned long long) uprobe->offset, + (unsigned long long) cur_uprobe->ref_ctr_offset, + (unsigned long long) uprobe->ref_ctr_offset); } -static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) +static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, + loff_t ref_ctr_offset) { struct uprobe *uprobe, *cur_uprobe; uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL); if (!uprobe) - return NULL; + return ERR_PTR(-ENOMEM); - uprobe->inode = igrab(inode); + uprobe->inode = inode; uprobe->offset = offset; + uprobe->ref_ctr_offset = ref_ctr_offset; + INIT_LIST_HEAD(&uprobe->consumers); init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); - /* For now assume that the instruction need not be single-stepped */ - __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); + RB_CLEAR_NODE(&uprobe->rb_node); + refcount_set(&uprobe->ref, 1); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); - /* a uprobe exists for this inode:offset combination */ - if (cur_uprobe) { + if (cur_uprobe != uprobe) { + if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) { + ref_ctr_mismatch_warn(cur_uprobe, uprobe); + put_uprobe(cur_uprobe); + kfree(uprobe); + return ERR_PTR(-EINVAL); + } kfree(uprobe); uprobe = cur_uprobe; - iput(inode); } return uprobe; @@ -474,81 +1026,71 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) { + static atomic64_t id; + down_write(&uprobe->consumer_rwsem); - uc->next = uprobe->consumers; - uprobe->consumers = uc; + list_add_rcu(&uc->cons_node, &uprobe->consumers); + uc->id = (__u64) atomic64_inc_return(&id); up_write(&uprobe->consumer_rwsem); } /* * For uprobe @uprobe, delete the consumer @uc. - * Return true if the @uc is deleted successfully - * or return false. + * Should never be called with consumer that's not part of @uprobe->consumers. */ -static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) +static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) { - struct uprobe_consumer **con; - bool ret = false; - down_write(&uprobe->consumer_rwsem); - for (con = &uprobe->consumers; *con; con = &(*con)->next) { - if (*con == uc) { - *con = uc->next; - ret = true; - break; - } - } + list_del_rcu(&uc->cons_node); up_write(&uprobe->consumer_rwsem); - - return ret; } -static int -__copy_insn(struct address_space *mapping, struct file *filp, char *insn, - unsigned long nbytes, loff_t offset) +static int __copy_insn(struct address_space *mapping, struct file *filp, + void *insn, int nbytes, loff_t offset) { struct page *page; - - if (!mapping->a_ops->readpage) - return -EIO; /* - * Ensure that the page that has the original instruction is - * populated and in page-cache. + * Ensure that the page that has the original instruction is populated + * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(), + * see uprobe_register(). 
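alloc_uprobe() above allocates optimistically, then lets __insert_uprobe() either adopt the new node or hand back the already-present one with an extra reference, after which the losing candidate is simply kfree()d. The same insert-or-reuse shape in miniature, on an illustrative intrusive list instead of an rbtree:

#include <stdio.h>
#include <stdlib.h>

struct node { long key; int ref; struct node *next; };
static struct node *head;

/* Insert 'n' keyed by n->key, or return the existing node with an
 * extra reference; the caller frees 'n' if it wasn't adopted. */
static struct node *insert_or_reuse(struct node *n)
{
	for (struct node *cur = head; cur; cur = cur->next)
		if (cur->key == n->key) {
			cur->ref++;
			return cur;
		}
	n->next = head;
	head = n;
	return n;
}

int main(void)
{
	struct node *a = calloc(1, sizeof(*a)), *b = calloc(1, sizeof(*b));
	struct node *got;

	a->key = b->key = 42;
	a->ref = b->ref = 1;
	insert_or_reuse(a);
	got = insert_or_reuse(b);
	if (got != b)
		free(b);	/* lost the race: reuse 'got', drop our candidate */
	printf("ref on winner: %d\n", got->ref);
	return 0;
}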
*/ - page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp); + if (mapping->a_ops->read_folio) + page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp); + else + page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT); if (IS_ERR(page)) return PTR_ERR(page); - copy_from_page(page, offset, insn, nbytes); - page_cache_release(page); + uprobe_copy_from_page(page, offset, insn, nbytes); + put_page(page); return 0; } static int copy_insn(struct uprobe *uprobe, struct file *filp) { - struct address_space *mapping; - unsigned long nbytes; - int bytes; - - nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK); - mapping = uprobe->inode->i_mapping; + struct address_space *mapping = uprobe->inode->i_mapping; + loff_t offs = uprobe->offset; + void *insn = &uprobe->arch.insn; + int size = sizeof(uprobe->arch.insn); + int len, err = -EIO; - /* Instruction at end of binary; copy only available bytes */ - if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size) - bytes = uprobe->inode->i_size - uprobe->offset; - else - bytes = MAX_UINSN_BYTES; + /* Copy only available bytes, -EIO if nothing was read */ + do { + if (offs >= i_size_read(uprobe->inode)) + break; - /* Instruction at the page-boundary; copy bytes in second page */ - if (nbytes < bytes) { - int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes, - bytes - nbytes, uprobe->offset + nbytes); + len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK)); + err = __copy_insn(mapping, filp, insn, len, offs); if (err) - return err; - bytes = nbytes; - } - return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); + break; + + insn += len; + offs += len; + size -= len; + } while (size); + + return err; } static int prepare_uprobe(struct uprobe *uprobe, struct file *file, @@ -569,18 +1111,14 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, goto out; ret = -ENOTSUPP; - if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn)) + if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn)) goto out; ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); if (ret) goto out; - /* write_opcode() assumes we don't cross page boundary */ - BUG_ON((uprobe->offset & ~PAGE_MASK) + - UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); - - smp_wmb(); /* pairs with rmb() in find_active_uprobe() */ + smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */ set_bit(UPROBE_COPY_INSN, &uprobe->flags); out: @@ -589,21 +1127,19 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, return ret; } -static inline bool consumer_filter(struct uprobe_consumer *uc, - enum uprobe_filter_ctx ctx, struct mm_struct *mm) +static inline bool consumer_filter(struct uprobe_consumer *uc, struct mm_struct *mm) { - return !uc->filter || uc->filter(uc, ctx, mm); + return !uc->filter || uc->filter(uc, mm); } -static bool filter_chain(struct uprobe *uprobe, - enum uprobe_filter_ctx ctx, struct mm_struct *mm) +static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm) { struct uprobe_consumer *uc; bool ret = false; down_read(&uprobe->consumer_rwsem); - for (uc = uprobe->consumers; uc; uc = uc->next) { - ret = consumer_filter(uc, ctx, mm); + list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { + ret = consumer_filter(uc, mm); if (ret) break; } @@ -612,10 +1148,10 @@ static bool filter_chain(struct uprobe *uprobe, return ret; } -static int -install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long vaddr) +static int 
install_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, + unsigned long vaddr) { + struct mm_struct *mm = vma->vm_mm; bool first_uprobe; int ret; @@ -627,46 +1163,26 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(), * the task can hit this breakpoint right after __replace_page(). */ - first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags); + first_uprobe = !mm_flags_test(MMF_HAS_UPROBES, mm); if (first_uprobe) - set_bit(MMF_HAS_UPROBES, &mm->flags); + mm_flags_set(MMF_HAS_UPROBES, mm); - ret = set_swbp(&uprobe->arch, mm, vaddr); + ret = set_swbp(&uprobe->arch, vma, vaddr); if (!ret) - clear_bit(MMF_RECALC_UPROBES, &mm->flags); + mm_flags_clear(MMF_RECALC_UPROBES, mm); else if (first_uprobe) - clear_bit(MMF_HAS_UPROBES, &mm->flags); + mm_flags_clear(MMF_HAS_UPROBES, mm); return ret; } -static int -remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) -{ - set_bit(MMF_RECALC_UPROBES, &mm->flags); - return set_orig_insn(&uprobe->arch, mm, vaddr); -} - -static inline bool uprobe_is_active(struct uprobe *uprobe) -{ - return !RB_EMPTY_NODE(&uprobe->rb_node); -} -/* - * There could be threads that have already hit the breakpoint. They - * will recheck the current insn and restart if find_uprobe() fails. - * See find_active_uprobe(). - */ -static void delete_uprobe(struct uprobe *uprobe) +static int remove_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, + unsigned long vaddr) { - if (WARN_ON(!uprobe_is_active(uprobe))) - return; + struct mm_struct *mm = vma->vm_mm; - spin_lock(&uprobes_treelock); - rb_erase(&uprobe->rb_node, &uprobes_tree); - spin_unlock(&uprobes_treelock); - RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */ - iput(uprobe->inode); - put_uprobe(uprobe); + mm_flags_set(MMF_RECALC_UPROBES, mm); + return set_orig_insn(&uprobe->arch, vma, vaddr); } struct map_info { @@ -693,18 +1209,18 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) int more = 0; again: - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) continue; if (!prev && !more) { /* - * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through + * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through * reclaim. This is optimistic, no harm done if it fails. */ prev = kmalloc(sizeof(struct map_info), - GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); + GFP_NOWAIT | __GFP_NOMEMALLOC); if (prev) prev->next = NULL; } @@ -713,7 +1229,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) continue; } - if (!atomic_inc_not_zero(&vma->vm_mm->mm_users)) + if (!mmget_not_zero(vma->vm_mm)) continue; info = prev; @@ -724,7 +1240,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) info->mm = vma->vm_mm; info->vaddr = offset_to_vaddr(vma, offset); } - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_read(mapping); if (!more) goto out; @@ -773,8 +1289,17 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) if (err && is_register) goto free; + /* + * We take mmap_lock for writing to avoid the race with + * find_active_uprobe_rcu() which takes mmap_lock for reading. + * Thus this install_breakpoint() can not make + * is_trap_at_addr() true right after find_uprobe_rcu() + * returns NULL in find_active_uprobe_rcu(). 
+ */ + mmap_write_lock(mm); + if (check_stable_address_space(mm)) + goto unlock; - down_write(&mm->mmap_sem); vma = find_vma(mm, info->vaddr); if (!vma || !valid_vma(vma, is_register) || file_inode(vma->vm_file) != uprobe->inode) @@ -786,17 +1311,15 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) if (is_register) { /* consult only the "caller", new consumer. */ - if (consumer_filter(new, - UPROBE_FILTER_REGISTER, mm)) - err = install_breakpoint(uprobe, mm, vma, info->vaddr); - } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { - if (!filter_chain(uprobe, - UPROBE_FILTER_UNREGISTER, mm)) - err |= remove_breakpoint(uprobe, mm, info->vaddr); + if (consumer_filter(new, mm)) + err = install_breakpoint(uprobe, vma, info->vaddr); + } else if (mm_flags_test(MMF_HAS_UPROBES, mm)) { + if (!filter_chain(uprobe, mm)) + err |= remove_breakpoint(uprobe, vma, info->vaddr); } unlock: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); free: mmput(mm); info = free_map_info(info); @@ -806,29 +1329,51 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) return err; } -static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc) -{ - consumer_add(uprobe, uc); - return register_for_each_vma(uprobe, uc); -} - -static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) +/** + * uprobe_unregister_nosync - unregister an already registered probe. + * @uprobe: uprobe to remove + * @uc: identify which probe if multiple probes are colocated. + */ +void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc) { int err; - if (!consumer_del(uprobe, uc)) /* WARN? */ - return; - + down_write(&uprobe->register_rwsem); + consumer_del(uprobe, uc); err = register_for_each_vma(uprobe, NULL); + up_write(&uprobe->register_rwsem); + /* TODO : cant unregister? schedule a worker thread */ - if (!uprobe->consumers && !err) - delete_uprobe(uprobe); + if (unlikely(err)) { + uprobe_warn(current, "unregister, leaking uprobe"); + return; + } + + put_uprobe(uprobe); } +EXPORT_SYMBOL_GPL(uprobe_unregister_nosync); -/* +void uprobe_unregister_sync(void) +{ + /* + * Now that handler_chain() and handle_uretprobe_chain() iterate over + * uprobe->consumers list under RCU protection without holding + * uprobe->register_rwsem, we need to wait for RCU grace period to + * make sure that we can't call into just unregistered + * uprobe_consumer's callbacks anymore. If we don't do that, fast and + * unlucky enough caller can free consumer's memory and cause + * handler_chain() or handle_uretprobe_chain() to do an use-after-free. + */ + synchronize_rcu_tasks_trace(); + synchronize_srcu(&uretprobes_srcu); +} +EXPORT_SYMBOL_GPL(uprobe_unregister_sync); + +/** * uprobe_register - register a probe * @inode: the file in which the probe has to be placed. * @offset: offset from the start of the file. + * @ref_ctr_offset: offset of SDT marker / reference counter * @uc: information on howto handle the probe.. * * Apart from the access refcount, uprobe_register() takes a creation @@ -837,105 +1382,99 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u * tuple). Creation refcount stops uprobe_unregister from freeing the * @uprobe even before the register operation is complete. Creation * refcount is released when the last @uc for the @uprobe - * unregisters. + * unregisters. Caller of uprobe_register() is required to keep @inode + * (and the containing mount) referenced. 
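uprobe_unregister_nosync()/uprobe_unregister_sync() above split unpublishing from reclamation: the consumer is unlinked first, then synchronize_rcu_tasks_trace() and synchronize_srcu() guarantee no handler is still mid-call before the consumer's memory may be reused. A sketch of the same unpublish-then-grace-period shape, assuming liburcu's classic urcu.h API (consumer/active/unregister are illustrative names):

#include <urcu.h>	/* userspace RCU (liburcu); link with -lurcu */
#include <stdlib.h>

struct consumer { int id; };
static struct consumer *active;

/* Reader side: handlers run inside an RCU read-side critical section. */
static void run_handler(void)
{
	struct consumer *c;

	rcu_read_lock();
	c = rcu_dereference(active);
	if (c)
		(void)c->id;	/* safe: cannot be freed while we read */
	rcu_read_unlock();
}

/* Writer side, mirroring unregister_nosync() + unregister_sync():
 * unpublish, wait out a grace period, only then free. */
static void unregister(struct consumer *c)
{
	rcu_assign_pointer(active, NULL);
	synchronize_rcu();
	free(c);
}

int main(void)
{
	struct consumer *c = calloc(1, sizeof(*c));

	rcu_register_thread();
	rcu_assign_pointer(active, c);
	run_handler();
	unregister(c);
	rcu_unregister_thread();
	return 0;
}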
* - * Return errno if it cannot successully install probes - * else return 0 (success) + * Return: pointer to the new uprobe on success or an ERR_PTR on failure. */ -int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) +struct uprobe *uprobe_register(struct inode *inode, + loff_t offset, loff_t ref_ctr_offset, + struct uprobe_consumer *uc) { struct uprobe *uprobe; int ret; /* Uprobe must have at least one set consumer */ if (!uc->handler && !uc->ret_handler) - return -EINVAL; + return ERR_PTR(-EINVAL); + /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */ + if (!inode->i_mapping->a_ops->read_folio && + !shmem_mapping(inode->i_mapping)) + return ERR_PTR(-EIO); /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) - return -EINVAL; + return ERR_PTR(-EINVAL); - retry: - uprobe = alloc_uprobe(inode, offset); - if (!uprobe) - return -ENOMEM; /* - * We can race with uprobe_unregister()->delete_uprobe(). - * Check uprobe_is_active() and retry if it is false. + * This ensures that uprobe_copy_from_page(), copy_to_page() and + * __update_ref_ctr() can't cross page boundary. */ + if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) + return ERR_PTR(-EINVAL); + if (!IS_ALIGNED(ref_ctr_offset, sizeof(short))) + return ERR_PTR(-EINVAL); + + uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); + if (IS_ERR(uprobe)) + return uprobe; + down_write(&uprobe->register_rwsem); - ret = -EAGAIN; - if (likely(uprobe_is_active(uprobe))) { - ret = __uprobe_register(uprobe, uc); - if (ret) - __uprobe_unregister(uprobe, uc); - } + consumer_add(uprobe, uc); + ret = register_for_each_vma(uprobe, uc); up_write(&uprobe->register_rwsem); - put_uprobe(uprobe); - if (unlikely(ret == -EAGAIN)) - goto retry; - return ret; + if (ret) { + uprobe_unregister_nosync(uprobe, uc); + /* + * Registration might have partially succeeded, so we can have + * this consumer being called right at this time. We need to + * sync here. It's ok, it's unlikely slow path. + */ + uprobe_unregister_sync(); + return ERR_PTR(ret); + } + + return uprobe; } EXPORT_SYMBOL_GPL(uprobe_register); -/* - * uprobe_apply - unregister a already registered probe. - * @inode: the file in which the probe has to be removed. - * @offset: offset from the start of the file. +/** + * uprobe_apply - add or remove the breakpoints according to @uc->filter + * @uprobe: uprobe which "owns" the breakpoint * @uc: consumer which wants to add more or remove some breakpoints * @add: add or remove the breakpoints + * Return: 0 on success or negative error code. */ -int uprobe_apply(struct inode *inode, loff_t offset, - struct uprobe_consumer *uc, bool add) +int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add) { - struct uprobe *uprobe; struct uprobe_consumer *con; int ret = -ENOENT; - uprobe = find_uprobe(inode, offset); - if (!uprobe) - return ret; - down_write(&uprobe->register_rwsem); - for (con = uprobe->consumers; con && con != uc ; con = con->next) - ; - if (con) - ret = register_for_each_vma(uprobe, add ? uc : NULL); - up_write(&uprobe->register_rwsem); - put_uprobe(uprobe); - - return ret; -} -/* - * uprobe_unregister - unregister a already registered probe. - * @inode: the file in which the probe has to be removed. - * @offset: offset from the start of the file. - * @uc: identify which probe if multiple probes are colocated. 
- */ -void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) -{ - struct uprobe *uprobe; - - uprobe = find_uprobe(inode, offset); - if (!uprobe) - return; + rcu_read_lock_trace(); + list_for_each_entry_rcu(con, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { + if (con == uc) { + ret = register_for_each_vma(uprobe, add ? uc : NULL); + break; + } + } + rcu_read_unlock_trace(); - down_write(&uprobe->register_rwsem); - __uprobe_unregister(uprobe, uc); up_write(&uprobe->register_rwsem); - put_uprobe(uprobe); + + return ret; } -EXPORT_SYMBOL_GPL(uprobe_unregister); static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; int err = 0; - down_read(&mm->mmap_sem); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mmap_write_lock(mm); + for_each_vma(vmi, vma) { unsigned long vaddr; loff_t offset; @@ -949,9 +1488,9 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) continue; vaddr = offset_to_vaddr(vma, uprobe->offset); - err |= remove_breakpoint(uprobe, mm, vaddr); + err |= remove_breakpoint(uprobe, vma, vaddr); } - up_read(&mm->mmap_sem); + mmap_write_unlock(mm); return err; } @@ -997,29 +1536,60 @@ static void build_probe_list(struct inode *inode, min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; - spin_lock(&uprobes_treelock); + read_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); if (n) { for (t = n; t; t = rb_prev(t)) { u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset < min) break; - list_add(&u->pending_list, head); - atomic_inc(&u->ref); + /* if uprobe went away, it's safe to ignore it */ + if (try_get_uprobe(u)) + list_add(&u->pending_list, head); } for (t = n; (t = rb_next(t)); ) { u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset > max) break; - list_add(&u->pending_list, head); - atomic_inc(&u->ref); + /* if uprobe went away, it's safe to ignore it */ + if (try_get_uprobe(u)) + list_add(&u->pending_list, head); } } - spin_unlock(&uprobes_treelock); + read_unlock(&uprobes_treelock); +} + +/* @vma contains reference counter, not the probed instruction. */ +static int delayed_ref_ctr_inc(struct vm_area_struct *vma) +{ + struct list_head *pos, *q; + struct delayed_uprobe *du; + unsigned long vaddr; + int ret = 0, err = 0; + + mutex_lock(&delayed_uprobe_lock); + list_for_each_safe(pos, q, &delayed_uprobe_list) { + du = list_entry(pos, struct delayed_uprobe, list); + + if (du->mm != vma->vm_mm || + !valid_ref_ctr_vma(du->uprobe, vma)) + continue; + + vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset); + ret = __update_ref_ctr(vma->vm_mm, vaddr, 1); + if (ret) { + update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1); + if (!err) + err = ret; + } + delayed_uprobe_delete(du); + } + mutex_unlock(&delayed_uprobe_lock); + return err; } /* - * Called from mmap_region/vma_adjust with mm->mmap_sem acquired. + * Called from mmap_region/vma_merge with mm->mmap_lock acquired. * * Currently we ignore all errors and always return 0, the callers * can't handle the failure anyway. 
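The ref_ctr that delayed_ref_ctr_inc() and __update_ref_ctr() maintain is an SDT-style semaphore: a plain short in the traced process's writable data that the instrumented program tests before doing any probe-argument work. From the traced program's side it looks roughly like this (variable name and layout are illustrative; the kernel performs the increment remotely):

#include <stdio.h>

/* Incremented by uprobe attach, decremented on detach; the program
 * only pays tracing costs while it is non-zero. */
unsigned short my_probe_semaphore;

int main(void)
{
	if (my_probe_semaphore)
		printf("tracer attached: set up probe arguments\n");
	else
		printf("no tracer: skip the slow path\n");
	return 0;
}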
@@ -1030,7 +1600,15 @@ int uprobe_mmap(struct vm_area_struct *vma) struct uprobe *uprobe, *u; struct inode *inode; - if (no_uprobe_events() || !valid_vma(vma, true)) + if (no_uprobe_events()) + return 0; + + if (vma->vm_file && + (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && + mm_flags_test(MMF_HAS_UPROBES, vma->vm_mm)) + delayed_ref_ctr_inc(vma); + + if (!valid_vma(vma, true)) return 0; inode = file_inode(vma->vm_file); @@ -1046,9 +1624,9 @@ int uprobe_mmap(struct vm_area_struct *vma) */ list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { if (!fatal_signal_pending(current) && - filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) { + filter_chain(uprobe, vma->vm_mm)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); - install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); + install_breakpoint(uprobe, vma, vaddr); } put_uprobe(uprobe); } @@ -1069,9 +1647,9 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; - spin_lock(&uprobes_treelock); + read_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); - spin_unlock(&uprobes_treelock); + read_unlock(&uprobes_treelock); return !!n; } @@ -1087,81 +1665,113 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ return; - if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) || - test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags)) + if (!mm_flags_test(MMF_HAS_UPROBES, vma->vm_mm) || + mm_flags_test(MMF_RECALC_UPROBES, vma->vm_mm)) return; if (vma_has_uprobes(vma, start, end)) - set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags); + mm_flags_set(MMF_RECALC_UPROBES, vma->vm_mm); +} + +static vm_fault_t xol_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct xol_area *area = vma->vm_mm->uprobes_state.xol_area; + + vmf->page = area->page; + get_page(vmf->page); + return 0; } +static int xol_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) +{ + return -EPERM; +} + +static const struct vm_special_mapping xol_mapping = { + .name = "[uprobes]", + .fault = xol_fault, + .mremap = xol_mremap, +}; + /* Slot allocation for XOL */ -static int xol_add_vma(struct xol_area *area) +static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) { - struct mm_struct *mm = current->mm; - int ret = -EALREADY; + struct vm_area_struct *vma; + int ret; - down_write(&mm->mmap_sem); - if (mm->uprobes_state.xol_area) - goto fail; + if (mmap_write_lock_killable(mm)) + return -EINTR; - ret = -ENOMEM; - /* Try to map as high as possible, this is only a hint. */ - area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); - if (area->vaddr & ~PAGE_MASK) { - ret = area->vaddr; + if (mm->uprobes_state.xol_area) { + ret = -EALREADY; goto fail; } - ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, - VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page); - if (ret) + if (!area->vaddr) { + /* Try to map as high as possible, this is only a hint. 
*/ + area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, + PAGE_SIZE, 0, 0); + if (IS_ERR_VALUE(area->vaddr)) { + ret = area->vaddr; + goto fail; + } + } + + vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE, + VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO| + VM_SEALED_SYSMAP, + &xol_mapping); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); goto fail; + } - smp_wmb(); /* pairs with get_xol_area() */ - mm->uprobes_state.xol_area = area; ret = 0; + /* pairs with get_xol_area() */ + smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */ fail: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return ret; } -/* - * get_xol_area - Allocate process's xol_area if necessary. - * This area will be used for storing instructions for execution out of line. - * - * Returns the allocated area or NULL. - */ -static struct xol_area *get_xol_area(void) +void * __weak arch_uretprobe_trampoline(unsigned long *psize) +{ + static uprobe_opcode_t insn = UPROBE_SWBP_INSN; + + *psize = UPROBE_SWBP_INSN_SIZE; + return &insn; +} + +static struct xol_area *__create_xol_area(unsigned long vaddr) { struct mm_struct *mm = current->mm; + unsigned long insns_size; struct xol_area *area; - uprobe_opcode_t insn = UPROBE_SWBP_INSN; - - area = mm->uprobes_state.xol_area; - if (area) - goto ret; + void *insns; area = kzalloc(sizeof(*area), GFP_KERNEL); if (unlikely(!area)) goto out; - area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL); + area->bitmap = kcalloc(BITS_TO_LONGS(UINSNS_PER_PAGE), sizeof(long), + GFP_KERNEL); if (!area->bitmap) goto free_area; - area->page = alloc_page(GFP_HIGHUSER); + area->page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); if (!area->page) goto free_bitmap; - /* allocate first slot of task's xol_area for the return probes */ - set_bit(0, area->bitmap); - copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); - atomic_set(&area->slot_count, 1); + area->vaddr = vaddr; init_waitqueue_head(&area->wq); + /* Reserve the 1st slot for get_trampoline_vaddr() */ + set_bit(0, area->bitmap); + insns = arch_uretprobe_trampoline(&insns_size); + arch_uprobe_copy_ixol(area->page, 0, insns, insns_size); - if (!xol_add_vma(area)) + if (!xol_add_vma(mm, area)) return area; __free_page(area->page); @@ -1170,12 +1780,36 @@ static struct xol_area *get_xol_area(void) free_area: kfree(area); out: - area = mm->uprobes_state.xol_area; - ret: - smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ + return NULL; +} + +/* + * get_xol_area - Allocate process's xol_area if necessary. + * This area will be used for storing instructions for execution out of line. + * + * Returns the allocated area or NULL. + */ +static struct xol_area *get_xol_area(void) +{ + struct mm_struct *mm = current->mm; + struct xol_area *area; + + if (!mm->uprobes_state.xol_area) + __create_xol_area(0); + + /* Pairs with xol_add_vma() smp_store_release() */ + area = READ_ONCE(mm->uprobes_state.xol_area); /* ^^^ */ return area; } +void __weak arch_uprobe_clear_state(struct mm_struct *mm) +{ +} + +void __weak arch_uprobe_init_state(struct mm_struct *mm) +{ +} + /* * uprobe_clear_state - Free the area allocated for slots. 
*/ @@ -1183,6 +1817,12 @@ void uprobe_clear_state(struct mm_struct *mm) { struct xol_area *area = mm->uprobes_state.xol_area; + mutex_lock(&delayed_uprobe_lock); + delayed_uprobe_remove(NULL, mm); + mutex_unlock(&delayed_uprobe_lock); + + arch_uprobe_clear_state(mm); + if (!area) return; @@ -1203,105 +1843,79 @@ void uprobe_end_dup_mmap(void) void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) { - newmm->uprobes_state.xol_area = NULL; - - if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) { - set_bit(MMF_HAS_UPROBES, &newmm->flags); + if (mm_flags_test(MMF_HAS_UPROBES, oldmm)) { + mm_flags_set(MMF_HAS_UPROBES, newmm); /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */ - set_bit(MMF_RECALC_UPROBES, &newmm->flags); + mm_flags_set(MMF_RECALC_UPROBES, newmm); } } -/* - * - search for a free slot. - */ -static unsigned long xol_take_insn_slot(struct xol_area *area) +static unsigned long xol_get_slot_nr(struct xol_area *area) { - unsigned long slot_addr; - int slot_nr; + unsigned long slot_nr; - do { - slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE); - if (slot_nr < UINSNS_PER_PAGE) { - if (!test_and_set_bit(slot_nr, area->bitmap)) - break; - - slot_nr = UINSNS_PER_PAGE; - continue; - } - wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE)); - } while (slot_nr >= UINSNS_PER_PAGE); - - slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES); - atomic_inc(&area->slot_count); + slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE); + if (slot_nr < UINSNS_PER_PAGE) { + if (!test_and_set_bit(slot_nr, area->bitmap)) + return slot_nr; + } - return slot_addr; + return UINSNS_PER_PAGE; } /* * xol_get_insn_slot - allocate a slot for xol. - * Returns the allocated slot address or 0. */ -static unsigned long xol_get_insn_slot(struct uprobe *uprobe) +static bool xol_get_insn_slot(struct uprobe *uprobe, struct uprobe_task *utask) { - struct xol_area *area; - unsigned long xol_vaddr; + struct xol_area *area = get_xol_area(); + unsigned long slot_nr; - area = get_xol_area(); if (!area) - return 0; + return false; - xol_vaddr = xol_take_insn_slot(area); - if (unlikely(!xol_vaddr)) - return 0; + wait_event(area->wq, (slot_nr = xol_get_slot_nr(area)) < UINSNS_PER_PAGE); - /* Initialize the slot */ - copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES); - /* - * We probably need flush_icache_user_range() but it needs vma. - * This should work on supported architectures too. - */ - flush_dcache_page(area->page); - - return xol_vaddr; + utask->xol_vaddr = area->vaddr + slot_nr * UPROBE_XOL_SLOT_BYTES; + arch_uprobe_copy_ixol(area->page, utask->xol_vaddr, + &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); + return true; } /* - * xol_free_insn_slot - If slot was earlier allocated by - * @xol_get_insn_slot(), make the slot available for - * subsequent requests. 
+ * xol_free_insn_slot - free the slot allocated by xol_get_insn_slot()
  */
-static void xol_free_insn_slot(struct task_struct *tsk)
+static void xol_free_insn_slot(struct uprobe_task *utask)
 {
- struct xol_area *area;
- unsigned long vma_end;
- unsigned long slot_addr;
+ struct xol_area *area = current->mm->uprobes_state.xol_area;
+ unsigned long offset = utask->xol_vaddr - area->vaddr;
+ unsigned int slot_nr;
 
- if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask)
+ utask->xol_vaddr = 0;
+ /* xol_vaddr must fit into [area->vaddr, area->vaddr + PAGE_SIZE) */
+ if (WARN_ON_ONCE(offset >= PAGE_SIZE))
  return;
 
- slot_addr = tsk->utask->xol_vaddr;
- if (unlikely(!slot_addr))
- return;
-
- area = tsk->mm->uprobes_state.xol_area;
- vma_end = area->vaddr + PAGE_SIZE;
- if (area->vaddr <= slot_addr && slot_addr < vma_end) {
- unsigned long offset;
- int slot_nr;
-
- offset = slot_addr - area->vaddr;
- slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
- if (slot_nr >= UINSNS_PER_PAGE)
- return;
+ slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
+ clear_bit(slot_nr, area->bitmap);
+ smp_mb__after_atomic(); /* pairs with prepare_to_wait() */
+ if (waitqueue_active(&area->wq))
+ wake_up(&area->wq);
+}
 
- clear_bit(slot_nr, area->bitmap);
- atomic_dec(&area->slot_count);
- if (waitqueue_active(&area->wq))
- wake_up(&area->wq);
+void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
+ void *src, unsigned long len)
+{
+ /* Initialize the slot */
+ copy_to_page(page, vaddr, src, len);
 
- tsk->utask->xol_vaddr = 0;
- }
+ /*
+ * We probably need flush_icache_user_page() but it needs vma.
+ * This should work on most architectures by default. If an
+ * architecture needs to do something different it can define
+ * its own version of the function.
+ */
+ flush_dcache_page(page);
 }
 
 /**
@@ -1315,45 +1929,154 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
 	return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
 }
 
+unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ if (unlikely(utask && utask->active_uprobe))
+ return utask->vaddr;
+
+ return instruction_pointer(regs);
+}
+
+static void ri_pool_push(struct uprobe_task *utask, struct return_instance *ri)
+{
+ ri->cons_cnt = 0;
+ ri->next = utask->ri_pool;
+ utask->ri_pool = ri;
+}
+
+static struct return_instance *ri_pool_pop(struct uprobe_task *utask)
+{
+ struct return_instance *ri = utask->ri_pool;
+
+ if (likely(ri))
+ utask->ri_pool = ri->next;
+
+ return ri;
+}
+
+static void ri_free(struct return_instance *ri)
+{
+ kfree(ri->extra_consumers);
+ kfree_rcu(ri, rcu);
+}
+
+static void free_ret_instance(struct uprobe_task *utask,
+ struct return_instance *ri, bool cleanup_hprobe)
+{
+ unsigned seq;
+
+ if (cleanup_hprobe) {
+ enum hprobe_state hstate;
+
+ (void)hprobe_consume(&ri->hprobe, &hstate);
+ hprobe_finalize(&ri->hprobe, hstate);
+ }
+
+ /*
+ * At this point return_instance is unlinked from utask's
+ * return_instances list and this has become visible to ri_timer().
+ * If seqcount now indicates that ri_timer's return instance
+ * processing loop isn't active, we can return ri into the pool of
+ * to-be-reused return instances for future uretprobes. If ri_timer()
+ * happens to be running right now, though, we fall back to safety and
+ * just perform RCU-delayed freeing of ri.
+ * Admittedly, this is a rather simple use of seqcount, but it nicely
+ * abstracts away all the necessary memory barriers, so we use
+ * a well-supported kernel primitive here. 
+ */ + if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) { + /* immediate reuse of ri without RCU GP is OK */ + ri_pool_push(utask, ri); + } else { + /* we might be racing with ri_timer(), so play it safe */ + ri_free(ri); + } +} + /* * Called with no locks held. - * Called in context of a exiting or a exec-ing thread. + * Called in context of an exiting or an exec-ing thread. */ void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; - struct return_instance *ri, *tmp; + struct return_instance *ri, *ri_next; if (!utask) return; - if (utask->active_uprobe) - put_uprobe(utask->active_uprobe); + t->utask = NULL; + WARN_ON_ONCE(utask->active_uprobe || utask->xol_vaddr); + + timer_delete_sync(&utask->ri_timer); ri = utask->return_instances; while (ri) { - tmp = ri; - ri = ri->next; + ri_next = ri->next; + free_ret_instance(utask, ri, true /* cleanup_hprobe */); + ri = ri_next; + } - put_uprobe(tmp->uprobe); - kfree(tmp); + /* free_ret_instance() above might add to ri_pool, so this loop should come last */ + ri = utask->ri_pool; + while (ri) { + ri_next = ri->next; + ri_free(ri); + ri = ri_next; } - xol_free_insn_slot(t); kfree(utask); - t->utask = NULL; } -/* - * Called in context of a new clone/fork from copy_process. - */ -void uprobe_copy_process(struct task_struct *t) +#define RI_TIMER_PERIOD (HZ / 10) /* 100 ms */ + +#define for_each_ret_instance_rcu(pos, head) \ + for (pos = rcu_dereference_raw(head); pos; pos = rcu_dereference_raw(pos->next)) + +static void ri_timer(struct timer_list *timer) { - t->utask = NULL; + struct uprobe_task *utask = container_of(timer, struct uprobe_task, ri_timer); + struct return_instance *ri; + + /* SRCU protects uprobe from reuse for the cmpxchg() inside hprobe_expire(). */ + guard(srcu)(&uretprobes_srcu); + /* RCU protects return_instance from freeing. */ + guard(rcu)(); + + /* + * See free_ret_instance() for notes on seqcount use. + * We also employ raw API variants to avoid lockdep false-positive + * warning complaining about enabled preemption. The timer can only be + * invoked once for a uprobe_task. Therefore there can only be one + * writer. The reader does not require an even sequence count to make + * progress, so it is OK to remain preemptible on PREEMPT_RT. + */ + raw_write_seqcount_begin(&utask->ri_seqcount); + + for_each_ret_instance_rcu(ri, utask->return_instances) + hprobe_expire(&ri->hprobe, false); + + raw_write_seqcount_end(&utask->ri_seqcount); +} + +static struct uprobe_task *alloc_utask(void) +{ + struct uprobe_task *utask; + + utask = kzalloc(sizeof(*utask), GFP_KERNEL); + if (!utask) + return NULL; + + timer_setup(&utask->ri_timer, ri_timer, 0); + seqcount_init(&utask->ri_seqcount); + + return utask; } /* - * Allocate a uprobe_task object for the task if if necessary. + * Allocate a uprobe_task object for the task if necessary. * Called when the thread hits a breakpoint. 
* * Returns: @@ -1363,125 +2086,258 @@ void uprobe_copy_process(struct task_struct *t) static struct uprobe_task *get_utask(void) { if (!current->utask) - current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); + current->utask = alloc_utask(); return current->utask; } +static struct return_instance *alloc_return_instance(struct uprobe_task *utask) +{ + struct return_instance *ri; + + ri = ri_pool_pop(utask); + if (ri) + return ri; + + ri = kzalloc(sizeof(*ri), GFP_KERNEL); + if (!ri) + return ZERO_SIZE_PTR; + + return ri; +} + +static struct return_instance *dup_return_instance(struct return_instance *old) +{ + struct return_instance *ri; + + ri = kmemdup(old, sizeof(*ri), GFP_KERNEL); + if (!ri) + return NULL; + + if (unlikely(old->cons_cnt > 1)) { + ri->extra_consumers = kmemdup(old->extra_consumers, + sizeof(ri->extra_consumers[0]) * (old->cons_cnt - 1), + GFP_KERNEL); + if (!ri->extra_consumers) { + kfree(ri); + return NULL; + } + } + + return ri; +} + +static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) +{ + struct uprobe_task *n_utask; + struct return_instance **p, *o, *n; + struct uprobe *uprobe; + + n_utask = alloc_utask(); + if (!n_utask) + return -ENOMEM; + t->utask = n_utask; + + /* protect uprobes from freeing, we'll need try_get_uprobe() them */ + guard(srcu)(&uretprobes_srcu); + + p = &n_utask->return_instances; + for (o = o_utask->return_instances; o; o = o->next) { + n = dup_return_instance(o); + if (!n) + return -ENOMEM; + + /* if uprobe is non-NULL, we'll have an extra refcount for uprobe */ + uprobe = hprobe_expire(&o->hprobe, true); + + /* + * New utask will have stable properly refcounted uprobe or + * NULL. Even if we failed to get refcounted uprobe, we still + * need to preserve full set of return_instances for proper + * uretprobe handling and nesting in forked task. + */ + hprobe_init_stable(&n->hprobe, uprobe); + + n->next = NULL; + rcu_assign_pointer(*p, n); + p = &n->next; + + n_utask->depth++; + } + + return 0; +} + +static void dup_xol_work(struct callback_head *work) +{ + if (current->flags & PF_EXITING) + return; + + if (!__create_xol_area(current->utask->dup_xol_addr) && + !fatal_signal_pending(current)) + uprobe_warn(current, "dup xol area"); +} + +/* + * Called in context of a new clone/fork from copy_process. + */ +void uprobe_copy_process(struct task_struct *t, u64 flags) +{ + struct uprobe_task *utask = current->utask; + struct mm_struct *mm = current->mm; + struct xol_area *area; + + t->utask = NULL; + + if (!utask || !utask->return_instances) + return; + + if (mm == t->mm && !(flags & CLONE_VFORK)) + return; + + if (dup_utask(t, utask)) + return uprobe_warn(t, "dup ret instances"); + + /* The task can fork() after dup_xol_work() fails */ + area = mm->uprobes_state.xol_area; + if (!area) + return uprobe_warn(t, "dup xol area"); + + if (mm == t->mm) + return; + + t->utask->dup_xol_addr = area->vaddr; + init_task_work(&t->utask->dup_xol_work, dup_xol_work); + task_work_add(t, &t->utask->dup_xol_work, TWA_RESUME); +} + /* * Current area->vaddr notion assume the trampoline address is always * equal area->vaddr. * * Returns -1 in case the xol_area is not allocated. 
*/ -static unsigned long get_trampoline_vaddr(void) +unsigned long uprobe_get_trampoline_vaddr(void) { + unsigned long trampoline_vaddr = UPROBE_NO_TRAMPOLINE_VADDR; struct xol_area *area; - unsigned long trampoline_vaddr = -1; - area = current->mm->uprobes_state.xol_area; - smp_read_barrier_depends(); + /* Pairs with xol_add_vma() smp_store_release() */ + area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */ if (area) trampoline_vaddr = area->vaddr; return trampoline_vaddr; } -static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) +static void cleanup_return_instances(struct uprobe_task *utask, bool chained, + struct pt_regs *regs) { - struct return_instance *ri; - struct uprobe_task *utask; + struct return_instance *ri = utask->return_instances, *ri_next; + enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL; + + while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { + ri_next = ri->next; + rcu_assign_pointer(utask->return_instances, ri_next); + utask->depth--; + + free_ret_instance(utask, ri, true /* cleanup_hprobe */); + ri = ri_next; + } +} + +static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs, + struct return_instance *ri) +{ + struct uprobe_task *utask = current->utask; unsigned long orig_ret_vaddr, trampoline_vaddr; - bool chained = false; + bool chained; + int srcu_idx; if (!get_xol_area()) - return; - - utask = get_utask(); - if (!utask) - return; + goto free; if (utask->depth >= MAX_URETPROBE_DEPTH) { printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to" " nestedness limit pid/tgid=%d/%d\n", current->pid, current->tgid); - return; + goto free; } - ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL); - if (!ri) - goto fail; - - trampoline_vaddr = get_trampoline_vaddr(); + trampoline_vaddr = uprobe_get_trampoline_vaddr(); orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); if (orig_ret_vaddr == -1) - goto fail; + goto free; + + /* drop the entries invalidated by longjmp() */ + chained = (orig_ret_vaddr == trampoline_vaddr); + cleanup_return_instances(utask, chained, regs); /* * We don't want to keep trampoline address in stack, rather keep the * original return address of first caller thru all the consequent * instances. This also makes breakpoint unwrapping easier. */ - if (orig_ret_vaddr == trampoline_vaddr) { + if (chained) { if (!utask->return_instances) { /* * This situation is not possible. Likely we have an * attack from user-space. */ - pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n", - current->pid, current->tgid); - goto fail; + uprobe_warn(current, "handle tail call"); + goto free; } - - chained = true; orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } - atomic_inc(&uprobe->ref); - ri->uprobe = uprobe; + /* __srcu_read_lock() because SRCU lock survives switch to user space */ + srcu_idx = __srcu_read_lock(&uretprobes_srcu); + ri->func = instruction_pointer(regs); + ri->stack = user_stack_pointer(regs); ri->orig_ret_vaddr = orig_ret_vaddr; ri->chained = chained; utask->depth++; - /* add instance to the stack */ + hprobe_init_leased(&ri->hprobe, uprobe, srcu_idx); ri->next = utask->return_instances; - utask->return_instances = ri; + rcu_assign_pointer(utask->return_instances, ri); - return; + mod_timer(&utask->ri_timer, jiffies + RI_TIMER_PERIOD); - fail: - kfree(ri); + return; +free: + ri_free(ri); } /* Prepare to single-step probed instruction out of line. 
*/ static int pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) { - struct uprobe_task *utask; - unsigned long xol_vaddr; + struct uprobe_task *utask = current->utask; int err; - utask = get_utask(); - if (!utask) - return -ENOMEM; + if (!try_get_uprobe(uprobe)) + return -EINVAL; - xol_vaddr = xol_get_insn_slot(uprobe); - if (!xol_vaddr) - return -ENOMEM; + if (!xol_get_insn_slot(uprobe, utask)) { + err = -ENOMEM; + goto err_out; + } - utask->xol_vaddr = xol_vaddr; utask->vaddr = bp_vaddr; - err = arch_uprobe_pre_xol(&uprobe->arch, regs); if (unlikely(err)) { - xol_free_insn_slot(current); - return err; + xol_free_insn_slot(utask); + goto err_out; } utask->active_uprobe = uprobe; utask->state = UTASK_SSTEP; return 0; +err_out: + put_uprobe(uprobe); + return err; } /* @@ -1503,40 +2359,25 @@ bool uprobe_deny_signal(void) WARN_ON_ONCE(utask->state != UTASK_SSTEP); - if (signal_pending(t)) { - spin_lock_irq(&t->sighand->siglock); + if (task_sigpending(t)) { + utask->signal_denied = true; clear_tsk_thread_flag(t, TIF_SIGPENDING); - spin_unlock_irq(&t->sighand->siglock); if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { utask->state = UTASK_SSTEP_TRAPPED; set_tsk_thread_flag(t, TIF_UPROBE); - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); } } return true; } -/* - * Avoid singlestepping the original instruction if the original instruction - * is a NOP or can be emulated. - */ -static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) -{ - if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) { - if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) - return true; - clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); - } - return false; -} - static void mmf_recalc_uprobes(struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!valid_vma(vma, false)) continue; /* @@ -1549,7 +2390,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm) return; } - clear_bit(MMF_HAS_UPROBES, &mm->flags); + mm_flags_clear(MMF_HAS_UPROBES, mm); } static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) @@ -1558,39 +2399,87 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) uprobe_opcode_t opcode; int result; + if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE))) + return -EINVAL; + pagefault_disable(); - result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, - sizeof(opcode)); + result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr); pagefault_enable(); if (likely(result == 0)) goto out; - result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); + result = get_user_pages(vaddr, 1, FOLL_FORCE, &page); if (result < 0) return result; - copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + uprobe_copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); put_page(page); out: /* This needs to return true for any variant of the trap insn */ return is_trap_insn(&opcode); } -static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) +static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr) { struct mm_struct *mm = current->mm; struct uprobe *uprobe = NULL; struct vm_area_struct *vma; + struct file *vm_file; + loff_t offset; + unsigned int seq; + + guard(rcu)(); - down_read(&mm->mmap_sem); - vma = find_vma(mm, bp_vaddr); - if (vma && vma->vm_start <= bp_vaddr) { - if (valid_vma(vma, false)) { + if (!mmap_lock_speculate_try_begin(mm, &seq)) + return NULL; + + vma = 
vma_lookup(mm, bp_vaddr); + if (!vma) + return NULL; + + /* + * vm_file memory can be reused for another instance of struct file, + * but can't be freed from under us, so it's safe to read fields from + * it, even if the values are some garbage values; ultimately + * find_uprobe_rcu() + mmap_lock_speculation_end() check will ensure + * that whatever we speculatively found is correct + */ + vm_file = READ_ONCE(vma->vm_file); + if (!vm_file) + return NULL; + + offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start); + uprobe = find_uprobe_rcu(vm_file->f_inode, offset); + if (!uprobe) + return NULL; + + /* now double check that nothing about MM changed */ + if (mmap_lock_speculate_retry(mm, seq)) + return NULL; + + return uprobe; +} + +/* assumes being inside RCU protected region */ +static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp) +{ + struct mm_struct *mm = current->mm; + struct uprobe *uprobe = NULL; + struct vm_area_struct *vma; + + uprobe = find_active_uprobe_speculative(bp_vaddr); + if (uprobe) + return uprobe; + + mmap_read_lock(mm); + vma = vma_lookup(mm, bp_vaddr); + if (vma) { + if (vma->vm_file) { struct inode *inode = file_inode(vma->vm_file); loff_t offset = vaddr_to_offset(vma, bp_vaddr); - uprobe = find_uprobe(inode, offset); + uprobe = find_uprobe_rcu(inode, offset); } if (!uprobe) @@ -1599,103 +2488,219 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) *is_swbp = -EFAULT; } - if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags)) + if (!uprobe && mm_flags_test_and_clear(MMF_RECALC_UPROBES, mm)) mmf_recalc_uprobes(mm); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return uprobe; } +static struct return_instance *push_consumer(struct return_instance *ri, __u64 id, __u64 cookie) +{ + struct return_consumer *ric; + + if (unlikely(ri == ZERO_SIZE_PTR)) + return ri; + + if (unlikely(ri->cons_cnt > 0)) { + ric = krealloc(ri->extra_consumers, sizeof(*ric) * ri->cons_cnt, GFP_KERNEL); + if (!ric) { + ri_free(ri); + return ZERO_SIZE_PTR; + } + ri->extra_consumers = ric; + } + + ric = likely(ri->cons_cnt == 0) ? &ri->consumer : &ri->extra_consumers[ri->cons_cnt - 1]; + ric->id = id; + ric->cookie = cookie; + + ri->cons_cnt++; + return ri; +} + +static struct return_consumer * +return_consumer_find(struct return_instance *ri, int *iter, int id) +{ + struct return_consumer *ric; + int idx; + + for (idx = *iter; idx < ri->cons_cnt; idx++) + { + ric = likely(idx == 0) ? 
&ri->consumer : &ri->extra_consumers[idx - 1]; + if (ric->id == id) { + *iter = idx + 1; + return ric; + } + } + + return NULL; +} + +static bool ignore_ret_handler(int rc) +{ + return rc == UPROBE_HANDLER_REMOVE || rc == UPROBE_HANDLER_IGNORE; +} + static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) { struct uprobe_consumer *uc; - int remove = UPROBE_HANDLER_REMOVE; - bool need_prep = false; /* prepare return uprobe, when needed */ + bool has_consumers = false, remove = true; + struct return_instance *ri = NULL; + struct uprobe_task *utask = current->utask; - down_read(&uprobe->register_rwsem); - for (uc = uprobe->consumers; uc; uc = uc->next) { + utask->auprobe = &uprobe->arch; + + list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { + bool session = uc->handler && uc->ret_handler; + __u64 cookie = 0; int rc = 0; if (uc->handler) { - rc = uc->handler(uc, regs); - WARN(rc & ~UPROBE_HANDLER_MASK, - "bad rc=0x%x from %pf()\n", rc, uc->handler); + rc = uc->handler(uc, regs, &cookie); + WARN(rc < 0 || rc > 2, + "bad rc=0x%x from %ps()\n", rc, uc->handler); } - if (uc->ret_handler) - need_prep = true; + remove &= rc == UPROBE_HANDLER_REMOVE; + has_consumers = true; + + if (!uc->ret_handler || ignore_ret_handler(rc)) + continue; - remove &= rc; + if (!ri) + ri = alloc_return_instance(utask); + + if (session) + ri = push_consumer(ri, uc->id, cookie); } + utask->auprobe = NULL; + + if (!ZERO_OR_NULL_PTR(ri)) + prepare_uretprobe(uprobe, regs, ri); + + if (remove && has_consumers) { + down_read(&uprobe->register_rwsem); - if (need_prep && !remove) - prepare_uretprobe(uprobe, regs); /* put bp at return */ + /* re-check that removal is still required, this time under lock */ + if (!filter_chain(uprobe, current->mm)) { + WARN_ON(!uprobe_is_active(uprobe)); + unapply_uprobe(uprobe, current->mm); + } - if (remove && uprobe->consumers) { - WARN_ON(!uprobe_is_active(uprobe)); - unapply_uprobe(uprobe, current->mm); + up_read(&uprobe->register_rwsem); } - up_read(&uprobe->register_rwsem); } static void -handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) +handle_uretprobe_chain(struct return_instance *ri, struct uprobe *uprobe, struct pt_regs *regs) { - struct uprobe *uprobe = ri->uprobe; + struct return_consumer *ric; struct uprobe_consumer *uc; + int ric_idx = 0; + + /* all consumers unsubscribed meanwhile */ + if (unlikely(!uprobe)) + return; - down_read(&uprobe->register_rwsem); - for (uc = uprobe->consumers; uc; uc = uc->next) { - if (uc->ret_handler) - uc->ret_handler(uc, ri->func, regs); + rcu_read_lock_trace(); + list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { + bool session = uc->handler && uc->ret_handler; + + if (uc->ret_handler) { + ric = return_consumer_find(ri, &ric_idx, uc->id); + if (!session || ric) + uc->ret_handler(uc, ri->func, regs, ric ? 
&ric->cookie : NULL); + } } - up_read(&uprobe->register_rwsem); + rcu_read_unlock_trace(); } -static bool handle_trampoline(struct pt_regs *regs) +static struct return_instance *find_next_ret_chain(struct return_instance *ri) { - struct uprobe_task *utask; - struct return_instance *ri, *tmp; bool chained; + do { + chained = ri->chained; + ri = ri->next; /* can't be NULL if chained */ + } while (chained); + + return ri; +} + +void uprobe_handle_trampoline(struct pt_regs *regs) +{ + struct uprobe_task *utask; + struct return_instance *ri, *ri_next, *next_chain; + struct uprobe *uprobe; + enum hprobe_state hstate; + bool valid; + utask = current->utask; if (!utask) - return false; + goto sigill; ri = utask->return_instances; if (!ri) - return false; - - /* - * TODO: we should throw out return_instance's invalidated by - * longjmp(), currently we assume that the probed function always - * returns. - */ - instruction_pointer_set(regs, ri->orig_ret_vaddr); + goto sigill; - for (;;) { - handle_uretprobe_chain(ri, regs); - - chained = ri->chained; - put_uprobe(ri->uprobe); + do { + /* + * We should throw out the frames invalidated by longjmp(). + * If this chain is valid, then the next one should be alive + * or NULL; the latter case means that nobody but ri->func + * could hit this trampoline on return. TODO: sigaltstack(). + */ + next_chain = find_next_ret_chain(ri); + valid = !next_chain || arch_uretprobe_is_alive(next_chain, RP_CHECK_RET, regs); + + instruction_pointer_set(regs, ri->orig_ret_vaddr); + do { + /* pop current instance from the stack of pending return instances, + * as it's not pending anymore: we just fixed up original + * instruction pointer in regs and are about to call handlers; + * this allows fixup_uretprobe_trampoline_entries() to properly fix up + * captured stack traces from uretprobe handlers, in which pending + * trampoline addresses on the stack are replaced with correct + * original return addresses + */ + ri_next = ri->next; + rcu_assign_pointer(utask->return_instances, ri_next); + utask->depth--; - tmp = ri; - ri = ri->next; - kfree(tmp); + uprobe = hprobe_consume(&ri->hprobe, &hstate); + if (valid) + handle_uretprobe_chain(ri, uprobe, regs); + hprobe_finalize(&ri->hprobe, hstate); - if (!chained) - break; + /* We already took care of hprobe, no need to waste more time on that. */ + free_ret_instance(utask, ri, false /* !cleanup_hprobe */); + ri = ri_next; + } while (ri != next_chain); + } while (!valid); - utask->depth--; + return; - BUG_ON(!ri); - } +sigill: + uprobe_warn(current, "handle uretprobe, sending SIGILL."); + force_sig(SIGILL); +} - utask->return_instances = ri; +bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) +{ + return false; +} +bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, + struct pt_regs *regs) +{ return true; } +void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. 
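
A side note on the unlinking above: uprobe_handle_trampoline() pops entries off utask->return_instances with rcu_assign_pointer() even though only the owning task ever modifies that list. The reason is that ri_timer() walks the same list asynchronously as an RCU reader, and ri_free() releases entries through kfree_rcu(). A reduced sketch of this single-writer/RCU-reader stack discipline (illustrative types, not the real return_instance layout):

    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct node {
            struct node *next;
            struct rcu_head rcu;
    };

    /* Writer side: only the task owning the stack calls this. */
    static void pop_and_free(struct node **top)
    {
            struct node *n = *top;

            if (!n)
                    return;
            /* a reader sees either n or n->next, never a torn list */
            rcu_assign_pointer(*top, n->next);
            /* defer the free until all current RCU readers are done */
            kfree_rcu(n, rcu);
    }

    /* Reader side, analogous to ri_timer() walking the instances. */
    static void walk(struct node **top)
    {
            struct node *n;

            rcu_read_lock();
            for (n = rcu_dereference(*top); n; n = rcu_dereference(n->next))
                    ; /* inspect *n; it cannot be freed under us */
            rcu_read_unlock();
    }

The real code layers two further mechanisms on top of this: the ri_seqcount handshake that lets free_ret_instance() recycle instances through ri_pool without waiting for a grace period, and the hprobe consume/finalize protocol for the uprobe reference itself.
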
@@ -1704,22 +2709,19 @@ static void handle_swbp(struct pt_regs *regs) { struct uprobe *uprobe; unsigned long bp_vaddr; - int uninitialized_var(is_swbp); + int is_swbp; bp_vaddr = uprobe_get_swbp_addr(regs); - if (bp_vaddr == get_trampoline_vaddr()) { - if (handle_trampoline(regs)) - return; + if (bp_vaddr == uprobe_get_trampoline_vaddr()) + return uprobe_handle_trampoline(regs); - pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n", - current->pid, current->tgid); - } + rcu_read_lock_trace(); - uprobe = find_active_uprobe(bp_vaddr, &is_swbp); + uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); if (!uprobe) { if (is_swbp > 0) { /* No matching uprobe; signal SIGTRAP. */ - send_sig(SIGTRAP, current, 0); + force_sig(SIGTRAP); } else { /* * Either we raced with uprobe_unregister() or we can't @@ -1731,7 +2733,7 @@ static void handle_swbp(struct pt_regs *regs) */ instruction_pointer_set(regs, bp_vaddr); } - return; + goto out; } /* change it in advance for ->handler() and restart */ @@ -1742,20 +2744,63 @@ static void handle_swbp(struct pt_regs *regs) * After we hit the bp, _unregister + _register can install the * new and not-yet-analyzed uprobe at the same address, restart. */ - smp_rmb(); /* pairs with wmb() in install_breakpoint() */ if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) goto out; + /* + * Pairs with the smp_wmb() in prepare_uprobe(). + * + * Guarantees that if we see the UPROBE_COPY_INSN bit set, then + * we must also see the stores to &uprobe->arch performed by the + * prepare_uprobe() call. + */ + smp_rmb(); + + /* Tracing handlers use ->utask to communicate with fetch methods */ + if (!get_utask()) + goto out; + + if (arch_uprobe_ignore(&uprobe->arch, regs)) + goto out; + handler_chain(uprobe, regs); - if (can_skip_sstep(uprobe, regs)) + + /* Try to optimize after first hit. */ + arch_uprobe_optimize(&uprobe->arch, bp_vaddr); + + /* + * If user decided to take execution elsewhere, it makes little sense + * to execute the original instruction, so let's skip it. 
+ */
+ if (instruction_pointer(regs) != bp_vaddr)
  goto out;
 
- if (!pre_ssout(uprobe, regs, bp_vaddr))
- return;
+ if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
+ goto out;
+
+ if (pre_ssout(uprobe, regs, bp_vaddr))
+ goto out;
 
- /* can_skip_sstep() succeeded, or restart if can't singlestep */
 out:
- put_uprobe(uprobe);
+ /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
+ rcu_read_unlock_trace();
+}
+
+void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr)
+{
+ struct uprobe *uprobe;
+ int is_swbp;
+
+ guard(rcu_tasks_trace)();
+
+ uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
+ if (!uprobe)
+ return;
+ if (!get_utask())
+ return;
+ if (arch_uprobe_ignore(&uprobe->arch, regs))
+ return;
+ handler_chain(uprobe, regs);
 }
 
 /*
@@ -1765,10 +2810,11 @@ out:
 static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
 {
 	struct uprobe *uprobe;
+	int err = 0;
 
 	uprobe = utask->active_uprobe;
 	if (utask->state == UTASK_SSTEP_ACK)
-		arch_uprobe_post_xol(&uprobe->arch, regs);
+		err = arch_uprobe_post_xol(&uprobe->arch, regs);
 	else if (utask->state == UTASK_SSTEP_TRAPPED)
 		arch_uprobe_abort_xol(&uprobe->arch, regs);
 	else
@@ -1777,11 +2823,17 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
 	put_uprobe(uprobe);
 	utask->active_uprobe = NULL;
 	utask->state = UTASK_RUNNING;
-	xol_free_insn_slot(current);
+	xol_free_insn_slot(utask);
 
-	spin_lock_irq(&current->sighand->siglock);
-	recalc_sigpending(); /* see uprobe_deny_signal() */
-	spin_unlock_irq(&current->sighand->siglock);
+	if (utask->signal_denied) {
+		set_thread_flag(TIF_SIGPENDING);
+		utask->signal_denied = false;
+	}
+
+	if (unlikely(err)) {
+		uprobe_warn(current, "execute the probed insn, sending SIGILL.");
+		force_sig(SIGILL);
+	}
 }
 
 /*
@@ -1817,7 +2869,7 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs)
 	if (!current->mm)
 		return 0;
 
-	if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
+	if (!mm_flags_test(MMF_HAS_UPROBES, current->mm) &&
 	    (!current->utask || !current->utask->return_instances))
 		return 0;
 
@@ -1847,21 +2899,12 @@ static struct notifier_block uprobe_exception_nb = {
 	.priority		= INT_MAX-1,	/* notified after kprobes, kgdb */
 };
 
-static int __init init_uprobes(void)
+void __init uprobes_init(void)
 {
 	int i;
 
 	for (i = 0; i < UPROBES_HASH_SZ; i++)
 		mutex_init(&uprobes_mmap_mutex[i]);
 
-	if (percpu_init_rwsem(&dup_mmap_sem))
-		return -ENOMEM;
-
-	return register_die_notifier(&uprobe_exception_nb);
-}
-module_init(init_uprobes);
-
-static void __exit exit_uprobes(void)
-{
+	BUG_ON(register_die_notifier(&uprobe_exception_nb));
 }
-module_exit(exit_uprobes);
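
One pattern that recurs throughout this diff is the retirement of hand-rolled smp_wmb()/smp_read_barrier_depends() pairs in favor of smp_store_release() plus READ_ONCE(), as in xol_add_vma() paired with get_xol_area() and uprobe_get_trampoline_vaddr(). A stripped-down sketch of that publish/consume idiom (names invented for illustration; the real publish target is mm->uprobes_state.xol_area):

    #include <linux/atomic.h>
    #include <linux/compiler.h>

    struct area {
            unsigned long vaddr;
    };

    static struct area *xol;    /* stands in for mm->uprobes_state.xol_area */

    static void publish_area(struct area *a)
    {
            /*
             * Release semantics: every store that initialized *a is
             * ordered before the pointer store, so a thread observing
             * the pointer observes a fully constructed area.
             */
            smp_store_release(&xol, a);
    }

    static unsigned long trampoline_vaddr(void)
    {
            /* pairs with smp_store_release() in publish_area() */
            struct area *a = READ_ONCE(xol);

            return a ? a->vaddr : -1UL;
    }

On the consumer side a plain READ_ONCE() suffices because every later load is address-dependent on the returned pointer, and the kernel memory model orders such dependent loads on all supported architectures; that is why the old explicit smp_read_barrier_depends() could be dropped.
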
