summaryrefslogtreecommitdiff
path: root/kernel/events/uprobes.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/events/uprobes.c')
-rw-r--r--kernel/events/uprobes.c1392
1 files changed, 931 insertions, 461 deletions
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index bf9edd8d75be..b4ca8898fe17 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -15,18 +15,20 @@
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
#include <linux/export.h>
#include <linux/rmap.h> /* anon_vma_prepare */
-#include <linux/mmu_notifier.h> /* set_pte_at_notify */
-#include <linux/swap.h> /* try_to_free_swap */
+#include <linux/mmu_notifier.h>
+#include <linux/swap.h> /* folio_free_swap */
#include <linux/ptrace.h> /* user_enable_single_step */
#include <linux/kdebug.h> /* notifier mechanism */
-#include "../../mm/internal.h" /* munlock_vma_page */
#include <linux/percpu-rwsem.h>
#include <linux/task_work.h>
#include <linux/shmem_fs.h>
#include <linux/khugepaged.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/workqueue.h>
+#include <linux/srcu.h>
+#include <linux/oom.h> /* check_stable_address_space */
#include <linux/uprobes.h>
@@ -40,7 +42,8 @@ static struct rb_root uprobes_tree = RB_ROOT;
*/
#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
-static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
+static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */
+static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock);
#define UPROBES_HASH_SZ 13
/* serialize uprobe->pending_list */
@@ -49,6 +52,9 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
+/* Covers return_instance's uprobe lifetime. */
+DEFINE_STATIC_SRCU(uretprobes_srcu);
+
/* Have a copy of original instruction */
#define UPROBE_COPY_INSN 0
@@ -58,11 +64,15 @@ struct uprobe {
struct rw_semaphore register_rwsem;
struct rw_semaphore consumer_rwsem;
struct list_head pending_list;
- struct uprobe_consumer *consumers;
+ struct list_head consumers;
struct inode *inode; /* Also hold a ref to inode */
+ union {
+ struct rcu_head rcu;
+ struct work_struct work;
+ };
loff_t offset;
loff_t ref_ctr_offset;
- unsigned long flags;
+ unsigned long flags; /* "unsigned long" so bitops work */
/*
* The generic code assumes that it has two members of unknown type
@@ -97,11 +107,9 @@ static LIST_HEAD(delayed_uprobe_list);
*/
struct xol_area {
wait_queue_head_t wq; /* if all slots are busy */
- atomic_t slot_count; /* number of in-use slots */
unsigned long *bitmap; /* 0 = free slot */
- struct vm_special_mapping xol_mapping;
- struct page *pages[2];
+ struct page *page;
/*
* We keep the vma's vm_start rather than a pointer to the vma
* itself. The probed process or a naughty kernel module could make
@@ -110,6 +118,11 @@ struct xol_area {
unsigned long vaddr; /* Page(s) of instruction slots */
};
+static void uprobe_warn(struct task_struct *t, const char *msg)
+{
+ pr_warn("uprobe: %s:%d failed to %s\n", current->comm, current->pid, msg);
+}
+
/*
* valid_vma: Verify if the specified vma is an executable vma
* Relax restrictions while unregistering: vm_flags might have
@@ -154,26 +167,25 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
struct page *old_page, struct page *new_page)
{
+ struct folio *old_folio = page_folio(old_page);
+ struct folio *new_folio;
struct mm_struct *mm = vma->vm_mm;
- struct page_vma_mapped_walk pvmw = {
- .page = compound_head(old_page),
- .vma = vma,
- .address = addr,
- };
+ DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0);
int err;
struct mmu_notifier_range range;
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
addr + PAGE_SIZE);
if (new_page) {
- err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL);
+ new_folio = page_folio(new_page);
+ err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL);
if (err)
return err;
}
- /* For try_to_free_swap() and munlock_vma_page() below */
- lock_page(old_page);
+ /* For folio_free_swap() below */
+ folio_lock(old_folio);
mmu_notifier_invalidate_range_start(&range);
err = -EAGAIN;
@@ -182,37 +194,34 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
if (new_page) {
- get_page(new_page);
- page_add_new_anon_rmap(new_page, vma, addr, false);
- lru_cache_add_inactive_or_unevictable(new_page, vma);
+ folio_get(new_folio);
+ folio_add_new_anon_rmap(new_folio, vma, addr, RMAP_EXCLUSIVE);
+ folio_add_lru_vma(new_folio, vma);
} else
/* no new page, just dec_mm_counter for old_page */
dec_mm_counter(mm, MM_ANONPAGES);
- if (!PageAnon(old_page)) {
- dec_mm_counter(mm, mm_counter_file(old_page));
+ if (!folio_test_anon(old_folio)) {
+ dec_mm_counter(mm, mm_counter_file(old_folio));
inc_mm_counter(mm, MM_ANONPAGES);
}
- flush_cache_page(vma, addr, pte_pfn(*pvmw.pte));
- ptep_clear_flush_notify(vma, addr, pvmw.pte);
+ flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte)));
+ ptep_clear_flush(vma, addr, pvmw.pte);
if (new_page)
- set_pte_at_notify(mm, addr, pvmw.pte,
- mk_pte(new_page, vma->vm_page_prot));
+ set_pte_at(mm, addr, pvmw.pte,
+ mk_pte(new_page, vma->vm_page_prot));
- page_remove_rmap(old_page, false);
- if (!page_mapped(old_page))
- try_to_free_swap(old_page);
+ folio_remove_rmap_pte(old_folio, old_page, vma);
+ if (!folio_mapped(old_folio))
+ folio_free_swap(old_folio);
page_vma_mapped_walk_done(&pvmw);
-
- if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page))
- munlock_vma_page(old_page);
- put_page(old_page);
+ folio_put(old_folio);
err = 0;
unlock:
mmu_notifier_invalidate_range_end(&range);
- unlock_page(old_page);
+ folio_unlock(old_folio);
return err;
}
@@ -355,9 +364,10 @@ static bool valid_ref_ctr_vma(struct uprobe *uprobe,
static struct vm_area_struct *
find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *tmp;
- for (tmp = mm->mmap; tmp; tmp = tmp->vm_next)
+ for_each_vma(vmi, tmp)
if (valid_ref_ctr_vma(uprobe, tmp))
return tmp;
@@ -369,7 +379,6 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
{
void *kaddr;
struct page *page;
- struct vm_area_struct *vma;
int ret;
short *ptr;
@@ -377,7 +386,7 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
return -EINVAL;
ret = get_user_pages_remote(mm, vaddr, 1,
- FOLL_WRITE, &page, &vma, NULL);
+ FOLL_WRITE, &page, NULL);
if (unlikely(ret <= 0)) {
/*
* We are asking for 1 page. If get_user_pages_remote() fails,
@@ -408,7 +417,7 @@ static void update_ref_ctr_warn(struct uprobe *uprobe,
struct mm_struct *mm, short d)
{
pr_warn("ref_ctr %s failed for inode: 0x%lx offset: "
- "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n",
+ "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n",
d > 0 ? "increment" : "decrement", uprobe->inode->i_ino,
(unsigned long long) uprobe->offset,
(unsigned long long) uprobe->ref_ctr_offset, mm);
@@ -453,11 +462,12 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
* that have fixed length instructions.
*
* uprobe_write_opcode - write the opcode at a given virtual address.
+ * @auprobe: arch specific probepoint information.
* @mm: the probed process address space.
* @vaddr: the virtual address to store the opcode.
* @opcode: opcode to be written at @vaddr.
*
- * Called with mm->mmap_lock held for write.
+ * Called with mm->mmap_lock held for read or write.
* Return 0 (success) or a negative errno.
*/
int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
@@ -477,15 +487,19 @@ retry:
if (is_register)
gup_flags |= FOLL_SPLIT_PMD;
/* Read the page with vaddr into memory */
- ret = get_user_pages_remote(mm, vaddr, 1, gup_flags,
- &old_page, &vma, NULL);
- if (ret <= 0)
- return ret;
+ old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
+ if (IS_ERR(old_page))
+ return PTR_ERR(old_page);
ret = verify_opcode(old_page, vaddr, &opcode);
if (ret <= 0)
goto put_old;
+ if (is_zero_page(old_page)) {
+ ret = -EINVAL;
+ goto put_old;
+ }
+
if (WARN(!is_register && PageCompound(old_page),
"uprobe unregister should never work on compound page\n")) {
ret = -EINVAL;
@@ -542,7 +556,7 @@ retry:
}
}
- ret = __replace_page(vma, vaddr, old_page, new_page);
+ ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page);
if (new_page)
put_page(new_page);
put_old:
@@ -557,7 +571,7 @@ put_old:
/* try collapse pmd for compound page */
if (!ret && orig_page_huge)
- collapse_pte_mapped_thp(mm, vaddr);
+ collapse_pte_mapped_thp(mm, vaddr, false);
return ret;
}
@@ -592,125 +606,350 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v
*(uprobe_opcode_t *)&auprobe->insn);
}
+/* uprobe should have guaranteed positive refcount */
static struct uprobe *get_uprobe(struct uprobe *uprobe)
{
refcount_inc(&uprobe->ref);
return uprobe;
}
+/*
+ * uprobe should have guaranteed lifetime, which can be either of:
+ * - caller already has refcount taken (and wants an extra one);
+ * - uprobe is RCU protected and won't be freed until after grace period;
+ * - we are holding uprobes_treelock (for read or write, doesn't matter).
+ */
+static struct uprobe *try_get_uprobe(struct uprobe *uprobe)
+{
+ if (refcount_inc_not_zero(&uprobe->ref))
+ return uprobe;
+ return NULL;
+}
+
+static inline bool uprobe_is_active(struct uprobe *uprobe)
+{
+ return !RB_EMPTY_NODE(&uprobe->rb_node);
+}
+
+static void uprobe_free_rcu_tasks_trace(struct rcu_head *rcu)
+{
+ struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
+
+ kfree(uprobe);
+}
+
+static void uprobe_free_srcu(struct rcu_head *rcu)
+{
+ struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
+
+ call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu_tasks_trace);
+}
+
+static void uprobe_free_deferred(struct work_struct *work)
+{
+ struct uprobe *uprobe = container_of(work, struct uprobe, work);
+
+ write_lock(&uprobes_treelock);
+
+ if (uprobe_is_active(uprobe)) {
+ write_seqcount_begin(&uprobes_seqcount);
+ rb_erase(&uprobe->rb_node, &uprobes_tree);
+ write_seqcount_end(&uprobes_seqcount);
+ }
+
+ write_unlock(&uprobes_treelock);
+
+ /*
+ * If application munmap(exec_vma) before uprobe_unregister()
+ * gets called, we don't get a chance to remove uprobe from
+ * delayed_uprobe_list from remove_breakpoint(). Do it here.
+ */
+ mutex_lock(&delayed_uprobe_lock);
+ delayed_uprobe_remove(uprobe, NULL);
+ mutex_unlock(&delayed_uprobe_lock);
+
+ /* start srcu -> rcu_tasks_trace -> kfree chain */
+ call_srcu(&uretprobes_srcu, &uprobe->rcu, uprobe_free_srcu);
+}
+
static void put_uprobe(struct uprobe *uprobe)
{
- if (refcount_dec_and_test(&uprobe->ref)) {
+ if (!refcount_dec_and_test(&uprobe->ref))
+ return;
+
+ INIT_WORK(&uprobe->work, uprobe_free_deferred);
+ schedule_work(&uprobe->work);
+}
+
+/* Initialize hprobe as SRCU-protected "leased" uprobe */
+static void hprobe_init_leased(struct hprobe *hprobe, struct uprobe *uprobe, int srcu_idx)
+{
+ WARN_ON(!uprobe);
+ hprobe->state = HPROBE_LEASED;
+ hprobe->uprobe = uprobe;
+ hprobe->srcu_idx = srcu_idx;
+}
+
+/* Initialize hprobe as refcounted ("stable") uprobe (uprobe can be NULL). */
+static void hprobe_init_stable(struct hprobe *hprobe, struct uprobe *uprobe)
+{
+ hprobe->state = uprobe ? HPROBE_STABLE : HPROBE_GONE;
+ hprobe->uprobe = uprobe;
+ hprobe->srcu_idx = -1;
+}
+
+/*
+ * hprobe_consume() fetches hprobe's underlying uprobe and detects whether
+ * uprobe is SRCU protected or is refcounted. hprobe_consume() can be
+ * used only once for a given hprobe.
+ *
+ * Caller has to call hprobe_finalize() and pass previous hprobe_state, so
+ * that hprobe_finalize() can perform SRCU unlock or put uprobe, whichever
+ * is appropriate.
+ */
+static inline struct uprobe *hprobe_consume(struct hprobe *hprobe, enum hprobe_state *hstate)
+{
+ *hstate = xchg(&hprobe->state, HPROBE_CONSUMED);
+ switch (*hstate) {
+ case HPROBE_LEASED:
+ case HPROBE_STABLE:
+ return hprobe->uprobe;
+ case HPROBE_GONE: /* uprobe is NULL, no SRCU */
+ case HPROBE_CONSUMED: /* uprobe was finalized already, do nothing */
+ return NULL;
+ default:
+ WARN(1, "hprobe invalid state %d", *hstate);
+ return NULL;
+ }
+}
+
+/*
+ * Reset hprobe state and, if hprobe was LEASED, release SRCU lock.
+ * hprobe_finalize() can only be used from current context after
+ * hprobe_consume() call (which determines uprobe and hstate value).
+ */
+static void hprobe_finalize(struct hprobe *hprobe, enum hprobe_state hstate)
+{
+ switch (hstate) {
+ case HPROBE_LEASED:
+ __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx);
+ break;
+ case HPROBE_STABLE:
+ put_uprobe(hprobe->uprobe);
+ break;
+ case HPROBE_GONE:
+ case HPROBE_CONSUMED:
+ break;
+ default:
+ WARN(1, "hprobe invalid state %d", hstate);
+ break;
+ }
+}
+
+/*
+ * Attempt to switch (atomically) uprobe from being SRCU protected (LEASED)
+ * to refcounted (STABLE) state. Competes with hprobe_consume(); only one of
+ * them can win the race to perform SRCU unlocking. Whoever wins must perform
+ * SRCU unlock.
+ *
+ * Returns underlying valid uprobe or NULL, if there was no underlying uprobe
+ * to begin with or we failed to bump its refcount and it's going away.
+ *
+ * Returned non-NULL uprobe can be still safely used within an ongoing SRCU
+ * locked region. If `get` is true, it's guaranteed that non-NULL uprobe has
+ * an extra refcount for caller to assume and use. Otherwise, it's not
+ * guaranteed that returned uprobe has a positive refcount, so caller has to
+ * attempt try_get_uprobe(), if it needs to preserve uprobe beyond current
+ * SRCU lock region. See dup_utask().
+ */
+static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get)
+{
+ enum hprobe_state hstate;
+
+ /*
+ * Caller should guarantee that return_instance is not going to be
+ * freed from under us. This can be achieved either through holding
+ * rcu_read_lock() or by owning return_instance in the first place.
+ *
+ * Underlying uprobe is itself protected from reuse by SRCU, so ensure
+ * SRCU lock is held properly.
+ */
+ lockdep_assert(srcu_read_lock_held(&uretprobes_srcu));
+
+ hstate = READ_ONCE(hprobe->state);
+ switch (hstate) {
+ case HPROBE_STABLE:
+ /* uprobe has positive refcount, bump refcount, if necessary */
+ return get ? get_uprobe(hprobe->uprobe) : hprobe->uprobe;
+ case HPROBE_GONE:
/*
- * If application munmap(exec_vma) before uprobe_unregister()
- * gets called, we don't get a chance to remove uprobe from
- * delayed_uprobe_list from remove_breakpoint(). Do it here.
+ * SRCU was unlocked earlier and we didn't manage to take
+ * uprobe refcnt, so it's effectively NULL
*/
- mutex_lock(&delayed_uprobe_lock);
- delayed_uprobe_remove(uprobe, NULL);
- mutex_unlock(&delayed_uprobe_lock);
- kfree(uprobe);
+ return NULL;
+ case HPROBE_CONSUMED:
+ /*
+ * uprobe was consumed, so it's effectively NULL as far as
+ * uretprobe processing logic is concerned
+ */
+ return NULL;
+ case HPROBE_LEASED: {
+ struct uprobe *uprobe = try_get_uprobe(hprobe->uprobe);
+ /*
+ * Try to switch hprobe state, guarding against
+ * hprobe_consume() or another hprobe_expire() racing with us.
+ * Note, if we failed to get uprobe refcount, we use special
+ * HPROBE_GONE state to signal that hprobe->uprobe shouldn't
+ * be used as it will be freed after SRCU is unlocked.
+ */
+ if (try_cmpxchg(&hprobe->state, &hstate, uprobe ? HPROBE_STABLE : HPROBE_GONE)) {
+ /* We won the race, we are the ones to unlock SRCU */
+ __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx);
+ return get ? get_uprobe(uprobe) : uprobe;
+ }
+
+ /*
+ * We lost the race, undo refcount bump (if it ever happened),
+ * unless caller would like an extra refcount anyways.
+ */
+ if (uprobe && !get)
+ put_uprobe(uprobe);
+ /*
+ * Even if hprobe_consume() or another hprobe_expire() wins
+ * the state update race and unlocks SRCU from under us, we
+ * still have a guarantee that underyling uprobe won't be
+ * freed due to ongoing caller's SRCU lock region, so we can
+ * return it regardless. Also, if `get` was true, we also have
+ * an extra ref for the caller to own. This is used in dup_utask().
+ */
+ return uprobe;
+ }
+ default:
+ WARN(1, "unknown hprobe state %d", hstate);
+ return NULL;
}
}
-static int match_uprobe(struct uprobe *l, struct uprobe *r)
+static __always_inline
+int uprobe_cmp(const struct inode *l_inode, const loff_t l_offset,
+ const struct uprobe *r)
{
- if (l->inode < r->inode)
+ if (l_inode < r->inode)
return -1;
- if (l->inode > r->inode)
+ if (l_inode > r->inode)
return 1;
- if (l->offset < r->offset)
+ if (l_offset < r->offset)
return -1;
- if (l->offset > r->offset)
+ if (l_offset > r->offset)
return 1;
return 0;
}
-static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
-{
- struct uprobe u = { .inode = inode, .offset = offset };
- struct rb_node *n = uprobes_tree.rb_node;
- struct uprobe *uprobe;
- int match;
+#define __node_2_uprobe(node) \
+ rb_entry((node), struct uprobe, rb_node)
- while (n) {
- uprobe = rb_entry(n, struct uprobe, rb_node);
- match = match_uprobe(&u, uprobe);
- if (!match)
- return get_uprobe(uprobe);
+struct __uprobe_key {
+ struct inode *inode;
+ loff_t offset;
+};
- if (match < 0)
- n = n->rb_left;
- else
- n = n->rb_right;
- }
- return NULL;
+static inline int __uprobe_cmp_key(const void *key, const struct rb_node *b)
+{
+ const struct __uprobe_key *a = key;
+ return uprobe_cmp(a->inode, a->offset, __node_2_uprobe(b));
+}
+
+static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b)
+{
+ struct uprobe *u = __node_2_uprobe(a);
+ return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b));
}
/*
- * Find a uprobe corresponding to a given inode:offset
- * Acquires uprobes_treelock
+ * Assumes being inside RCU protected region.
+ * No refcount is taken on returned uprobe.
*/
-static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
+static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset)
{
- struct uprobe *uprobe;
+ struct __uprobe_key key = {
+ .inode = inode,
+ .offset = offset,
+ };
+ struct rb_node *node;
+ unsigned int seq;
- spin_lock(&uprobes_treelock);
- uprobe = __find_uprobe(inode, offset);
- spin_unlock(&uprobes_treelock);
+ lockdep_assert(rcu_read_lock_trace_held());
- return uprobe;
+ do {
+ seq = read_seqcount_begin(&uprobes_seqcount);
+ node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key);
+ /*
+ * Lockless RB-tree lookups can result only in false negatives.
+ * If the element is found, it is correct and can be returned
+ * under RCU protection. If we find nothing, we need to
+ * validate that seqcount didn't change. If it did, we have to
+ * try again as we might have missed the element (false
+ * negative). If seqcount is unchanged, search truly failed.
+ */
+ if (node)
+ return __node_2_uprobe(node);
+ } while (read_seqcount_retry(&uprobes_seqcount, seq));
+
+ return NULL;
}
+/*
+ * Attempt to insert a new uprobe into uprobes_tree.
+ *
+ * If uprobe already exists (for given inode+offset), we just increment
+ * refcount of previously existing uprobe.
+ *
+ * If not, a provided new instance of uprobe is inserted into the tree (with
+ * assumed initial refcount == 1).
+ *
+ * In any case, we return a uprobe instance that ends up being in uprobes_tree.
+ * Caller has to clean up new uprobe instance, if it ended up not being
+ * inserted into the tree.
+ *
+ * We assume that uprobes_treelock is held for writing.
+ */
static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
{
- struct rb_node **p = &uprobes_tree.rb_node;
- struct rb_node *parent = NULL;
- struct uprobe *u;
- int match;
-
- while (*p) {
- parent = *p;
- u = rb_entry(parent, struct uprobe, rb_node);
- match = match_uprobe(uprobe, u);
- if (!match)
- return get_uprobe(u);
+ struct rb_node *node;
+again:
+ node = rb_find_add_rcu(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp);
+ if (node) {
+ struct uprobe *u = __node_2_uprobe(node);
- if (match < 0)
- p = &parent->rb_left;
- else
- p = &parent->rb_right;
+ if (!try_get_uprobe(u)) {
+ rb_erase(node, &uprobes_tree);
+ RB_CLEAR_NODE(&u->rb_node);
+ goto again;
+ }
+ return u;
}
- u = NULL;
- rb_link_node(&uprobe->rb_node, parent, p);
- rb_insert_color(&uprobe->rb_node, &uprobes_tree);
- /* get access + creation ref */
- refcount_set(&uprobe->ref, 2);
-
- return u;
+ return uprobe;
}
/*
- * Acquire uprobes_treelock.
- * Matching uprobe already exists in rbtree;
- * increment (access refcount) and return the matching uprobe.
- *
- * No matching uprobe; insert the uprobe in rb_tree;
- * get a double refcount (access + creation) and return NULL.
+ * Acquire uprobes_treelock and insert uprobe into uprobes_tree
+ * (or reuse existing one, see __insert_uprobe() comments above).
*/
static struct uprobe *insert_uprobe(struct uprobe *uprobe)
{
struct uprobe *u;
- spin_lock(&uprobes_treelock);
+ write_lock(&uprobes_treelock);
+ write_seqcount_begin(&uprobes_seqcount);
u = __insert_uprobe(uprobe);
- spin_unlock(&uprobes_treelock);
+ write_seqcount_end(&uprobes_seqcount);
+ write_unlock(&uprobes_treelock);
return u;
}
@@ -732,18 +971,21 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
if (!uprobe)
- return NULL;
+ return ERR_PTR(-ENOMEM);
uprobe->inode = inode;
uprobe->offset = offset;
uprobe->ref_ctr_offset = ref_ctr_offset;
+ INIT_LIST_HEAD(&uprobe->consumers);
init_rwsem(&uprobe->register_rwsem);
init_rwsem(&uprobe->consumer_rwsem);
+ RB_CLEAR_NODE(&uprobe->rb_node);
+ refcount_set(&uprobe->ref, 1);
/* add to uprobes_tree, sorted on inode:offset */
cur_uprobe = insert_uprobe(uprobe);
/* a uprobe exists for this inode:offset combination */
- if (cur_uprobe) {
+ if (cur_uprobe != uprobe) {
if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) {
ref_ctr_mismatch_warn(cur_uprobe, uprobe);
put_uprobe(cur_uprobe);
@@ -759,33 +1001,23 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
+ static atomic64_t id;
+
down_write(&uprobe->consumer_rwsem);
- uc->next = uprobe->consumers;
- uprobe->consumers = uc;
+ list_add_rcu(&uc->cons_node, &uprobe->consumers);
+ uc->id = (__u64) atomic64_inc_return(&id);
up_write(&uprobe->consumer_rwsem);
}
/*
* For uprobe @uprobe, delete the consumer @uc.
- * Return true if the @uc is deleted successfully
- * or return false.
+ * Should never be called with consumer that's not part of @uprobe->consumers.
*/
-static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
+static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
- struct uprobe_consumer **con;
- bool ret = false;
-
down_write(&uprobe->consumer_rwsem);
- for (con = &uprobe->consumers; *con; con = &(*con)->next) {
- if (*con == uc) {
- *con = uc->next;
- ret = true;
- break;
- }
- }
+ list_del_rcu(&uc->cons_node);
up_write(&uprobe->consumer_rwsem);
-
- return ret;
}
static int __copy_insn(struct address_space *mapping, struct file *filp,
@@ -794,10 +1026,10 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
struct page *page;
/*
* Ensure that the page that has the original instruction is populated
- * and in page-cache. If ->readpage == NULL it must be shmem_mapping(),
+ * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(),
* see uprobe_register().
*/
- if (mapping->a_ops->readpage)
+ if (mapping->a_ops->read_folio)
page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
else
page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
@@ -870,21 +1102,19 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
return ret;
}
-static inline bool consumer_filter(struct uprobe_consumer *uc,
- enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+static inline bool consumer_filter(struct uprobe_consumer *uc, struct mm_struct *mm)
{
- return !uc->filter || uc->filter(uc, ctx, mm);
+ return !uc->filter || uc->filter(uc, mm);
}
-static bool filter_chain(struct uprobe *uprobe,
- enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
{
struct uprobe_consumer *uc;
bool ret = false;
down_read(&uprobe->consumer_rwsem);
- for (uc = uprobe->consumers; uc; uc = uc->next) {
- ret = consumer_filter(uc, ctx, mm);
+ list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
+ ret = consumer_filter(uc, mm);
if (ret)
break;
}
@@ -928,27 +1158,6 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad
return set_orig_insn(&uprobe->arch, mm, vaddr);
}
-static inline bool uprobe_is_active(struct uprobe *uprobe)
-{
- return !RB_EMPTY_NODE(&uprobe->rb_node);
-}
-/*
- * There could be threads that have already hit the breakpoint. They
- * will recheck the current insn and restart if find_uprobe() fails.
- * See find_active_uprobe().
- */
-static void delete_uprobe(struct uprobe *uprobe)
-{
- if (WARN_ON(!uprobe_is_active(uprobe)))
- return;
-
- spin_lock(&uprobes_treelock);
- rb_erase(&uprobe->rb_node, &uprobes_tree);
- spin_unlock(&uprobes_treelock);
- RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
- put_uprobe(uprobe);
-}
-
struct map_info {
struct map_info *next;
struct mm_struct *mm;
@@ -1053,8 +1262,17 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
if (err && is_register)
goto free;
-
+ /*
+ * We take mmap_lock for writing to avoid the race with
+ * find_active_uprobe_rcu() which takes mmap_lock for reading.
+ * Thus this install_breakpoint() can not make
+ * is_trap_at_addr() true right after find_uprobe_rcu()
+ * returns NULL in find_active_uprobe_rcu().
+ */
mmap_write_lock(mm);
+ if (check_stable_address_space(mm))
+ goto unlock;
+
vma = find_vma(mm, info->vaddr);
if (!vma || !valid_vma(vma, is_register) ||
file_inode(vma->vm_file) != uprobe->inode)
@@ -1066,12 +1284,10 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
if (is_register) {
/* consult only the "caller", new consumer. */
- if (consumer_filter(new,
- UPROBE_FILTER_REGISTER, mm))
+ if (consumer_filter(new, mm))
err = install_breakpoint(uprobe, mm, vma, info->vaddr);
} else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
- if (!filter_chain(uprobe,
- UPROBE_FILTER_UNREGISTER, mm))
+ if (!filter_chain(uprobe, mm))
err |= remove_breakpoint(uprobe, mm, info->vaddr);
}
@@ -1086,162 +1302,152 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
return err;
}
-static void
-__uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
+/**
+ * uprobe_unregister_nosync - unregister an already registered probe.
+ * @uprobe: uprobe to remove
+ * @uc: identify which probe if multiple probes are colocated.
+ */
+void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
int err;
- if (WARN_ON(!consumer_del(uprobe, uc)))
- return;
-
+ down_write(&uprobe->register_rwsem);
+ consumer_del(uprobe, uc);
err = register_for_each_vma(uprobe, NULL);
- /* TODO : cant unregister? schedule a worker thread */
- if (!uprobe->consumers && !err)
- delete_uprobe(uprobe);
-}
-
-/*
- * uprobe_unregister - unregister an already registered probe.
- * @inode: the file in which the probe has to be removed.
- * @offset: offset from the start of the file.
- * @uc: identify which probe if multiple probes are colocated.
- */
-void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
-{
- struct uprobe *uprobe;
+ up_write(&uprobe->register_rwsem);
- uprobe = find_uprobe(inode, offset);
- if (WARN_ON(!uprobe))
+ /* TODO : cant unregister? schedule a worker thread */
+ if (unlikely(err)) {
+ uprobe_warn(current, "unregister, leaking uprobe");
return;
+ }
- down_write(&uprobe->register_rwsem);
- __uprobe_unregister(uprobe, uc);
- up_write(&uprobe->register_rwsem);
put_uprobe(uprobe);
}
-EXPORT_SYMBOL_GPL(uprobe_unregister);
+EXPORT_SYMBOL_GPL(uprobe_unregister_nosync);
-/*
- * __uprobe_register - register a probe
+void uprobe_unregister_sync(void)
+{
+ /*
+ * Now that handler_chain() and handle_uretprobe_chain() iterate over
+ * uprobe->consumers list under RCU protection without holding
+ * uprobe->register_rwsem, we need to wait for RCU grace period to
+ * make sure that we can't call into just unregistered
+ * uprobe_consumer's callbacks anymore. If we don't do that, fast and
+ * unlucky enough caller can free consumer's memory and cause
+ * handler_chain() or handle_uretprobe_chain() to do an use-after-free.
+ */
+ synchronize_rcu_tasks_trace();
+ synchronize_srcu(&uretprobes_srcu);
+}
+EXPORT_SYMBOL_GPL(uprobe_unregister_sync);
+
+/**
+ * uprobe_register - register a probe
* @inode: the file in which the probe has to be placed.
* @offset: offset from the start of the file.
+ * @ref_ctr_offset: offset of SDT marker / reference counter
* @uc: information on howto handle the probe..
*
- * Apart from the access refcount, __uprobe_register() takes a creation
+ * Apart from the access refcount, uprobe_register() takes a creation
* refcount (thro alloc_uprobe) if and only if this @uprobe is getting
* inserted into the rbtree (i.e first consumer for a @inode:@offset
* tuple). Creation refcount stops uprobe_unregister from freeing the
* @uprobe even before the register operation is complete. Creation
* refcount is released when the last @uc for the @uprobe
- * unregisters. Caller of __uprobe_register() is required to keep @inode
+ * unregisters. Caller of uprobe_register() is required to keep @inode
* (and the containing mount) referenced.
*
- * Return errno if it cannot successully install probes
- * else return 0 (success)
+ * Return: pointer to the new uprobe on success or an ERR_PTR on failure.
*/
-static int __uprobe_register(struct inode *inode, loff_t offset,
- loff_t ref_ctr_offset, struct uprobe_consumer *uc)
+struct uprobe *uprobe_register(struct inode *inode,
+ loff_t offset, loff_t ref_ctr_offset,
+ struct uprobe_consumer *uc)
{
struct uprobe *uprobe;
int ret;
/* Uprobe must have at least one set consumer */
if (!uc->handler && !uc->ret_handler)
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
/* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
- if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
- return -EIO;
+ if (!inode->i_mapping->a_ops->read_folio &&
+ !shmem_mapping(inode->i_mapping))
+ return ERR_PTR(-EIO);
/* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
/*
* This ensures that copy_from_page(), copy_to_page() and
* __update_ref_ctr() can't cross page boundary.
*/
if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
if (!IS_ALIGNED(ref_ctr_offset, sizeof(short)))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
- retry:
uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
- if (!uprobe)
- return -ENOMEM;
if (IS_ERR(uprobe))
- return PTR_ERR(uprobe);
+ return uprobe;
- /*
- * We can race with uprobe_unregister()->delete_uprobe().
- * Check uprobe_is_active() and retry if it is false.
- */
down_write(&uprobe->register_rwsem);
- ret = -EAGAIN;
- if (likely(uprobe_is_active(uprobe))) {
- consumer_add(uprobe, uc);
- ret = register_for_each_vma(uprobe, uc);
- if (ret)
- __uprobe_unregister(uprobe, uc);
- }
+ consumer_add(uprobe, uc);
+ ret = register_for_each_vma(uprobe, uc);
up_write(&uprobe->register_rwsem);
- put_uprobe(uprobe);
- if (unlikely(ret == -EAGAIN))
- goto retry;
- return ret;
-}
+ if (ret) {
+ uprobe_unregister_nosync(uprobe, uc);
+ /*
+ * Registration might have partially succeeded, so we can have
+ * this consumer being called right at this time. We need to
+ * sync here. It's ok, it's unlikely slow path.
+ */
+ uprobe_unregister_sync();
+ return ERR_PTR(ret);
+ }
-int uprobe_register(struct inode *inode, loff_t offset,
- struct uprobe_consumer *uc)
-{
- return __uprobe_register(inode, offset, 0, uc);
+ return uprobe;
}
EXPORT_SYMBOL_GPL(uprobe_register);
-int uprobe_register_refctr(struct inode *inode, loff_t offset,
- loff_t ref_ctr_offset, struct uprobe_consumer *uc)
-{
- return __uprobe_register(inode, offset, ref_ctr_offset, uc);
-}
-EXPORT_SYMBOL_GPL(uprobe_register_refctr);
-
-/*
- * uprobe_apply - unregister an already registered probe.
- * @inode: the file in which the probe has to be removed.
- * @offset: offset from the start of the file.
+/**
+ * uprobe_apply - add or remove the breakpoints according to @uc->filter
+ * @uprobe: uprobe which "owns" the breakpoint
* @uc: consumer which wants to add more or remove some breakpoints
* @add: add or remove the breakpoints
+ * Return: 0 on success or negative error code.
*/
-int uprobe_apply(struct inode *inode, loff_t offset,
- struct uprobe_consumer *uc, bool add)
+int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add)
{
- struct uprobe *uprobe;
struct uprobe_consumer *con;
int ret = -ENOENT;
- uprobe = find_uprobe(inode, offset);
- if (WARN_ON(!uprobe))
- return ret;
-
down_write(&uprobe->register_rwsem);
- for (con = uprobe->consumers; con && con != uc ; con = con->next)
- ;
- if (con)
- ret = register_for_each_vma(uprobe, add ? uc : NULL);
+
+ rcu_read_lock_trace();
+ list_for_each_entry_rcu(con, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
+ if (con == uc) {
+ ret = register_for_each_vma(uprobe, add ? uc : NULL);
+ break;
+ }
+ }
+ rcu_read_unlock_trace();
+
up_write(&uprobe->register_rwsem);
- put_uprobe(uprobe);
return ret;
}
static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
int err = 0;
mmap_read_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
unsigned long vaddr;
loff_t offset;
@@ -1303,25 +1509,27 @@ static void build_probe_list(struct inode *inode,
min = vaddr_to_offset(vma, start);
max = min + (end - start) - 1;
- spin_lock(&uprobes_treelock);
+ read_lock(&uprobes_treelock);
n = find_node_in_range(inode, min, max);
if (n) {
for (t = n; t; t = rb_prev(t)) {
u = rb_entry(t, struct uprobe, rb_node);
if (u->inode != inode || u->offset < min)
break;
- list_add(&u->pending_list, head);
- get_uprobe(u);
+ /* if uprobe went away, it's safe to ignore it */
+ if (try_get_uprobe(u))
+ list_add(&u->pending_list, head);
}
for (t = n; (t = rb_next(t)); ) {
u = rb_entry(t, struct uprobe, rb_node);
if (u->inode != inode || u->offset > max)
break;
- list_add(&u->pending_list, head);
- get_uprobe(u);
+ /* if uprobe went away, it's safe to ignore it */
+ if (try_get_uprobe(u))
+ list_add(&u->pending_list, head);
}
}
- spin_unlock(&uprobes_treelock);
+ read_unlock(&uprobes_treelock);
}
/* @vma contains reference counter, not the probed instruction. */
@@ -1354,7 +1562,7 @@ static int delayed_ref_ctr_inc(struct vm_area_struct *vma)
}
/*
- * Called from mmap_region/vma_adjust with mm->mmap_lock acquired.
+ * Called from mmap_region/vma_merge with mm->mmap_lock acquired.
*
* Currently we ignore all errors and always return 0, the callers
* can't handle the failure anyway.
@@ -1389,7 +1597,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
*/
list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
if (!fatal_signal_pending(current) &&
- filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
+ filter_chain(uprobe, vma->vm_mm)) {
unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
}
@@ -1412,9 +1620,9 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
min = vaddr_to_offset(vma, start);
max = min + (end - start) - 1;
- spin_lock(&uprobes_treelock);
+ read_lock(&uprobes_treelock);
n = find_node_in_range(inode, min, max);
- spin_unlock(&uprobes_treelock);
+ read_unlock(&uprobes_treelock);
return !!n;
}
@@ -1438,6 +1646,27 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
}
+static vm_fault_t xol_fault(const struct vm_special_mapping *sm,
+ struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct xol_area *area = vma->vm_mm->uprobes_state.xol_area;
+
+ vmf->page = area->page;
+ get_page(vmf->page);
+ return 0;
+}
+
+static int xol_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
+{
+ return -EPERM;
+}
+
+static const struct vm_special_mapping xol_mapping = {
+ .name = "[uprobes]",
+ .fault = xol_fault,
+ .mremap = xol_mremap,
+};
+
/* Slot allocation for XOL */
static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
{
@@ -1464,7 +1693,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO,
- &area->xol_mapping);
+ &xol_mapping);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto fail;
@@ -1479,13 +1708,22 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
return ret;
}
+void * __weak arch_uprobe_trampoline(unsigned long *psize)
+{
+ static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+
+ *psize = UPROBE_SWBP_INSN_SIZE;
+ return &insn;
+}
+
static struct xol_area *__create_xol_area(unsigned long vaddr)
{
struct mm_struct *mm = current->mm;
- uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+ unsigned long insns_size;
struct xol_area *area;
+ void *insns;
- area = kmalloc(sizeof(*area), GFP_KERNEL);
+ area = kzalloc(sizeof(*area), GFP_KERNEL);
if (unlikely(!area))
goto out;
@@ -1494,25 +1732,21 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
if (!area->bitmap)
goto free_area;
- area->xol_mapping.name = "[uprobes]";
- area->xol_mapping.fault = NULL;
- area->xol_mapping.pages = area->pages;
- area->pages[0] = alloc_page(GFP_HIGHUSER);
- if (!area->pages[0])
+ area->page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
+ if (!area->page)
goto free_bitmap;
- area->pages[1] = NULL;
area->vaddr = vaddr;
init_waitqueue_head(&area->wq);
/* Reserve the 1st slot for get_trampoline_vaddr() */
set_bit(0, area->bitmap);
- atomic_set(&area->slot_count, 1);
- arch_uprobe_copy_ixol(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE);
+ insns = arch_uprobe_trampoline(&insns_size);
+ arch_uprobe_copy_ixol(area->page, 0, insns, insns_size);
if (!xol_add_vma(mm, area))
return area;
- __free_page(area->pages[0]);
+ __free_page(area->page);
free_bitmap:
kfree(area->bitmap);
free_area:
@@ -1554,7 +1788,7 @@ void uprobe_clear_state(struct mm_struct *mm)
if (!area)
return;
- put_page(area->pages[0]);
+ put_page(area->page);
kfree(area->bitmap);
kfree(area);
}
@@ -1578,92 +1812,57 @@ void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
}
}
-/*
- * - search for a free slot.
- */
-static unsigned long xol_take_insn_slot(struct xol_area *area)
+static unsigned long xol_get_slot_nr(struct xol_area *area)
{
- unsigned long slot_addr;
- int slot_nr;
-
- do {
- slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
- if (slot_nr < UINSNS_PER_PAGE) {
- if (!test_and_set_bit(slot_nr, area->bitmap))
- break;
-
- slot_nr = UINSNS_PER_PAGE;
- continue;
- }
- wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
- } while (slot_nr >= UINSNS_PER_PAGE);
+ unsigned long slot_nr;
- slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES);
- atomic_inc(&area->slot_count);
+ slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
+ if (slot_nr < UINSNS_PER_PAGE) {
+ if (!test_and_set_bit(slot_nr, area->bitmap))
+ return slot_nr;
+ }
- return slot_addr;
+ return UINSNS_PER_PAGE;
}
/*
* xol_get_insn_slot - allocate a slot for xol.
- * Returns the allocated slot address or 0.
*/
-static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
+static bool xol_get_insn_slot(struct uprobe *uprobe, struct uprobe_task *utask)
{
- struct xol_area *area;
- unsigned long xol_vaddr;
+ struct xol_area *area = get_xol_area();
+ unsigned long slot_nr;
- area = get_xol_area();
if (!area)
- return 0;
+ return false;
- xol_vaddr = xol_take_insn_slot(area);
- if (unlikely(!xol_vaddr))
- return 0;
+ wait_event(area->wq, (slot_nr = xol_get_slot_nr(area)) < UINSNS_PER_PAGE);
- arch_uprobe_copy_ixol(area->pages[0], xol_vaddr,
+ utask->xol_vaddr = area->vaddr + slot_nr * UPROBE_XOL_SLOT_BYTES;
+ arch_uprobe_copy_ixol(area->page, utask->xol_vaddr,
&uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
-
- return xol_vaddr;
+ return true;
}
/*
- * xol_free_insn_slot - If slot was earlier allocated by
- * @xol_get_insn_slot(), make the slot available for
- * subsequent requests.
+ * xol_free_insn_slot - free the slot allocated by xol_get_insn_slot()
*/
-static void xol_free_insn_slot(struct task_struct *tsk)
+static void xol_free_insn_slot(struct uprobe_task *utask)
{
- struct xol_area *area;
- unsigned long vma_end;
- unsigned long slot_addr;
+ struct xol_area *area = current->mm->uprobes_state.xol_area;
+ unsigned long offset = utask->xol_vaddr - area->vaddr;
+ unsigned int slot_nr;
- if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask)
+ utask->xol_vaddr = 0;
+ /* xol_vaddr must fit into [area->vaddr, area->vaddr + PAGE_SIZE) */
+ if (WARN_ON_ONCE(offset >= PAGE_SIZE))
return;
- slot_addr = tsk->utask->xol_vaddr;
- if (unlikely(!slot_addr))
- return;
-
- area = tsk->mm->uprobes_state.xol_area;
- vma_end = area->vaddr + PAGE_SIZE;
- if (area->vaddr <= slot_addr && slot_addr < vma_end) {
- unsigned long offset;
- int slot_nr;
-
- offset = slot_addr - area->vaddr;
- slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
- if (slot_nr >= UINSNS_PER_PAGE)
- return;
-
- clear_bit(slot_nr, area->bitmap);
- atomic_dec(&area->slot_count);
- smp_mb__after_atomic(); /* pairs with prepare_to_wait() */
- if (waitqueue_active(&area->wq))
- wake_up(&area->wq);
-
- tsk->utask->xol_vaddr = 0;
- }
+ slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
+ clear_bit(slot_nr, area->bitmap);
+ smp_mb__after_atomic(); /* pairs with prepare_to_wait() */
+ if (waitqueue_active(&area->wq))
+ wake_up(&area->wq);
}
void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
@@ -1702,12 +1901,57 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
return instruction_pointer(regs);
}
-static struct return_instance *free_ret_instance(struct return_instance *ri)
+static void ri_pool_push(struct uprobe_task *utask, struct return_instance *ri)
{
- struct return_instance *next = ri->next;
- put_uprobe(ri->uprobe);
- kfree(ri);
- return next;
+ ri->cons_cnt = 0;
+ ri->next = utask->ri_pool;
+ utask->ri_pool = ri;
+}
+
+static struct return_instance *ri_pool_pop(struct uprobe_task *utask)
+{
+ struct return_instance *ri = utask->ri_pool;
+
+ if (likely(ri))
+ utask->ri_pool = ri->next;
+
+ return ri;
+}
+
+static void ri_free(struct return_instance *ri)
+{
+ kfree(ri->extra_consumers);
+ kfree_rcu(ri, rcu);
+}
+
+static void free_ret_instance(struct uprobe_task *utask,
+ struct return_instance *ri, bool cleanup_hprobe)
+{
+ unsigned seq;
+
+ if (cleanup_hprobe) {
+ enum hprobe_state hstate;
+
+ (void)hprobe_consume(&ri->hprobe, &hstate);
+ hprobe_finalize(&ri->hprobe, hstate);
+ }
+
+ /*
+ * At this point return_instance is unlinked from utask's
+ * return_instances list and this has become visible to ri_timer().
+ * If seqcount now indicates that ri_timer's return instance
+ * processing loop isn't active, we can return ri into the pool of
+ * to-be-reused return instances for future uretprobes. If ri_timer()
+ * happens to be running right now, though, we fallback to safety and
+ * just perform RCU-delated freeing of ri.
+ */
+ if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) {
+ /* immediate reuse of ri without RCU GP is OK */
+ ri_pool_push(utask, ri);
+ } else {
+ /* we might be racing with ri_timer(), so play it safe */
+ ri_free(ri);
+ }
}
/*
@@ -1717,25 +1961,73 @@ static struct return_instance *free_ret_instance(struct return_instance *ri)
void uprobe_free_utask(struct task_struct *t)
{
struct uprobe_task *utask = t->utask;
- struct return_instance *ri;
+ struct return_instance *ri, *ri_next;
if (!utask)
return;
- if (utask->active_uprobe)
- put_uprobe(utask->active_uprobe);
+ t->utask = NULL;
+ WARN_ON_ONCE(utask->active_uprobe || utask->xol_vaddr);
+
+ timer_delete_sync(&utask->ri_timer);
ri = utask->return_instances;
- while (ri)
- ri = free_ret_instance(ri);
+ while (ri) {
+ ri_next = ri->next;
+ free_ret_instance(utask, ri, true /* cleanup_hprobe */);
+ ri = ri_next;
+ }
+
+ /* free_ret_instance() above might add to ri_pool, so this loop should come last */
+ ri = utask->ri_pool;
+ while (ri) {
+ ri_next = ri->next;
+ ri_free(ri);
+ ri = ri_next;
+ }
- xol_free_insn_slot(t);
kfree(utask);
- t->utask = NULL;
+}
+
+#define RI_TIMER_PERIOD (HZ / 10) /* 100 ms */
+
+#define for_each_ret_instance_rcu(pos, head) \
+ for (pos = rcu_dereference_raw(head); pos; pos = rcu_dereference_raw(pos->next))
+
+static void ri_timer(struct timer_list *timer)
+{
+ struct uprobe_task *utask = container_of(timer, struct uprobe_task, ri_timer);
+ struct return_instance *ri;
+
+ /* SRCU protects uprobe from reuse for the cmpxchg() inside hprobe_expire(). */
+ guard(srcu)(&uretprobes_srcu);
+ /* RCU protects return_instance from freeing. */
+ guard(rcu)();
+
+ write_seqcount_begin(&utask->ri_seqcount);
+
+ for_each_ret_instance_rcu(ri, utask->return_instances)
+ hprobe_expire(&ri->hprobe, false);
+
+ write_seqcount_end(&utask->ri_seqcount);
+}
+
+static struct uprobe_task *alloc_utask(void)
+{
+ struct uprobe_task *utask;
+
+ utask = kzalloc(sizeof(*utask), GFP_KERNEL);
+ if (!utask)
+ return NULL;
+
+ timer_setup(&utask->ri_timer, ri_timer, 0);
+ seqcount_init(&utask->ri_seqcount);
+
+ return utask;
}
/*
- * Allocate a uprobe_task object for the task if if necessary.
+ * Allocate a uprobe_task object for the task if necessary.
* Called when the thread hits a breakpoint.
*
* Returns:
@@ -1745,44 +2037,87 @@ void uprobe_free_utask(struct task_struct *t)
static struct uprobe_task *get_utask(void)
{
if (!current->utask)
- current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ current->utask = alloc_utask();
return current->utask;
}
+static struct return_instance *alloc_return_instance(struct uprobe_task *utask)
+{
+ struct return_instance *ri;
+
+ ri = ri_pool_pop(utask);
+ if (ri)
+ return ri;
+
+ ri = kzalloc(sizeof(*ri), GFP_KERNEL);
+ if (!ri)
+ return ZERO_SIZE_PTR;
+
+ return ri;
+}
+
+static struct return_instance *dup_return_instance(struct return_instance *old)
+{
+ struct return_instance *ri;
+
+ ri = kmemdup(old, sizeof(*ri), GFP_KERNEL);
+ if (!ri)
+ return NULL;
+
+ if (unlikely(old->cons_cnt > 1)) {
+ ri->extra_consumers = kmemdup(old->extra_consumers,
+ sizeof(ri->extra_consumers[0]) * (old->cons_cnt - 1),
+ GFP_KERNEL);
+ if (!ri->extra_consumers) {
+ kfree(ri);
+ return NULL;
+ }
+ }
+
+ return ri;
+}
+
static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
{
struct uprobe_task *n_utask;
struct return_instance **p, *o, *n;
+ struct uprobe *uprobe;
- n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ n_utask = alloc_utask();
if (!n_utask)
return -ENOMEM;
t->utask = n_utask;
+ /* protect uprobes from freeing, we'll need try_get_uprobe() them */
+ guard(srcu)(&uretprobes_srcu);
+
p = &n_utask->return_instances;
for (o = o_utask->return_instances; o; o = o->next) {
- n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
+ n = dup_return_instance(o);
if (!n)
return -ENOMEM;
- *n = *o;
- get_uprobe(n->uprobe);
- n->next = NULL;
+ /* if uprobe is non-NULL, we'll have an extra refcount for uprobe */
+ uprobe = hprobe_expire(&o->hprobe, true);
+
+ /*
+ * New utask will have stable properly refcounted uprobe or
+ * NULL. Even if we failed to get refcounted uprobe, we still
+ * need to preserve full set of return_instances for proper
+ * uretprobe handling and nesting in forked task.
+ */
+ hprobe_init_stable(&n->hprobe, uprobe);
- *p = n;
+ n->next = NULL;
+ rcu_assign_pointer(*p, n);
p = &n->next;
+
n_utask->depth++;
}
return 0;
}
-static void uprobe_warn(struct task_struct *t, const char *msg)
-{
- pr_warn("uprobe: %s:%d failed to %s\n",
- current->comm, current->pid, msg);
-}
-
static void dup_xol_work(struct callback_head *work)
{
if (current->flags & PF_EXITING)
@@ -1832,7 +2167,7 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags)
*
* Returns -1 in case the xol_area is not allocated.
*/
-static unsigned long get_trampoline_vaddr(void)
+unsigned long uprobe_get_trampoline_vaddr(void)
{
struct xol_area *area;
unsigned long trampoline_vaddr = -1;
@@ -1848,45 +2183,41 @@ static unsigned long get_trampoline_vaddr(void)
static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
struct pt_regs *regs)
{
- struct return_instance *ri = utask->return_instances;
+ struct return_instance *ri = utask->return_instances, *ri_next;
enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL;
while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) {
- ri = free_ret_instance(ri);
+ ri_next = ri->next;
+ rcu_assign_pointer(utask->return_instances, ri_next);
utask->depth--;
+
+ free_ret_instance(utask, ri, true /* cleanup_hprobe */);
+ ri = ri_next;
}
- utask->return_instances = ri;
}
-static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
+static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs,
+ struct return_instance *ri)
{
- struct return_instance *ri;
- struct uprobe_task *utask;
+ struct uprobe_task *utask = current->utask;
unsigned long orig_ret_vaddr, trampoline_vaddr;
bool chained;
+ int srcu_idx;
if (!get_xol_area())
- return;
-
- utask = get_utask();
- if (!utask)
- return;
+ goto free;
if (utask->depth >= MAX_URETPROBE_DEPTH) {
printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
" nestedness limit pid/tgid=%d/%d\n",
current->pid, current->tgid);
- return;
+ goto free;
}
- ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
- if (!ri)
- return;
-
- trampoline_vaddr = get_trampoline_vaddr();
+ trampoline_vaddr = uprobe_get_trampoline_vaddr();
orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
if (orig_ret_vaddr == -1)
- goto fail;
+ goto free;
/* drop the entries invalidated by longjmp() */
chained = (orig_ret_vaddr == trampoline_vaddr);
@@ -1904,54 +2235,60 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
* attack from user-space.
*/
uprobe_warn(current, "handle tail call");
- goto fail;
+ goto free;
}
orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
}
- ri->uprobe = get_uprobe(uprobe);
+ /* __srcu_read_lock() because SRCU lock survives switch to user space */
+ srcu_idx = __srcu_read_lock(&uretprobes_srcu);
+
ri->func = instruction_pointer(regs);
ri->stack = user_stack_pointer(regs);
ri->orig_ret_vaddr = orig_ret_vaddr;
ri->chained = chained;
utask->depth++;
+
+ hprobe_init_leased(&ri->hprobe, uprobe, srcu_idx);
ri->next = utask->return_instances;
- utask->return_instances = ri;
+ rcu_assign_pointer(utask->return_instances, ri);
+
+ mod_timer(&utask->ri_timer, jiffies + RI_TIMER_PERIOD);
return;
- fail:
- kfree(ri);
+free:
+ ri_free(ri);
}
/* Prepare to single-step probed instruction out of line. */
static int
pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
{
- struct uprobe_task *utask;
- unsigned long xol_vaddr;
+ struct uprobe_task *utask = current->utask;
int err;
- utask = get_utask();
- if (!utask)
- return -ENOMEM;
+ if (!try_get_uprobe(uprobe))
+ return -EINVAL;
- xol_vaddr = xol_get_insn_slot(uprobe);
- if (!xol_vaddr)
- return -ENOMEM;
+ if (!xol_get_insn_slot(uprobe, utask)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
- utask->xol_vaddr = xol_vaddr;
utask->vaddr = bp_vaddr;
-
err = arch_uprobe_pre_xol(&uprobe->arch, regs);
if (unlikely(err)) {
- xol_free_insn_slot(current);
- return err;
+ xol_free_insn_slot(utask);
+ goto err_out;
}
utask->active_uprobe = uprobe;
utask->state = UTASK_SSTEP;
return 0;
+err_out:
+ put_uprobe(uprobe);
+ return err;
}
/*
@@ -1989,9 +2326,10 @@ bool uprobe_deny_signal(void)
static void mmf_recalc_uprobes(struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!valid_vma(vma, false))
continue;
/*
@@ -2023,14 +2361,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
if (likely(result == 0))
goto out;
- /*
- * The NULL 'tsk' here ensures that any faults that occur here
- * will not be accounted to the task. 'mm' *is* current->mm,
- * but we treat this as a 'remote' access since it is
- * essentially a kernel access to the memory.
- */
- result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page,
- NULL, NULL);
+ result = get_user_pages(vaddr, 1, FOLL_FORCE, &page);
if (result < 0)
return result;
@@ -2041,20 +2372,66 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
return is_trap_insn(&opcode);
}
-static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
+static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
+{
+ struct mm_struct *mm = current->mm;
+ struct uprobe *uprobe = NULL;
+ struct vm_area_struct *vma;
+ struct file *vm_file;
+ loff_t offset;
+ unsigned int seq;
+
+ guard(rcu)();
+
+ if (!mmap_lock_speculate_try_begin(mm, &seq))
+ return NULL;
+
+ vma = vma_lookup(mm, bp_vaddr);
+ if (!vma)
+ return NULL;
+
+ /*
+ * vm_file memory can be reused for another instance of struct file,
+ * but can't be freed from under us, so it's safe to read fields from
+ * it, even if the values are some garbage values; ultimately
+ * find_uprobe_rcu() + mmap_lock_speculation_end() check will ensure
+ * that whatever we speculatively found is correct
+ */
+ vm_file = READ_ONCE(vma->vm_file);
+ if (!vm_file)
+ return NULL;
+
+ offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start);
+ uprobe = find_uprobe_rcu(vm_file->f_inode, offset);
+ if (!uprobe)
+ return NULL;
+
+ /* now double check that nothing about MM changed */
+ if (mmap_lock_speculate_retry(mm, seq))
+ return NULL;
+
+ return uprobe;
+}
+
+/* assumes being inside RCU protected region */
+static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp)
{
struct mm_struct *mm = current->mm;
struct uprobe *uprobe = NULL;
struct vm_area_struct *vma;
+ uprobe = find_active_uprobe_speculative(bp_vaddr);
+ if (uprobe)
+ return uprobe;
+
mmap_read_lock(mm);
- vma = find_vma(mm, bp_vaddr);
- if (vma && vma->vm_start <= bp_vaddr) {
- if (valid_vma(vma, false)) {
+ vma = vma_lookup(mm, bp_vaddr);
+ if (vma) {
+ if (vma->vm_file) {
struct inode *inode = file_inode(vma->vm_file);
loff_t offset = vaddr_to_offset(vma, bp_vaddr);
- uprobe = find_uprobe(inode, offset);
+ uprobe = find_uprobe_rcu(inode, offset);
}
if (!uprobe)
@@ -2070,50 +2447,125 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
return uprobe;
}
+static struct return_instance *push_consumer(struct return_instance *ri, __u64 id, __u64 cookie)
+{
+ struct return_consumer *ric;
+
+ if (unlikely(ri == ZERO_SIZE_PTR))
+ return ri;
+
+ if (unlikely(ri->cons_cnt > 0)) {
+ ric = krealloc(ri->extra_consumers, sizeof(*ric) * ri->cons_cnt, GFP_KERNEL);
+ if (!ric) {
+ ri_free(ri);
+ return ZERO_SIZE_PTR;
+ }
+ ri->extra_consumers = ric;
+ }
+
+ ric = likely(ri->cons_cnt == 0) ? &ri->consumer : &ri->extra_consumers[ri->cons_cnt - 1];
+ ric->id = id;
+ ric->cookie = cookie;
+
+ ri->cons_cnt++;
+ return ri;
+}
+
+static struct return_consumer *
+return_consumer_find(struct return_instance *ri, int *iter, int id)
+{
+ struct return_consumer *ric;
+ int idx;
+
+ for (idx = *iter; idx < ri->cons_cnt; idx++)
+ {
+ ric = likely(idx == 0) ? &ri->consumer : &ri->extra_consumers[idx - 1];
+ if (ric->id == id) {
+ *iter = idx + 1;
+ return ric;
+ }
+ }
+
+ return NULL;
+}
+
+static bool ignore_ret_handler(int rc)
+{
+ return rc == UPROBE_HANDLER_REMOVE || rc == UPROBE_HANDLER_IGNORE;
+}
+
static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
{
struct uprobe_consumer *uc;
- int remove = UPROBE_HANDLER_REMOVE;
- bool need_prep = false; /* prepare return uprobe, when needed */
+ bool has_consumers = false, remove = true;
+ struct return_instance *ri = NULL;
+ struct uprobe_task *utask = current->utask;
+
+ utask->auprobe = &uprobe->arch;
- down_read(&uprobe->register_rwsem);
- for (uc = uprobe->consumers; uc; uc = uc->next) {
+ list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
+ bool session = uc->handler && uc->ret_handler;
+ __u64 cookie = 0;
int rc = 0;
if (uc->handler) {
- rc = uc->handler(uc, regs);
- WARN(rc & ~UPROBE_HANDLER_MASK,
+ rc = uc->handler(uc, regs, &cookie);
+ WARN(rc < 0 || rc > 2,
"bad rc=0x%x from %ps()\n", rc, uc->handler);
}
- if (uc->ret_handler)
- need_prep = true;
+ remove &= rc == UPROBE_HANDLER_REMOVE;
+ has_consumers = true;
+
+ if (!uc->ret_handler || ignore_ret_handler(rc))
+ continue;
- remove &= rc;
+ if (!ri)
+ ri = alloc_return_instance(utask);
+
+ if (session)
+ ri = push_consumer(ri, uc->id, cookie);
}
+ utask->auprobe = NULL;
+
+ if (!ZERO_OR_NULL_PTR(ri))
+ prepare_uretprobe(uprobe, regs, ri);
+
+ if (remove && has_consumers) {
+ down_read(&uprobe->register_rwsem);
- if (need_prep && !remove)
- prepare_uretprobe(uprobe, regs); /* put bp at return */
+ /* re-check that removal is still required, this time under lock */
+ if (!filter_chain(uprobe, current->mm)) {
+ WARN_ON(!uprobe_is_active(uprobe));
+ unapply_uprobe(uprobe, current->mm);
+ }
- if (remove && uprobe->consumers) {
- WARN_ON(!uprobe_is_active(uprobe));
- unapply_uprobe(uprobe, current->mm);
+ up_read(&uprobe->register_rwsem);
}
- up_read(&uprobe->register_rwsem);
}
static void
-handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
+handle_uretprobe_chain(struct return_instance *ri, struct uprobe *uprobe, struct pt_regs *regs)
{
- struct uprobe *uprobe = ri->uprobe;
+ struct return_consumer *ric;
struct uprobe_consumer *uc;
+ int ric_idx = 0;
+
+ /* all consumers unsubscribed meanwhile */
+ if (unlikely(!uprobe))
+ return;
+
+ rcu_read_lock_trace();
+ list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
+ bool session = uc->handler && uc->ret_handler;
- down_read(&uprobe->register_rwsem);
- for (uc = uprobe->consumers; uc; uc = uc->next) {
- if (uc->ret_handler)
- uc->ret_handler(uc, ri->func, regs);
+ if (uc->ret_handler) {
+ ric = return_consumer_find(ri, &ric_idx, uc->id);
+ if (!session || ric)
+ uc->ret_handler(uc, ri->func, regs, ric ? &ric->cookie : NULL);
+ }
}
- up_read(&uprobe->register_rwsem);
+ rcu_read_unlock_trace();
}
static struct return_instance *find_next_ret_chain(struct return_instance *ri)
@@ -2128,10 +2580,12 @@ static struct return_instance *find_next_ret_chain(struct return_instance *ri)
return ri;
}
-static void handle_trampoline(struct pt_regs *regs)
+void uprobe_handle_trampoline(struct pt_regs *regs)
{
struct uprobe_task *utask;
- struct return_instance *ri, *next;
+ struct return_instance *ri, *ri_next, *next_chain;
+ struct uprobe *uprobe;
+ enum hprobe_state hstate;
bool valid;
utask = current->utask;
@@ -2149,25 +2603,39 @@ static void handle_trampoline(struct pt_regs *regs)
* or NULL; the latter case means that nobody but ri->func
* could hit this trampoline on return. TODO: sigaltstack().
*/
- next = find_next_ret_chain(ri);
- valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs);
+ next_chain = find_next_ret_chain(ri);
+ valid = !next_chain || arch_uretprobe_is_alive(next_chain, RP_CHECK_RET, regs);
instruction_pointer_set(regs, ri->orig_ret_vaddr);
do {
- if (valid)
- handle_uretprobe_chain(ri, regs);
- ri = free_ret_instance(ri);
+ /* pop current instance from the stack of pending return instances,
+ * as it's not pending anymore: we just fixed up original
+ * instruction pointer in regs and are about to call handlers;
+ * this allows fixup_uretprobe_trampoline_entries() to properly fix up
+ * captured stack traces from uretprobe handlers, in which pending
+ * trampoline addresses on the stack are replaced with correct
+ * original return addresses
+ */
+ ri_next = ri->next;
+ rcu_assign_pointer(utask->return_instances, ri_next);
utask->depth--;
- } while (ri != next);
+
+ uprobe = hprobe_consume(&ri->hprobe, &hstate);
+ if (valid)
+ handle_uretprobe_chain(ri, uprobe, regs);
+ hprobe_finalize(&ri->hprobe, hstate);
+
+ /* We already took care of hprobe, no need to waste more time on that. */
+ free_ret_instance(utask, ri, false /* !cleanup_hprobe */);
+ ri = ri_next;
+ } while (ri != next_chain);
} while (!valid);
- utask->return_instances = ri;
return;
- sigill:
+sigill:
uprobe_warn(current, "handle uretprobe, sending SIGILL.");
force_sig(SIGILL);
-
}
bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
@@ -2192,10 +2660,12 @@ static void handle_swbp(struct pt_regs *regs)
int is_swbp;
bp_vaddr = uprobe_get_swbp_addr(regs);
- if (bp_vaddr == get_trampoline_vaddr())
- return handle_trampoline(regs);
+ if (bp_vaddr == uprobe_get_trampoline_vaddr())
+ return uprobe_handle_trampoline(regs);
+
+ rcu_read_lock_trace();
- uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
+ uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
if (!uprobe) {
if (is_swbp > 0) {
/* No matching uprobe; signal SIGTRAP. */
@@ -2211,7 +2681,7 @@ static void handle_swbp(struct pt_regs *regs)
*/
instruction_pointer_set(regs, bp_vaddr);
}
- return;
+ goto out;
}
/* change it in advance for ->handler() and restart */
@@ -2246,12 +2716,12 @@ static void handle_swbp(struct pt_regs *regs)
if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
goto out;
- if (!pre_ssout(uprobe, regs, bp_vaddr))
- return;
+ if (pre_ssout(uprobe, regs, bp_vaddr))
+ goto out;
- /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
out:
- put_uprobe(uprobe);
+ /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
+ rcu_read_unlock_trace();
}
/*
@@ -2274,7 +2744,7 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
put_uprobe(uprobe);
utask->active_uprobe = NULL;
utask->state = UTASK_RUNNING;
- xol_free_insn_slot(current);
+ xol_free_insn_slot(utask);
spin_lock_irq(&current->sighand->siglock);
recalc_sigpending(); /* see uprobe_deny_signal() */