diff options
Diffstat (limited to 'kernel/events/uprobes.c')
-rw-r--r-- | kernel/events/uprobes.c | 1400 |
1 files changed, 935 insertions, 465 deletions
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 5f8b0c52fd2e..b4ca8898fe17 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -15,18 +15,20 @@ #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/mm.h> -#include <linux/sched/coredump.h> #include <linux/export.h> #include <linux/rmap.h> /* anon_vma_prepare */ -#include <linux/mmu_notifier.h> /* set_pte_at_notify */ -#include <linux/swap.h> /* try_to_free_swap */ +#include <linux/mmu_notifier.h> +#include <linux/swap.h> /* folio_free_swap */ #include <linux/ptrace.h> /* user_enable_single_step */ #include <linux/kdebug.h> /* notifier mechanism */ -#include "../../mm/internal.h" /* munlock_vma_page */ #include <linux/percpu-rwsem.h> #include <linux/task_work.h> #include <linux/shmem_fs.h> #include <linux/khugepaged.h> +#include <linux/rcupdate_trace.h> +#include <linux/workqueue.h> +#include <linux/srcu.h> +#include <linux/oom.h> /* check_stable_address_space */ #include <linux/uprobes.h> @@ -40,7 +42,8 @@ static struct rb_root uprobes_tree = RB_ROOT; */ #define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree) -static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ +static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */ +static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock); #define UPROBES_HASH_SZ 13 /* serialize uprobe->pending_list */ @@ -49,6 +52,9 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem); +/* Covers return_instance's uprobe lifetime. */ +DEFINE_STATIC_SRCU(uretprobes_srcu); + /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 @@ -58,11 +64,15 @@ struct uprobe { struct rw_semaphore register_rwsem; struct rw_semaphore consumer_rwsem; struct list_head pending_list; - struct uprobe_consumer *consumers; + struct list_head consumers; struct inode *inode; /* Also hold a ref to inode */ + union { + struct rcu_head rcu; + struct work_struct work; + }; loff_t offset; loff_t ref_ctr_offset; - unsigned long flags; + unsigned long flags; /* "unsigned long" so bitops work */ /* * The generic code assumes that it has two members of unknown type @@ -97,11 +107,9 @@ static LIST_HEAD(delayed_uprobe_list); */ struct xol_area { wait_queue_head_t wq; /* if all slots are busy */ - atomic_t slot_count; /* number of in-use slots */ unsigned long *bitmap; /* 0 = free slot */ - struct vm_special_mapping xol_mapping; - struct page *pages[2]; + struct page *page; /* * We keep the vma's vm_start rather than a pointer to the vma * itself. The probed process or a naughty kernel module could make @@ -110,6 +118,11 @@ struct xol_area { unsigned long vaddr; /* Page(s) of instruction slots */ }; +static void uprobe_warn(struct task_struct *t, const char *msg) +{ + pr_warn("uprobe: %s:%d failed to %s\n", current->comm, current->pid, msg); +} + /* * valid_vma: Verify if the specified vma is an executable vma * Relax restrictions while unregistering: vm_flags might have @@ -154,26 +167,25 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) { + struct folio *old_folio = page_folio(old_page); + struct folio *new_folio; struct mm_struct *mm = vma->vm_mm; - struct page_vma_mapped_walk pvmw = { - .page = compound_head(old_page), - .vma = vma, - .address = addr, - }; + DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0); int err; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + PAGE_SIZE); if (new_page) { - err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL); + new_folio = page_folio(new_page); + err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL); if (err) return err; } - /* For try_to_free_swap() and munlock_vma_page() below */ - lock_page(old_page); + /* For folio_free_swap() below */ + folio_lock(old_folio); mmu_notifier_invalidate_range_start(&range); err = -EAGAIN; @@ -182,37 +194,34 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, VM_BUG_ON_PAGE(addr != pvmw.address, old_page); if (new_page) { - get_page(new_page); - page_add_new_anon_rmap(new_page, vma, addr, false); - lru_cache_add_active_or_unevictable(new_page, vma); + folio_get(new_folio); + folio_add_new_anon_rmap(new_folio, vma, addr, RMAP_EXCLUSIVE); + folio_add_lru_vma(new_folio, vma); } else /* no new page, just dec_mm_counter for old_page */ dec_mm_counter(mm, MM_ANONPAGES); - if (!PageAnon(old_page)) { - dec_mm_counter(mm, mm_counter_file(old_page)); + if (!folio_test_anon(old_folio)) { + dec_mm_counter(mm, mm_counter_file(old_folio)); inc_mm_counter(mm, MM_ANONPAGES); } - flush_cache_page(vma, addr, pte_pfn(*pvmw.pte)); - ptep_clear_flush_notify(vma, addr, pvmw.pte); + flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte))); + ptep_clear_flush(vma, addr, pvmw.pte); if (new_page) - set_pte_at_notify(mm, addr, pvmw.pte, - mk_pte(new_page, vma->vm_page_prot)); + set_pte_at(mm, addr, pvmw.pte, + mk_pte(new_page, vma->vm_page_prot)); - page_remove_rmap(old_page, false); - if (!page_mapped(old_page)) - try_to_free_swap(old_page); + folio_remove_rmap_pte(old_folio, old_page, vma); + if (!folio_mapped(old_folio)) + folio_free_swap(old_folio); page_vma_mapped_walk_done(&pvmw); - - if (vma->vm_flags & VM_LOCKED) - munlock_vma_page(old_page); - put_page(old_page); + folio_put(old_folio); err = 0; unlock: mmu_notifier_invalidate_range_end(&range); - unlock_page(old_page); + folio_unlock(old_folio); return err; } @@ -355,9 +364,10 @@ static bool valid_ref_ctr_vma(struct uprobe *uprobe, static struct vm_area_struct * find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *tmp; - for (tmp = mm->mmap; tmp; tmp = tmp->vm_next) + for_each_vma(vmi, tmp) if (valid_ref_ctr_vma(uprobe, tmp)) return tmp; @@ -369,15 +379,14 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) { void *kaddr; struct page *page; - struct vm_area_struct *vma; int ret; short *ptr; if (!vaddr || !d) return -EINVAL; - ret = get_user_pages_remote(NULL, mm, vaddr, 1, - FOLL_WRITE, &page, &vma, NULL); + ret = get_user_pages_remote(mm, vaddr, 1, + FOLL_WRITE, &page, NULL); if (unlikely(ret <= 0)) { /* * We are asking for 1 page. If get_user_pages_remote() fails, @@ -408,7 +417,7 @@ static void update_ref_ctr_warn(struct uprobe *uprobe, struct mm_struct *mm, short d) { pr_warn("ref_ctr %s failed for inode: 0x%lx offset: " - "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n", + "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n", d > 0 ? "increment" : "decrement", uprobe->inode->i_ino, (unsigned long long) uprobe->offset, (unsigned long long) uprobe->ref_ctr_offset, mm); @@ -453,11 +462,12 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, * that have fixed length instructions. * * uprobe_write_opcode - write the opcode at a given virtual address. + * @auprobe: arch specific probepoint information. * @mm: the probed process address space. * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. * - * Called with mm->mmap_lock held for write. + * Called with mm->mmap_lock held for read or write. * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, @@ -477,15 +487,19 @@ retry: if (is_register) gup_flags |= FOLL_SPLIT_PMD; /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(NULL, mm, vaddr, 1, gup_flags, - &old_page, &vma, NULL); - if (ret <= 0) - return ret; + old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); + if (IS_ERR(old_page)) + return PTR_ERR(old_page); ret = verify_opcode(old_page, vaddr, &opcode); if (ret <= 0) goto put_old; + if (is_zero_page(old_page)) { + ret = -EINVAL; + goto put_old; + } + if (WARN(!is_register && PageCompound(old_page), "uprobe unregister should never work on compound page\n")) { ret = -EINVAL; @@ -542,7 +556,7 @@ retry: } } - ret = __replace_page(vma, vaddr, old_page, new_page); + ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page); if (new_page) put_page(new_page); put_old: @@ -557,7 +571,7 @@ put_old: /* try collapse pmd for compound page */ if (!ret && orig_page_huge) - collapse_pte_mapped_thp(mm, vaddr); + collapse_pte_mapped_thp(mm, vaddr, false); return ret; } @@ -592,125 +606,350 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v *(uprobe_opcode_t *)&auprobe->insn); } +/* uprobe should have guaranteed positive refcount */ static struct uprobe *get_uprobe(struct uprobe *uprobe) { refcount_inc(&uprobe->ref); return uprobe; } +/* + * uprobe should have guaranteed lifetime, which can be either of: + * - caller already has refcount taken (and wants an extra one); + * - uprobe is RCU protected and won't be freed until after grace period; + * - we are holding uprobes_treelock (for read or write, doesn't matter). + */ +static struct uprobe *try_get_uprobe(struct uprobe *uprobe) +{ + if (refcount_inc_not_zero(&uprobe->ref)) + return uprobe; + return NULL; +} + +static inline bool uprobe_is_active(struct uprobe *uprobe) +{ + return !RB_EMPTY_NODE(&uprobe->rb_node); +} + +static void uprobe_free_rcu_tasks_trace(struct rcu_head *rcu) +{ + struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); + + kfree(uprobe); +} + +static void uprobe_free_srcu(struct rcu_head *rcu) +{ + struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); + + call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu_tasks_trace); +} + +static void uprobe_free_deferred(struct work_struct *work) +{ + struct uprobe *uprobe = container_of(work, struct uprobe, work); + + write_lock(&uprobes_treelock); + + if (uprobe_is_active(uprobe)) { + write_seqcount_begin(&uprobes_seqcount); + rb_erase(&uprobe->rb_node, &uprobes_tree); + write_seqcount_end(&uprobes_seqcount); + } + + write_unlock(&uprobes_treelock); + + /* + * If application munmap(exec_vma) before uprobe_unregister() + * gets called, we don't get a chance to remove uprobe from + * delayed_uprobe_list from remove_breakpoint(). Do it here. + */ + mutex_lock(&delayed_uprobe_lock); + delayed_uprobe_remove(uprobe, NULL); + mutex_unlock(&delayed_uprobe_lock); + + /* start srcu -> rcu_tasks_trace -> kfree chain */ + call_srcu(&uretprobes_srcu, &uprobe->rcu, uprobe_free_srcu); +} + static void put_uprobe(struct uprobe *uprobe) { - if (refcount_dec_and_test(&uprobe->ref)) { + if (!refcount_dec_and_test(&uprobe->ref)) + return; + + INIT_WORK(&uprobe->work, uprobe_free_deferred); + schedule_work(&uprobe->work); +} + +/* Initialize hprobe as SRCU-protected "leased" uprobe */ +static void hprobe_init_leased(struct hprobe *hprobe, struct uprobe *uprobe, int srcu_idx) +{ + WARN_ON(!uprobe); + hprobe->state = HPROBE_LEASED; + hprobe->uprobe = uprobe; + hprobe->srcu_idx = srcu_idx; +} + +/* Initialize hprobe as refcounted ("stable") uprobe (uprobe can be NULL). */ +static void hprobe_init_stable(struct hprobe *hprobe, struct uprobe *uprobe) +{ + hprobe->state = uprobe ? HPROBE_STABLE : HPROBE_GONE; + hprobe->uprobe = uprobe; + hprobe->srcu_idx = -1; +} + +/* + * hprobe_consume() fetches hprobe's underlying uprobe and detects whether + * uprobe is SRCU protected or is refcounted. hprobe_consume() can be + * used only once for a given hprobe. + * + * Caller has to call hprobe_finalize() and pass previous hprobe_state, so + * that hprobe_finalize() can perform SRCU unlock or put uprobe, whichever + * is appropriate. + */ +static inline struct uprobe *hprobe_consume(struct hprobe *hprobe, enum hprobe_state *hstate) +{ + *hstate = xchg(&hprobe->state, HPROBE_CONSUMED); + switch (*hstate) { + case HPROBE_LEASED: + case HPROBE_STABLE: + return hprobe->uprobe; + case HPROBE_GONE: /* uprobe is NULL, no SRCU */ + case HPROBE_CONSUMED: /* uprobe was finalized already, do nothing */ + return NULL; + default: + WARN(1, "hprobe invalid state %d", *hstate); + return NULL; + } +} + +/* + * Reset hprobe state and, if hprobe was LEASED, release SRCU lock. + * hprobe_finalize() can only be used from current context after + * hprobe_consume() call (which determines uprobe and hstate value). + */ +static void hprobe_finalize(struct hprobe *hprobe, enum hprobe_state hstate) +{ + switch (hstate) { + case HPROBE_LEASED: + __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx); + break; + case HPROBE_STABLE: + put_uprobe(hprobe->uprobe); + break; + case HPROBE_GONE: + case HPROBE_CONSUMED: + break; + default: + WARN(1, "hprobe invalid state %d", hstate); + break; + } +} + +/* + * Attempt to switch (atomically) uprobe from being SRCU protected (LEASED) + * to refcounted (STABLE) state. Competes with hprobe_consume(); only one of + * them can win the race to perform SRCU unlocking. Whoever wins must perform + * SRCU unlock. + * + * Returns underlying valid uprobe or NULL, if there was no underlying uprobe + * to begin with or we failed to bump its refcount and it's going away. + * + * Returned non-NULL uprobe can be still safely used within an ongoing SRCU + * locked region. If `get` is true, it's guaranteed that non-NULL uprobe has + * an extra refcount for caller to assume and use. Otherwise, it's not + * guaranteed that returned uprobe has a positive refcount, so caller has to + * attempt try_get_uprobe(), if it needs to preserve uprobe beyond current + * SRCU lock region. See dup_utask(). + */ +static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get) +{ + enum hprobe_state hstate; + + /* + * Caller should guarantee that return_instance is not going to be + * freed from under us. This can be achieved either through holding + * rcu_read_lock() or by owning return_instance in the first place. + * + * Underlying uprobe is itself protected from reuse by SRCU, so ensure + * SRCU lock is held properly. + */ + lockdep_assert(srcu_read_lock_held(&uretprobes_srcu)); + + hstate = READ_ONCE(hprobe->state); + switch (hstate) { + case HPROBE_STABLE: + /* uprobe has positive refcount, bump refcount, if necessary */ + return get ? get_uprobe(hprobe->uprobe) : hprobe->uprobe; + case HPROBE_GONE: /* - * If application munmap(exec_vma) before uprobe_unregister() - * gets called, we don't get a chance to remove uprobe from - * delayed_uprobe_list from remove_breakpoint(). Do it here. + * SRCU was unlocked earlier and we didn't manage to take + * uprobe refcnt, so it's effectively NULL */ - mutex_lock(&delayed_uprobe_lock); - delayed_uprobe_remove(uprobe, NULL); - mutex_unlock(&delayed_uprobe_lock); - kfree(uprobe); + return NULL; + case HPROBE_CONSUMED: + /* + * uprobe was consumed, so it's effectively NULL as far as + * uretprobe processing logic is concerned + */ + return NULL; + case HPROBE_LEASED: { + struct uprobe *uprobe = try_get_uprobe(hprobe->uprobe); + /* + * Try to switch hprobe state, guarding against + * hprobe_consume() or another hprobe_expire() racing with us. + * Note, if we failed to get uprobe refcount, we use special + * HPROBE_GONE state to signal that hprobe->uprobe shouldn't + * be used as it will be freed after SRCU is unlocked. + */ + if (try_cmpxchg(&hprobe->state, &hstate, uprobe ? HPROBE_STABLE : HPROBE_GONE)) { + /* We won the race, we are the ones to unlock SRCU */ + __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx); + return get ? get_uprobe(uprobe) : uprobe; + } + + /* + * We lost the race, undo refcount bump (if it ever happened), + * unless caller would like an extra refcount anyways. + */ + if (uprobe && !get) + put_uprobe(uprobe); + /* + * Even if hprobe_consume() or another hprobe_expire() wins + * the state update race and unlocks SRCU from under us, we + * still have a guarantee that underyling uprobe won't be + * freed due to ongoing caller's SRCU lock region, so we can + * return it regardless. Also, if `get` was true, we also have + * an extra ref for the caller to own. This is used in dup_utask(). + */ + return uprobe; + } + default: + WARN(1, "unknown hprobe state %d", hstate); + return NULL; } } -static int match_uprobe(struct uprobe *l, struct uprobe *r) +static __always_inline +int uprobe_cmp(const struct inode *l_inode, const loff_t l_offset, + const struct uprobe *r) { - if (l->inode < r->inode) + if (l_inode < r->inode) return -1; - if (l->inode > r->inode) + if (l_inode > r->inode) return 1; - if (l->offset < r->offset) + if (l_offset < r->offset) return -1; - if (l->offset > r->offset) + if (l_offset > r->offset) return 1; return 0; } -static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) -{ - struct uprobe u = { .inode = inode, .offset = offset }; - struct rb_node *n = uprobes_tree.rb_node; - struct uprobe *uprobe; - int match; +#define __node_2_uprobe(node) \ + rb_entry((node), struct uprobe, rb_node) - while (n) { - uprobe = rb_entry(n, struct uprobe, rb_node); - match = match_uprobe(&u, uprobe); - if (!match) - return get_uprobe(uprobe); +struct __uprobe_key { + struct inode *inode; + loff_t offset; +}; - if (match < 0) - n = n->rb_left; - else - n = n->rb_right; - } - return NULL; +static inline int __uprobe_cmp_key(const void *key, const struct rb_node *b) +{ + const struct __uprobe_key *a = key; + return uprobe_cmp(a->inode, a->offset, __node_2_uprobe(b)); +} + +static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b) +{ + struct uprobe *u = __node_2_uprobe(a); + return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b)); } /* - * Find a uprobe corresponding to a given inode:offset - * Acquires uprobes_treelock + * Assumes being inside RCU protected region. + * No refcount is taken on returned uprobe. */ -static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) +static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset) { - struct uprobe *uprobe; + struct __uprobe_key key = { + .inode = inode, + .offset = offset, + }; + struct rb_node *node; + unsigned int seq; - spin_lock(&uprobes_treelock); - uprobe = __find_uprobe(inode, offset); - spin_unlock(&uprobes_treelock); + lockdep_assert(rcu_read_lock_trace_held()); - return uprobe; + do { + seq = read_seqcount_begin(&uprobes_seqcount); + node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key); + /* + * Lockless RB-tree lookups can result only in false negatives. + * If the element is found, it is correct and can be returned + * under RCU protection. If we find nothing, we need to + * validate that seqcount didn't change. If it did, we have to + * try again as we might have missed the element (false + * negative). If seqcount is unchanged, search truly failed. + */ + if (node) + return __node_2_uprobe(node); + } while (read_seqcount_retry(&uprobes_seqcount, seq)); + + return NULL; } +/* + * Attempt to insert a new uprobe into uprobes_tree. + * + * If uprobe already exists (for given inode+offset), we just increment + * refcount of previously existing uprobe. + * + * If not, a provided new instance of uprobe is inserted into the tree (with + * assumed initial refcount == 1). + * + * In any case, we return a uprobe instance that ends up being in uprobes_tree. + * Caller has to clean up new uprobe instance, if it ended up not being + * inserted into the tree. + * + * We assume that uprobes_treelock is held for writing. + */ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) { - struct rb_node **p = &uprobes_tree.rb_node; - struct rb_node *parent = NULL; - struct uprobe *u; - int match; - - while (*p) { - parent = *p; - u = rb_entry(parent, struct uprobe, rb_node); - match = match_uprobe(uprobe, u); - if (!match) - return get_uprobe(u); + struct rb_node *node; +again: + node = rb_find_add_rcu(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp); + if (node) { + struct uprobe *u = __node_2_uprobe(node); - if (match < 0) - p = &parent->rb_left; - else - p = &parent->rb_right; + if (!try_get_uprobe(u)) { + rb_erase(node, &uprobes_tree); + RB_CLEAR_NODE(&u->rb_node); + goto again; + } + return u; } - u = NULL; - rb_link_node(&uprobe->rb_node, parent, p); - rb_insert_color(&uprobe->rb_node, &uprobes_tree); - /* get access + creation ref */ - refcount_set(&uprobe->ref, 2); - - return u; + return uprobe; } /* - * Acquire uprobes_treelock. - * Matching uprobe already exists in rbtree; - * increment (access refcount) and return the matching uprobe. - * - * No matching uprobe; insert the uprobe in rb_tree; - * get a double refcount (access + creation) and return NULL. + * Acquire uprobes_treelock and insert uprobe into uprobes_tree + * (or reuse existing one, see __insert_uprobe() comments above). */ static struct uprobe *insert_uprobe(struct uprobe *uprobe) { struct uprobe *u; - spin_lock(&uprobes_treelock); + write_lock(&uprobes_treelock); + write_seqcount_begin(&uprobes_seqcount); u = __insert_uprobe(uprobe); - spin_unlock(&uprobes_treelock); + write_seqcount_end(&uprobes_seqcount); + write_unlock(&uprobes_treelock); return u; } @@ -732,18 +971,21 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL); if (!uprobe) - return NULL; + return ERR_PTR(-ENOMEM); uprobe->inode = inode; uprobe->offset = offset; uprobe->ref_ctr_offset = ref_ctr_offset; + INIT_LIST_HEAD(&uprobe->consumers); init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); + RB_CLEAR_NODE(&uprobe->rb_node); + refcount_set(&uprobe->ref, 1); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); /* a uprobe exists for this inode:offset combination */ - if (cur_uprobe) { + if (cur_uprobe != uprobe) { if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) { ref_ctr_mismatch_warn(cur_uprobe, uprobe); put_uprobe(cur_uprobe); @@ -759,33 +1001,23 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) { + static atomic64_t id; + down_write(&uprobe->consumer_rwsem); - uc->next = uprobe->consumers; - uprobe->consumers = uc; + list_add_rcu(&uc->cons_node, &uprobe->consumers); + uc->id = (__u64) atomic64_inc_return(&id); up_write(&uprobe->consumer_rwsem); } /* * For uprobe @uprobe, delete the consumer @uc. - * Return true if the @uc is deleted successfully - * or return false. + * Should never be called with consumer that's not part of @uprobe->consumers. */ -static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) +static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) { - struct uprobe_consumer **con; - bool ret = false; - down_write(&uprobe->consumer_rwsem); - for (con = &uprobe->consumers; *con; con = &(*con)->next) { - if (*con == uc) { - *con = uc->next; - ret = true; - break; - } - } + list_del_rcu(&uc->cons_node); up_write(&uprobe->consumer_rwsem); - - return ret; } static int __copy_insn(struct address_space *mapping, struct file *filp, @@ -794,10 +1026,10 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, struct page *page; /* * Ensure that the page that has the original instruction is populated - * and in page-cache. If ->readpage == NULL it must be shmem_mapping(), + * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(), * see uprobe_register(). */ - if (mapping->a_ops->readpage) + if (mapping->a_ops->read_folio) page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp); else page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT); @@ -870,21 +1102,19 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, return ret; } -static inline bool consumer_filter(struct uprobe_consumer *uc, - enum uprobe_filter_ctx ctx, struct mm_struct *mm) +static inline bool consumer_filter(struct uprobe_consumer *uc, struct mm_struct *mm) { - return !uc->filter || uc->filter(uc, ctx, mm); + return !uc->filter || uc->filter(uc, mm); } -static bool filter_chain(struct uprobe *uprobe, - enum uprobe_filter_ctx ctx, struct mm_struct *mm) +static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm) { struct uprobe_consumer *uc; bool ret = false; down_read(&uprobe->consumer_rwsem); - for (uc = uprobe->consumers; uc; uc = uc->next) { - ret = consumer_filter(uc, ctx, mm); + list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { + ret = consumer_filter(uc, mm); if (ret) break; } @@ -928,27 +1158,6 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad return set_orig_insn(&uprobe->arch, mm, vaddr); } -static inline bool uprobe_is_active(struct uprobe *uprobe) -{ - return !RB_EMPTY_NODE(&uprobe->rb_node); -} -/* - * There could be threads that have already hit the breakpoint. They - * will recheck the current insn and restart if find_uprobe() fails. - * See find_active_uprobe(). - */ -static void delete_uprobe(struct uprobe *uprobe) -{ - if (WARN_ON(!uprobe_is_active(uprobe))) - return; - - spin_lock(&uprobes_treelock); - rb_erase(&uprobe->rb_node, &uprobes_tree); - spin_unlock(&uprobes_treelock); - RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */ - put_uprobe(uprobe); -} - struct map_info { struct map_info *next; struct mm_struct *mm; @@ -1053,8 +1262,17 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) if (err && is_register) goto free; - + /* + * We take mmap_lock for writing to avoid the race with + * find_active_uprobe_rcu() which takes mmap_lock for reading. + * Thus this install_breakpoint() can not make + * is_trap_at_addr() true right after find_uprobe_rcu() + * returns NULL in find_active_uprobe_rcu(). + */ mmap_write_lock(mm); + if (check_stable_address_space(mm)) + goto unlock; + vma = find_vma(mm, info->vaddr); if (!vma || !valid_vma(vma, is_register) || file_inode(vma->vm_file) != uprobe->inode) @@ -1066,12 +1284,10 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) if (is_register) { /* consult only the "caller", new consumer. */ - if (consumer_filter(new, - UPROBE_FILTER_REGISTER, mm)) + if (consumer_filter(new, mm)) err = install_breakpoint(uprobe, mm, vma, info->vaddr); } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { - if (!filter_chain(uprobe, - UPROBE_FILTER_UNREGISTER, mm)) + if (!filter_chain(uprobe, mm)) err |= remove_breakpoint(uprobe, mm, info->vaddr); } @@ -1086,162 +1302,152 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) return err; } -static void -__uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) +/** + * uprobe_unregister_nosync - unregister an already registered probe. + * @uprobe: uprobe to remove + * @uc: identify which probe if multiple probes are colocated. + */ +void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc) { int err; - if (WARN_ON(!consumer_del(uprobe, uc))) - return; - + down_write(&uprobe->register_rwsem); + consumer_del(uprobe, uc); err = register_for_each_vma(uprobe, NULL); - /* TODO : cant unregister? schedule a worker thread */ - if (!uprobe->consumers && !err) - delete_uprobe(uprobe); -} - -/* - * uprobe_unregister - unregister an already registered probe. - * @inode: the file in which the probe has to be removed. - * @offset: offset from the start of the file. - * @uc: identify which probe if multiple probes are colocated. - */ -void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) -{ - struct uprobe *uprobe; + up_write(&uprobe->register_rwsem); - uprobe = find_uprobe(inode, offset); - if (WARN_ON(!uprobe)) + /* TODO : cant unregister? schedule a worker thread */ + if (unlikely(err)) { + uprobe_warn(current, "unregister, leaking uprobe"); return; + } - down_write(&uprobe->register_rwsem); - __uprobe_unregister(uprobe, uc); - up_write(&uprobe->register_rwsem); put_uprobe(uprobe); } -EXPORT_SYMBOL_GPL(uprobe_unregister); +EXPORT_SYMBOL_GPL(uprobe_unregister_nosync); -/* - * __uprobe_register - register a probe +void uprobe_unregister_sync(void) +{ + /* + * Now that handler_chain() and handle_uretprobe_chain() iterate over + * uprobe->consumers list under RCU protection without holding + * uprobe->register_rwsem, we need to wait for RCU grace period to + * make sure that we can't call into just unregistered + * uprobe_consumer's callbacks anymore. If we don't do that, fast and + * unlucky enough caller can free consumer's memory and cause + * handler_chain() or handle_uretprobe_chain() to do an use-after-free. + */ + synchronize_rcu_tasks_trace(); + synchronize_srcu(&uretprobes_srcu); +} +EXPORT_SYMBOL_GPL(uprobe_unregister_sync); + +/** + * uprobe_register - register a probe * @inode: the file in which the probe has to be placed. * @offset: offset from the start of the file. + * @ref_ctr_offset: offset of SDT marker / reference counter * @uc: information on howto handle the probe.. * - * Apart from the access refcount, __uprobe_register() takes a creation + * Apart from the access refcount, uprobe_register() takes a creation * refcount (thro alloc_uprobe) if and only if this @uprobe is getting * inserted into the rbtree (i.e first consumer for a @inode:@offset * tuple). Creation refcount stops uprobe_unregister from freeing the * @uprobe even before the register operation is complete. Creation * refcount is released when the last @uc for the @uprobe - * unregisters. Caller of __uprobe_register() is required to keep @inode + * unregisters. Caller of uprobe_register() is required to keep @inode * (and the containing mount) referenced. * - * Return errno if it cannot successully install probes - * else return 0 (success) + * Return: pointer to the new uprobe on success or an ERR_PTR on failure. */ -static int __uprobe_register(struct inode *inode, loff_t offset, - loff_t ref_ctr_offset, struct uprobe_consumer *uc) +struct uprobe *uprobe_register(struct inode *inode, + loff_t offset, loff_t ref_ctr_offset, + struct uprobe_consumer *uc) { struct uprobe *uprobe; int ret; /* Uprobe must have at least one set consumer */ if (!uc->handler && !uc->ret_handler) - return -EINVAL; + return ERR_PTR(-EINVAL); /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */ - if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping)) - return -EIO; + if (!inode->i_mapping->a_ops->read_folio && + !shmem_mapping(inode->i_mapping)) + return ERR_PTR(-EIO); /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) - return -EINVAL; + return ERR_PTR(-EINVAL); /* * This ensures that copy_from_page(), copy_to_page() and * __update_ref_ctr() can't cross page boundary. */ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) - return -EINVAL; + return ERR_PTR(-EINVAL); if (!IS_ALIGNED(ref_ctr_offset, sizeof(short))) - return -EINVAL; + return ERR_PTR(-EINVAL); - retry: uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); - if (!uprobe) - return -ENOMEM; if (IS_ERR(uprobe)) - return PTR_ERR(uprobe); + return uprobe; - /* - * We can race with uprobe_unregister()->delete_uprobe(). - * Check uprobe_is_active() and retry if it is false. - */ down_write(&uprobe->register_rwsem); - ret = -EAGAIN; - if (likely(uprobe_is_active(uprobe))) { - consumer_add(uprobe, uc); - ret = register_for_each_vma(uprobe, uc); - if (ret) - __uprobe_unregister(uprobe, uc); - } + consumer_add(uprobe, uc); + ret = register_for_each_vma(uprobe, uc); up_write(&uprobe->register_rwsem); - put_uprobe(uprobe); - if (unlikely(ret == -EAGAIN)) - goto retry; - return ret; -} + if (ret) { + uprobe_unregister_nosync(uprobe, uc); + /* + * Registration might have partially succeeded, so we can have + * this consumer being called right at this time. We need to + * sync here. It's ok, it's unlikely slow path. + */ + uprobe_unregister_sync(); + return ERR_PTR(ret); + } -int uprobe_register(struct inode *inode, loff_t offset, - struct uprobe_consumer *uc) -{ - return __uprobe_register(inode, offset, 0, uc); + return uprobe; } EXPORT_SYMBOL_GPL(uprobe_register); -int uprobe_register_refctr(struct inode *inode, loff_t offset, - loff_t ref_ctr_offset, struct uprobe_consumer *uc) -{ - return __uprobe_register(inode, offset, ref_ctr_offset, uc); -} -EXPORT_SYMBOL_GPL(uprobe_register_refctr); - -/* - * uprobe_apply - unregister an already registered probe. - * @inode: the file in which the probe has to be removed. - * @offset: offset from the start of the file. +/** + * uprobe_apply - add or remove the breakpoints according to @uc->filter + * @uprobe: uprobe which "owns" the breakpoint * @uc: consumer which wants to add more or remove some breakpoints * @add: add or remove the breakpoints + * Return: 0 on success or negative error code. */ -int uprobe_apply(struct inode *inode, loff_t offset, - struct uprobe_consumer *uc, bool add) +int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add) { - struct uprobe *uprobe; struct uprobe_consumer *con; int ret = -ENOENT; - uprobe = find_uprobe(inode, offset); - if (WARN_ON(!uprobe)) - return ret; - down_write(&uprobe->register_rwsem); - for (con = uprobe->consumers; con && con != uc ; con = con->next) - ; - if (con) - ret = register_for_each_vma(uprobe, add ? uc : NULL); + + rcu_read_lock_trace(); + list_for_each_entry_rcu(con, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { + if (con == uc) { + ret = register_for_each_vma(uprobe, add ? uc : NULL); + break; + } + } + rcu_read_unlock_trace(); + up_write(&uprobe->register_rwsem); - put_uprobe(uprobe); return ret; } static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; int err = 0; mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { unsigned long vaddr; loff_t offset; @@ -1303,25 +1509,27 @@ static void build_probe_list(struct inode *inode, min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; - spin_lock(&uprobes_treelock); + read_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); if (n) { for (t = n; t; t = rb_prev(t)) { u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset < min) break; - list_add(&u->pending_list, head); - get_uprobe(u); + /* if uprobe went away, it's safe to ignore it */ + if (try_get_uprobe(u)) + list_add(&u->pending_list, head); } for (t = n; (t = rb_next(t)); ) { u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset > max) break; - list_add(&u->pending_list, head); - get_uprobe(u); + /* if uprobe went away, it's safe to ignore it */ + if (try_get_uprobe(u)) + list_add(&u->pending_list, head); } } - spin_unlock(&uprobes_treelock); + read_unlock(&uprobes_treelock); } /* @vma contains reference counter, not the probed instruction. */ @@ -1354,7 +1562,7 @@ static int delayed_ref_ctr_inc(struct vm_area_struct *vma) } /* - * Called from mmap_region/vma_adjust with mm->mmap_lock acquired. + * Called from mmap_region/vma_merge with mm->mmap_lock acquired. * * Currently we ignore all errors and always return 0, the callers * can't handle the failure anyway. @@ -1389,7 +1597,7 @@ int uprobe_mmap(struct vm_area_struct *vma) */ list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { if (!fatal_signal_pending(current) && - filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) { + filter_chain(uprobe, vma->vm_mm)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); } @@ -1412,9 +1620,9 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; - spin_lock(&uprobes_treelock); + read_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); - spin_unlock(&uprobes_treelock); + read_unlock(&uprobes_treelock); return !!n; } @@ -1438,6 +1646,27 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags); } +static vm_fault_t xol_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct xol_area *area = vma->vm_mm->uprobes_state.xol_area; + + vmf->page = area->page; + get_page(vmf->page); + return 0; +} + +static int xol_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) +{ + return -EPERM; +} + +static const struct vm_special_mapping xol_mapping = { + .name = "[uprobes]", + .fault = xol_fault, + .mremap = xol_mremap, +}; + /* Slot allocation for XOL */ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) { @@ -1464,7 +1693,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE, VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, - &area->xol_mapping); + &xol_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto fail; @@ -1479,13 +1708,22 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) return ret; } +void * __weak arch_uprobe_trampoline(unsigned long *psize) +{ + static uprobe_opcode_t insn = UPROBE_SWBP_INSN; + + *psize = UPROBE_SWBP_INSN_SIZE; + return &insn; +} + static struct xol_area *__create_xol_area(unsigned long vaddr) { struct mm_struct *mm = current->mm; - uprobe_opcode_t insn = UPROBE_SWBP_INSN; + unsigned long insns_size; struct xol_area *area; + void *insns; - area = kmalloc(sizeof(*area), GFP_KERNEL); + area = kzalloc(sizeof(*area), GFP_KERNEL); if (unlikely(!area)) goto out; @@ -1494,25 +1732,21 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) if (!area->bitmap) goto free_area; - area->xol_mapping.name = "[uprobes]"; - area->xol_mapping.fault = NULL; - area->xol_mapping.pages = area->pages; - area->pages[0] = alloc_page(GFP_HIGHUSER); - if (!area->pages[0]) + area->page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); + if (!area->page) goto free_bitmap; - area->pages[1] = NULL; area->vaddr = vaddr; init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); - atomic_set(&area->slot_count, 1); - arch_uprobe_copy_ixol(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE); + insns = arch_uprobe_trampoline(&insns_size); + arch_uprobe_copy_ixol(area->page, 0, insns, insns_size); if (!xol_add_vma(mm, area)) return area; - __free_page(area->pages[0]); + __free_page(area->page); free_bitmap: kfree(area->bitmap); free_area: @@ -1554,7 +1788,7 @@ void uprobe_clear_state(struct mm_struct *mm) if (!area) return; - put_page(area->pages[0]); + put_page(area->page); kfree(area->bitmap); kfree(area); } @@ -1578,92 +1812,57 @@ void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) } } -/* - * - search for a free slot. - */ -static unsigned long xol_take_insn_slot(struct xol_area *area) +static unsigned long xol_get_slot_nr(struct xol_area *area) { - unsigned long slot_addr; - int slot_nr; - - do { - slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE); - if (slot_nr < UINSNS_PER_PAGE) { - if (!test_and_set_bit(slot_nr, area->bitmap)) - break; - - slot_nr = UINSNS_PER_PAGE; - continue; - } - wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE)); - } while (slot_nr >= UINSNS_PER_PAGE); + unsigned long slot_nr; - slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES); - atomic_inc(&area->slot_count); + slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE); + if (slot_nr < UINSNS_PER_PAGE) { + if (!test_and_set_bit(slot_nr, area->bitmap)) + return slot_nr; + } - return slot_addr; + return UINSNS_PER_PAGE; } /* * xol_get_insn_slot - allocate a slot for xol. - * Returns the allocated slot address or 0. */ -static unsigned long xol_get_insn_slot(struct uprobe *uprobe) +static bool xol_get_insn_slot(struct uprobe *uprobe, struct uprobe_task *utask) { - struct xol_area *area; - unsigned long xol_vaddr; + struct xol_area *area = get_xol_area(); + unsigned long slot_nr; - area = get_xol_area(); if (!area) - return 0; + return false; - xol_vaddr = xol_take_insn_slot(area); - if (unlikely(!xol_vaddr)) - return 0; + wait_event(area->wq, (slot_nr = xol_get_slot_nr(area)) < UINSNS_PER_PAGE); - arch_uprobe_copy_ixol(area->pages[0], xol_vaddr, + utask->xol_vaddr = area->vaddr + slot_nr * UPROBE_XOL_SLOT_BYTES; + arch_uprobe_copy_ixol(area->page, utask->xol_vaddr, &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); - - return xol_vaddr; + return true; } /* - * xol_free_insn_slot - If slot was earlier allocated by - * @xol_get_insn_slot(), make the slot available for - * subsequent requests. + * xol_free_insn_slot - free the slot allocated by xol_get_insn_slot() */ -static void xol_free_insn_slot(struct task_struct *tsk) +static void xol_free_insn_slot(struct uprobe_task *utask) { - struct xol_area *area; - unsigned long vma_end; - unsigned long slot_addr; + struct xol_area *area = current->mm->uprobes_state.xol_area; + unsigned long offset = utask->xol_vaddr - area->vaddr; + unsigned int slot_nr; - if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask) + utask->xol_vaddr = 0; + /* xol_vaddr must fit into [area->vaddr, area->vaddr + PAGE_SIZE) */ + if (WARN_ON_ONCE(offset >= PAGE_SIZE)) return; - slot_addr = tsk->utask->xol_vaddr; - if (unlikely(!slot_addr)) - return; - - area = tsk->mm->uprobes_state.xol_area; - vma_end = area->vaddr + PAGE_SIZE; - if (area->vaddr <= slot_addr && slot_addr < vma_end) { - unsigned long offset; - int slot_nr; - - offset = slot_addr - area->vaddr; - slot_nr = offset / UPROBE_XOL_SLOT_BYTES; - if (slot_nr >= UINSNS_PER_PAGE) - return; - - clear_bit(slot_nr, area->bitmap); - atomic_dec(&area->slot_count); - smp_mb__after_atomic(); /* pairs with prepare_to_wait() */ - if (waitqueue_active(&area->wq)) - wake_up(&area->wq); - - tsk->utask->xol_vaddr = 0; - } + slot_nr = offset / UPROBE_XOL_SLOT_BYTES; + clear_bit(slot_nr, area->bitmap); + smp_mb__after_atomic(); /* pairs with prepare_to_wait() */ + if (waitqueue_active(&area->wq)) + wake_up(&area->wq); } void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, @@ -1702,12 +1901,57 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs) return instruction_pointer(regs); } -static struct return_instance *free_ret_instance(struct return_instance *ri) +static void ri_pool_push(struct uprobe_task *utask, struct return_instance *ri) { - struct return_instance *next = ri->next; - put_uprobe(ri->uprobe); - kfree(ri); - return next; + ri->cons_cnt = 0; + ri->next = utask->ri_pool; + utask->ri_pool = ri; +} + +static struct return_instance *ri_pool_pop(struct uprobe_task *utask) +{ + struct return_instance *ri = utask->ri_pool; + + if (likely(ri)) + utask->ri_pool = ri->next; + + return ri; +} + +static void ri_free(struct return_instance *ri) +{ + kfree(ri->extra_consumers); + kfree_rcu(ri, rcu); +} + +static void free_ret_instance(struct uprobe_task *utask, + struct return_instance *ri, bool cleanup_hprobe) +{ + unsigned seq; + + if (cleanup_hprobe) { + enum hprobe_state hstate; + + (void)hprobe_consume(&ri->hprobe, &hstate); + hprobe_finalize(&ri->hprobe, hstate); + } + + /* + * At this point return_instance is unlinked from utask's + * return_instances list and this has become visible to ri_timer(). + * If seqcount now indicates that ri_timer's return instance + * processing loop isn't active, we can return ri into the pool of + * to-be-reused return instances for future uretprobes. If ri_timer() + * happens to be running right now, though, we fallback to safety and + * just perform RCU-delated freeing of ri. + */ + if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) { + /* immediate reuse of ri without RCU GP is OK */ + ri_pool_push(utask, ri); + } else { + /* we might be racing with ri_timer(), so play it safe */ + ri_free(ri); + } } /* @@ -1717,25 +1961,73 @@ static struct return_instance *free_ret_instance(struct return_instance *ri) void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; - struct return_instance *ri; + struct return_instance *ri, *ri_next; if (!utask) return; - if (utask->active_uprobe) - put_uprobe(utask->active_uprobe); + t->utask = NULL; + WARN_ON_ONCE(utask->active_uprobe || utask->xol_vaddr); + + timer_delete_sync(&utask->ri_timer); ri = utask->return_instances; - while (ri) - ri = free_ret_instance(ri); + while (ri) { + ri_next = ri->next; + free_ret_instance(utask, ri, true /* cleanup_hprobe */); + ri = ri_next; + } + + /* free_ret_instance() above might add to ri_pool, so this loop should come last */ + ri = utask->ri_pool; + while (ri) { + ri_next = ri->next; + ri_free(ri); + ri = ri_next; + } - xol_free_insn_slot(t); kfree(utask); - t->utask = NULL; +} + +#define RI_TIMER_PERIOD (HZ / 10) /* 100 ms */ + +#define for_each_ret_instance_rcu(pos, head) \ + for (pos = rcu_dereference_raw(head); pos; pos = rcu_dereference_raw(pos->next)) + +static void ri_timer(struct timer_list *timer) +{ + struct uprobe_task *utask = container_of(timer, struct uprobe_task, ri_timer); + struct return_instance *ri; + + /* SRCU protects uprobe from reuse for the cmpxchg() inside hprobe_expire(). */ + guard(srcu)(&uretprobes_srcu); + /* RCU protects return_instance from freeing. */ + guard(rcu)(); + + write_seqcount_begin(&utask->ri_seqcount); + + for_each_ret_instance_rcu(ri, utask->return_instances) + hprobe_expire(&ri->hprobe, false); + + write_seqcount_end(&utask->ri_seqcount); +} + +static struct uprobe_task *alloc_utask(void) +{ + struct uprobe_task *utask; + + utask = kzalloc(sizeof(*utask), GFP_KERNEL); + if (!utask) + return NULL; + + timer_setup(&utask->ri_timer, ri_timer, 0); + seqcount_init(&utask->ri_seqcount); + + return utask; } /* - * Allocate a uprobe_task object for the task if if necessary. + * Allocate a uprobe_task object for the task if necessary. * Called when the thread hits a breakpoint. * * Returns: @@ -1745,44 +2037,87 @@ void uprobe_free_utask(struct task_struct *t) static struct uprobe_task *get_utask(void) { if (!current->utask) - current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); + current->utask = alloc_utask(); return current->utask; } +static struct return_instance *alloc_return_instance(struct uprobe_task *utask) +{ + struct return_instance *ri; + + ri = ri_pool_pop(utask); + if (ri) + return ri; + + ri = kzalloc(sizeof(*ri), GFP_KERNEL); + if (!ri) + return ZERO_SIZE_PTR; + + return ri; +} + +static struct return_instance *dup_return_instance(struct return_instance *old) +{ + struct return_instance *ri; + + ri = kmemdup(old, sizeof(*ri), GFP_KERNEL); + if (!ri) + return NULL; + + if (unlikely(old->cons_cnt > 1)) { + ri->extra_consumers = kmemdup(old->extra_consumers, + sizeof(ri->extra_consumers[0]) * (old->cons_cnt - 1), + GFP_KERNEL); + if (!ri->extra_consumers) { + kfree(ri); + return NULL; + } + } + + return ri; +} + static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) { struct uprobe_task *n_utask; struct return_instance **p, *o, *n; + struct uprobe *uprobe; - n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); + n_utask = alloc_utask(); if (!n_utask) return -ENOMEM; t->utask = n_utask; + /* protect uprobes from freeing, we'll need try_get_uprobe() them */ + guard(srcu)(&uretprobes_srcu); + p = &n_utask->return_instances; for (o = o_utask->return_instances; o; o = o->next) { - n = kmalloc(sizeof(struct return_instance), GFP_KERNEL); + n = dup_return_instance(o); if (!n) return -ENOMEM; - *n = *o; - get_uprobe(n->uprobe); - n->next = NULL; + /* if uprobe is non-NULL, we'll have an extra refcount for uprobe */ + uprobe = hprobe_expire(&o->hprobe, true); + + /* + * New utask will have stable properly refcounted uprobe or + * NULL. Even if we failed to get refcounted uprobe, we still + * need to preserve full set of return_instances for proper + * uretprobe handling and nesting in forked task. + */ + hprobe_init_stable(&n->hprobe, uprobe); - *p = n; + n->next = NULL; + rcu_assign_pointer(*p, n); p = &n->next; + n_utask->depth++; } return 0; } -static void uprobe_warn(struct task_struct *t, const char *msg) -{ - pr_warn("uprobe: %s:%d failed to %s\n", - current->comm, current->pid, msg); -} - static void dup_xol_work(struct callback_head *work) { if (current->flags & PF_EXITING) @@ -1823,7 +2158,7 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags) t->utask->dup_xol_addr = area->vaddr; init_task_work(&t->utask->dup_xol_work, dup_xol_work); - task_work_add(t, &t->utask->dup_xol_work, true); + task_work_add(t, &t->utask->dup_xol_work, TWA_RESUME); } /* @@ -1832,7 +2167,7 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags) * * Returns -1 in case the xol_area is not allocated. */ -static unsigned long get_trampoline_vaddr(void) +unsigned long uprobe_get_trampoline_vaddr(void) { struct xol_area *area; unsigned long trampoline_vaddr = -1; @@ -1848,45 +2183,41 @@ static unsigned long get_trampoline_vaddr(void) static void cleanup_return_instances(struct uprobe_task *utask, bool chained, struct pt_regs *regs) { - struct return_instance *ri = utask->return_instances; + struct return_instance *ri = utask->return_instances, *ri_next; enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL; while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { - ri = free_ret_instance(ri); + ri_next = ri->next; + rcu_assign_pointer(utask->return_instances, ri_next); utask->depth--; + + free_ret_instance(utask, ri, true /* cleanup_hprobe */); + ri = ri_next; } - utask->return_instances = ri; } -static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) +static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs, + struct return_instance *ri) { - struct return_instance *ri; - struct uprobe_task *utask; + struct uprobe_task *utask = current->utask; unsigned long orig_ret_vaddr, trampoline_vaddr; bool chained; + int srcu_idx; if (!get_xol_area()) - return; - - utask = get_utask(); - if (!utask) - return; + goto free; if (utask->depth >= MAX_URETPROBE_DEPTH) { printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to" " nestedness limit pid/tgid=%d/%d\n", current->pid, current->tgid); - return; + goto free; } - ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL); - if (!ri) - return; - - trampoline_vaddr = get_trampoline_vaddr(); + trampoline_vaddr = uprobe_get_trampoline_vaddr(); orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); if (orig_ret_vaddr == -1) - goto fail; + goto free; /* drop the entries invalidated by longjmp() */ chained = (orig_ret_vaddr == trampoline_vaddr); @@ -1904,54 +2235,60 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) * attack from user-space. */ uprobe_warn(current, "handle tail call"); - goto fail; + goto free; } orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } - ri->uprobe = get_uprobe(uprobe); + /* __srcu_read_lock() because SRCU lock survives switch to user space */ + srcu_idx = __srcu_read_lock(&uretprobes_srcu); + ri->func = instruction_pointer(regs); ri->stack = user_stack_pointer(regs); ri->orig_ret_vaddr = orig_ret_vaddr; ri->chained = chained; utask->depth++; + + hprobe_init_leased(&ri->hprobe, uprobe, srcu_idx); ri->next = utask->return_instances; - utask->return_instances = ri; + rcu_assign_pointer(utask->return_instances, ri); + + mod_timer(&utask->ri_timer, jiffies + RI_TIMER_PERIOD); return; - fail: - kfree(ri); +free: + ri_free(ri); } /* Prepare to single-step probed instruction out of line. */ static int pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) { - struct uprobe_task *utask; - unsigned long xol_vaddr; + struct uprobe_task *utask = current->utask; int err; - utask = get_utask(); - if (!utask) - return -ENOMEM; + if (!try_get_uprobe(uprobe)) + return -EINVAL; - xol_vaddr = xol_get_insn_slot(uprobe); - if (!xol_vaddr) - return -ENOMEM; + if (!xol_get_insn_slot(uprobe, utask)) { + err = -ENOMEM; + goto err_out; + } - utask->xol_vaddr = xol_vaddr; utask->vaddr = bp_vaddr; - err = arch_uprobe_pre_xol(&uprobe->arch, regs); if (unlikely(err)) { - xol_free_insn_slot(current); - return err; + xol_free_insn_slot(utask); + goto err_out; } utask->active_uprobe = uprobe; utask->state = UTASK_SSTEP; return 0; +err_out: + put_uprobe(uprobe); + return err; } /* @@ -1973,7 +2310,7 @@ bool uprobe_deny_signal(void) WARN_ON_ONCE(utask->state != UTASK_SSTEP); - if (signal_pending(t)) { + if (task_sigpending(t)) { spin_lock_irq(&t->sighand->siglock); clear_tsk_thread_flag(t, TIF_SIGPENDING); spin_unlock_irq(&t->sighand->siglock); @@ -1989,9 +2326,10 @@ bool uprobe_deny_signal(void) static void mmf_recalc_uprobes(struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!valid_vma(vma, false)) continue; /* @@ -2023,14 +2361,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) if (likely(result == 0)) goto out; - /* - * The NULL 'tsk' here ensures that any faults that occur here - * will not be accounted to the task. 'mm' *is* current->mm, - * but we treat this as a 'remote' access since it is - * essentially a kernel access to the memory. - */ - result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, - NULL, NULL); + result = get_user_pages(vaddr, 1, FOLL_FORCE, &page); if (result < 0) return result; @@ -2041,20 +2372,66 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) return is_trap_insn(&opcode); } -static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) +static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr) +{ + struct mm_struct *mm = current->mm; + struct uprobe *uprobe = NULL; + struct vm_area_struct *vma; + struct file *vm_file; + loff_t offset; + unsigned int seq; + + guard(rcu)(); + + if (!mmap_lock_speculate_try_begin(mm, &seq)) + return NULL; + + vma = vma_lookup(mm, bp_vaddr); + if (!vma) + return NULL; + + /* + * vm_file memory can be reused for another instance of struct file, + * but can't be freed from under us, so it's safe to read fields from + * it, even if the values are some garbage values; ultimately + * find_uprobe_rcu() + mmap_lock_speculation_end() check will ensure + * that whatever we speculatively found is correct + */ + vm_file = READ_ONCE(vma->vm_file); + if (!vm_file) + return NULL; + + offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start); + uprobe = find_uprobe_rcu(vm_file->f_inode, offset); + if (!uprobe) + return NULL; + + /* now double check that nothing about MM changed */ + if (mmap_lock_speculate_retry(mm, seq)) + return NULL; + + return uprobe; +} + +/* assumes being inside RCU protected region */ +static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp) { struct mm_struct *mm = current->mm; struct uprobe *uprobe = NULL; struct vm_area_struct *vma; + uprobe = find_active_uprobe_speculative(bp_vaddr); + if (uprobe) + return uprobe; + mmap_read_lock(mm); - vma = find_vma(mm, bp_vaddr); - if (vma && vma->vm_start <= bp_vaddr) { - if (valid_vma(vma, false)) { + vma = vma_lookup(mm, bp_vaddr); + if (vma) { + if (vma->vm_file) { struct inode *inode = file_inode(vma->vm_file); loff_t offset = vaddr_to_offset(vma, bp_vaddr); - uprobe = find_uprobe(inode, offset); + uprobe = find_uprobe_rcu(inode, offset); } if (!uprobe) @@ -2070,50 +2447,125 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) return uprobe; } +static struct return_instance *push_consumer(struct return_instance *ri, __u64 id, __u64 cookie) +{ + struct return_consumer *ric; + + if (unlikely(ri == ZERO_SIZE_PTR)) + return ri; + + if (unlikely(ri->cons_cnt > 0)) { + ric = krealloc(ri->extra_consumers, sizeof(*ric) * ri->cons_cnt, GFP_KERNEL); + if (!ric) { + ri_free(ri); + return ZERO_SIZE_PTR; + } + ri->extra_consumers = ric; + } + + ric = likely(ri->cons_cnt == 0) ? &ri->consumer : &ri->extra_consumers[ri->cons_cnt - 1]; + ric->id = id; + ric->cookie = cookie; + + ri->cons_cnt++; + return ri; +} + +static struct return_consumer * +return_consumer_find(struct return_instance *ri, int *iter, int id) +{ + struct return_consumer *ric; + int idx; + + for (idx = *iter; idx < ri->cons_cnt; idx++) + { + ric = likely(idx == 0) ? &ri->consumer : &ri->extra_consumers[idx - 1]; + if (ric->id == id) { + *iter = idx + 1; + return ric; + } + } + + return NULL; +} + +static bool ignore_ret_handler(int rc) +{ + return rc == UPROBE_HANDLER_REMOVE || rc == UPROBE_HANDLER_IGNORE; +} + static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) { struct uprobe_consumer *uc; - int remove = UPROBE_HANDLER_REMOVE; - bool need_prep = false; /* prepare return uprobe, when needed */ + bool has_consumers = false, remove = true; + struct return_instance *ri = NULL; + struct uprobe_task *utask = current->utask; + + utask->auprobe = &uprobe->arch; - down_read(&uprobe->register_rwsem); - for (uc = uprobe->consumers; uc; uc = uc->next) { + list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { + bool session = uc->handler && uc->ret_handler; + __u64 cookie = 0; int rc = 0; if (uc->handler) { - rc = uc->handler(uc, regs); - WARN(rc & ~UPROBE_HANDLER_MASK, + rc = uc->handler(uc, regs, &cookie); + WARN(rc < 0 || rc > 2, "bad rc=0x%x from %ps()\n", rc, uc->handler); } - if (uc->ret_handler) - need_prep = true; + remove &= rc == UPROBE_HANDLER_REMOVE; + has_consumers = true; + + if (!uc->ret_handler || ignore_ret_handler(rc)) + continue; - remove &= rc; + if (!ri) + ri = alloc_return_instance(utask); + + if (session) + ri = push_consumer(ri, uc->id, cookie); } + utask->auprobe = NULL; + + if (!ZERO_OR_NULL_PTR(ri)) + prepare_uretprobe(uprobe, regs, ri); + + if (remove && has_consumers) { + down_read(&uprobe->register_rwsem); - if (need_prep && !remove) - prepare_uretprobe(uprobe, regs); /* put bp at return */ + /* re-check that removal is still required, this time under lock */ + if (!filter_chain(uprobe, current->mm)) { + WARN_ON(!uprobe_is_active(uprobe)); + unapply_uprobe(uprobe, current->mm); + } - if (remove && uprobe->consumers) { - WARN_ON(!uprobe_is_active(uprobe)); - unapply_uprobe(uprobe, current->mm); + up_read(&uprobe->register_rwsem); } - up_read(&uprobe->register_rwsem); } static void -handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) +handle_uretprobe_chain(struct return_instance *ri, struct uprobe *uprobe, struct pt_regs *regs) { - struct uprobe *uprobe = ri->uprobe; + struct return_consumer *ric; struct uprobe_consumer *uc; + int ric_idx = 0; + + /* all consumers unsubscribed meanwhile */ + if (unlikely(!uprobe)) + return; + + rcu_read_lock_trace(); + list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { + bool session = uc->handler && uc->ret_handler; - down_read(&uprobe->register_rwsem); - for (uc = uprobe->consumers; uc; uc = uc->next) { - if (uc->ret_handler) - uc->ret_handler(uc, ri->func, regs); + if (uc->ret_handler) { + ric = return_consumer_find(ri, &ric_idx, uc->id); + if (!session || ric) + uc->ret_handler(uc, ri->func, regs, ric ? &ric->cookie : NULL); + } } - up_read(&uprobe->register_rwsem); + rcu_read_unlock_trace(); } static struct return_instance *find_next_ret_chain(struct return_instance *ri) @@ -2128,10 +2580,12 @@ static struct return_instance *find_next_ret_chain(struct return_instance *ri) return ri; } -static void handle_trampoline(struct pt_regs *regs) +void uprobe_handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; - struct return_instance *ri, *next; + struct return_instance *ri, *ri_next, *next_chain; + struct uprobe *uprobe; + enum hprobe_state hstate; bool valid; utask = current->utask; @@ -2149,25 +2603,39 @@ static void handle_trampoline(struct pt_regs *regs) * or NULL; the latter case means that nobody but ri->func * could hit this trampoline on return. TODO: sigaltstack(). */ - next = find_next_ret_chain(ri); - valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs); + next_chain = find_next_ret_chain(ri); + valid = !next_chain || arch_uretprobe_is_alive(next_chain, RP_CHECK_RET, regs); instruction_pointer_set(regs, ri->orig_ret_vaddr); do { - if (valid) - handle_uretprobe_chain(ri, regs); - ri = free_ret_instance(ri); + /* pop current instance from the stack of pending return instances, + * as it's not pending anymore: we just fixed up original + * instruction pointer in regs and are about to call handlers; + * this allows fixup_uretprobe_trampoline_entries() to properly fix up + * captured stack traces from uretprobe handlers, in which pending + * trampoline addresses on the stack are replaced with correct + * original return addresses + */ + ri_next = ri->next; + rcu_assign_pointer(utask->return_instances, ri_next); utask->depth--; - } while (ri != next); + + uprobe = hprobe_consume(&ri->hprobe, &hstate); + if (valid) + handle_uretprobe_chain(ri, uprobe, regs); + hprobe_finalize(&ri->hprobe, hstate); + + /* We already took care of hprobe, no need to waste more time on that. */ + free_ret_instance(utask, ri, false /* !cleanup_hprobe */); + ri = ri_next; + } while (ri != next_chain); } while (!valid); - utask->return_instances = ri; return; - sigill: +sigill: uprobe_warn(current, "handle uretprobe, sending SIGILL."); force_sig(SIGILL); - } bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) @@ -2189,13 +2657,15 @@ static void handle_swbp(struct pt_regs *regs) { struct uprobe *uprobe; unsigned long bp_vaddr; - int uninitialized_var(is_swbp); + int is_swbp; bp_vaddr = uprobe_get_swbp_addr(regs); - if (bp_vaddr == get_trampoline_vaddr()) - return handle_trampoline(regs); + if (bp_vaddr == uprobe_get_trampoline_vaddr()) + return uprobe_handle_trampoline(regs); + + rcu_read_lock_trace(); - uprobe = find_active_uprobe(bp_vaddr, &is_swbp); + uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); if (!uprobe) { if (is_swbp > 0) { /* No matching uprobe; signal SIGTRAP. */ @@ -2211,7 +2681,7 @@ static void handle_swbp(struct pt_regs *regs) */ instruction_pointer_set(regs, bp_vaddr); } - return; + goto out; } /* change it in advance for ->handler() and restart */ @@ -2246,12 +2716,12 @@ static void handle_swbp(struct pt_regs *regs) if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; - if (!pre_ssout(uprobe, regs, bp_vaddr)) - return; + if (pre_ssout(uprobe, regs, bp_vaddr)) + goto out; - /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ out: - put_uprobe(uprobe); + /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ + rcu_read_unlock_trace(); } /* @@ -2274,7 +2744,7 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) put_uprobe(uprobe); utask->active_uprobe = NULL; utask->state = UTASK_RUNNING; - xol_free_insn_slot(current); + xol_free_insn_slot(utask); spin_lock_irq(¤t->sighand->siglock); recalc_sigpending(); /* see uprobe_deny_signal() */ |