diff options
Diffstat (limited to 'kernel/events')
-rw-r--r-- | kernel/events/core.c | 142 | ||||
-rw-r--r-- | kernel/events/ring_buffer.c | 4 | ||||
-rw-r--r-- | kernel/events/uprobes.c | 357 |
3 files changed, 271 insertions, 232 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c index f34c99f8ce8f..22fdf0c187cd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -207,6 +207,19 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, __perf_ctx_unlock(&cpuctx->ctx); } +typedef struct { + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; +} class_perf_ctx_lock_t; + +static inline void class_perf_ctx_lock_destructor(class_perf_ctx_lock_t *_T) +{ perf_ctx_unlock(_T->cpuctx, _T->ctx); } + +static inline class_perf_ctx_lock_t +class_perf_ctx_lock_constructor(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ perf_ctx_lock(cpuctx, ctx); return (class_perf_ctx_lock_t){ cpuctx, ctx }; } + #define TASK_TOMBSTONE ((void *)-1L) static bool is_kernel_event(struct perf_event *event) @@ -938,13 +951,19 @@ static void perf_cgroup_switch(struct task_struct *task) if (READ_ONCE(cpuctx->cgrp) == NULL) return; - WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); - cgrp = perf_cgroup_from_task(task, NULL); if (READ_ONCE(cpuctx->cgrp) == cgrp) return; - perf_ctx_lock(cpuctx, cpuctx->task_ctx); + guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); + /* + * Re-check, could've raced vs perf_remove_from_context(). + */ + if (READ_ONCE(cpuctx->cgrp) == NULL) + return; + + WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); + perf_ctx_disable(&cpuctx->ctx, true); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); @@ -962,7 +981,6 @@ static void perf_cgroup_switch(struct task_struct *task) ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); perf_ctx_enable(&cpuctx->ctx, true); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } static int perf_cgroup_ensure_storage(struct perf_event *event, @@ -2120,18 +2138,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->group_leader == event) del_event_from_groups(event, ctx); - /* - * If event was in error state, then keep it - * that way, otherwise bogus counts will be - * returned on read(). The only way to get out - * of error state is by explicit re-enabling - * of the event - */ - if (event->state > PERF_EVENT_STATE_OFF) { - perf_cgroup_event_disable(event, ctx); - perf_event_set_state(event, PERF_EVENT_STATE_OFF); - } - ctx->generation++; event->pmu_ctx->nr_events--; } @@ -2149,8 +2155,9 @@ perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event) } static void put_event(struct perf_event *event); -static void event_sched_out(struct perf_event *event, - struct perf_event_context *ctx); +static void __event_disable(struct perf_event *event, + struct perf_event_context *ctx, + enum perf_event_state state); static void perf_put_aux_event(struct perf_event *event) { @@ -2183,8 +2190,7 @@ static void perf_put_aux_event(struct perf_event *event) * state so that we don't try to schedule it again. Note * that perf_event_enable() will clear the ERROR status. */ - event_sched_out(iter, ctx); - perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + __event_disable(iter, ctx, PERF_EVENT_STATE_ERROR); } } @@ -2242,18 +2248,6 @@ static inline struct list_head *get_event_list(struct perf_event *event) &event->pmu_ctx->flexible_active; } -/* - * Events that have PERF_EV_CAP_SIBLING require being part of a group and - * cannot exist on their own, schedule them out and move them into the ERROR - * state. Also see _perf_event_enable(), it will not be able to recover - * this ERROR state. - */ -static inline void perf_remove_sibling_event(struct perf_event *event) -{ - event_sched_out(event, event->ctx); - perf_event_set_state(event, PERF_EVENT_STATE_ERROR); -} - static void perf_group_detach(struct perf_event *event) { struct perf_event *leader = event->group_leader; @@ -2289,8 +2283,15 @@ static void perf_group_detach(struct perf_event *event) */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { + /* + * Events that have PERF_EV_CAP_SIBLING require being part of + * a group and cannot exist on their own, schedule them out + * and move them into the ERROR state. Also see + * _perf_event_enable(), it will not be able to recover this + * ERROR state. + */ if (sibling->event_caps & PERF_EV_CAP_SIBLING) - perf_remove_sibling_event(sibling); + __event_disable(sibling, ctx, PERF_EVENT_STATE_ERROR); sibling->group_leader = sibling; list_del_init(&sibling->sibling_list); @@ -2493,11 +2494,14 @@ __perf_remove_from_context(struct perf_event *event, state = PERF_EVENT_STATE_EXIT; if (flags & DETACH_REVOKE) state = PERF_EVENT_STATE_REVOKED; - if (flags & DETACH_DEAD) { - event->pending_disable = 1; + if (flags & DETACH_DEAD) state = PERF_EVENT_STATE_DEAD; - } + event_sched_out(event, ctx); + + if (event->state > PERF_EVENT_STATE_OFF) + perf_cgroup_event_disable(event, ctx); + perf_event_set_state(event, min(event->state, state)); if (flags & DETACH_GROUP) @@ -2562,6 +2566,15 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla event_function_call(event, __perf_remove_from_context, (void *)flags); } +static void __event_disable(struct perf_event *event, + struct perf_event_context *ctx, + enum perf_event_state state) +{ + event_sched_out(event, ctx); + perf_cgroup_event_disable(event, ctx); + perf_event_set_state(event, state); +} + /* * Cross CPU call to disable a performance event */ @@ -2576,13 +2589,18 @@ static void __perf_event_disable(struct perf_event *event, perf_pmu_disable(event->pmu_ctx->pmu); ctx_time_update_event(ctx, event); + /* + * When disabling a group leader, the whole group becomes ineligible + * to run, so schedule out the full group. + */ if (event == event->group_leader) group_sched_out(event, ctx); - else - event_sched_out(event, ctx); - perf_event_set_state(event, PERF_EVENT_STATE_OFF); - perf_cgroup_event_disable(event, ctx); + /* + * But only mark the leader OFF; the siblings will remain + * INACTIVE. + */ + __event_disable(event, ctx, PERF_EVENT_STATE_OFF); perf_pmu_enable(event->pmu_ctx->pmu); } @@ -2656,8 +2674,8 @@ static void perf_event_unthrottle(struct perf_event *event, bool start) static void perf_event_throttle(struct perf_event *event) { - event->pmu->stop(event, 0); event->hw.interrupts = MAX_INTERRUPTS; + event->pmu->stop(event, 0); if (event == event->group_leader) perf_log_throttle(event, 0); } @@ -7186,18 +7204,18 @@ void perf_event_wakeup(struct perf_event *event) static void perf_sigtrap(struct perf_event *event) { /* - * We'd expect this to only occur if the irq_work is delayed and either - * ctx->task or current has changed in the meantime. This can be the - * case on architectures that do not implement arch_irq_work_raise(). + * Both perf_pending_task() and perf_pending_irq() can race with the + * task exiting. */ - if (WARN_ON_ONCE(event->ctx->task != current)) + if (current->flags & PF_EXITING) return; /* - * Both perf_pending_task() and perf_pending_irq() can race with the - * task exiting. + * We'd expect this to only occur if the irq_work is delayed and either + * ctx->task or current has changed in the meantime. This can be the + * case on architectures that do not implement arch_irq_work_raise(). */ - if (current->flags & PF_EXITING) + if (WARN_ON_ONCE(event->ctx->task != current)) return; send_sig_perf((void __user *)event->pending_addr, @@ -7233,15 +7251,15 @@ static void __perf_pending_disable(struct perf_event *event) * CPU-A CPU-B * * perf_event_disable_inatomic() - * @pending_disable = CPU-A; + * @pending_disable = 1; * irq_work_queue(); * * sched-out - * @pending_disable = -1; + * @pending_disable = 0; * * sched-in * perf_event_disable_inatomic() - * @pending_disable = CPU-B; + * @pending_disable = 1; * irq_work_queue(); // FAILS * * irq_work_run() @@ -7439,6 +7457,10 @@ perf_sample_ustack_size(u16 stack_size, u16 header_size, if (!regs) return 0; + /* No mm, no stack, no dump. */ + if (!current->mm) + return 0; + /* * Check if we fit in with the requested stack size into the: * - TASK_SIZE @@ -8150,6 +8172,9 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) const u32 max_stack = event->attr.sample_max_stack; struct perf_callchain_entry *callchain; + if (!current->mm) + user = false; + if (!kernel && !user) return &__empty_callchain; @@ -11091,7 +11116,7 @@ static int perf_uprobe_event_init(struct perf_event *event) if (event->attr.type != perf_uprobe.type) return -ENOENT; - if (!perfmon_capable()) + if (!capable(CAP_SYS_ADMIN)) return -EACCES; /* @@ -11749,7 +11774,12 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - if (is_sampling_event(event)) { + /* + * The throttle can be triggered in the hrtimer handler. + * The HRTIMER_NORESTART should be used to stop the timer, + * rather than hrtimer_cancel(). See perf_swevent_hrtimer() + */ + if (is_sampling_event(event) && (hwc->interrupts != MAX_INTERRUPTS)) { ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); local64_set(&hwc->period_left, ktime_to_ns(remaining)); @@ -11804,7 +11834,8 @@ static void cpu_clock_event_start(struct perf_event *event, int flags) static void cpu_clock_event_stop(struct perf_event *event, int flags) { perf_swevent_cancel_hrtimer(event); - cpu_clock_event_update(event); + if (flags & PERF_EF_UPDATE) + cpu_clock_event_update(event); } static int cpu_clock_event_add(struct perf_event *event, int flags) @@ -11882,7 +11913,8 @@ static void task_clock_event_start(struct perf_event *event, int flags) static void task_clock_event_stop(struct perf_event *event, int flags) { perf_swevent_cancel_hrtimer(event); - task_clock_event_update(event, event->ctx->time); + if (flags & PERF_EF_UPDATE) + task_clock_event_update(event, event->ctx->time); } static int task_clock_event_add(struct perf_event *event, int flags) diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index d2aef87c7e9f..aa9a759e824f 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -441,7 +441,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, * store that will be enabled on successful return */ if (!handle->size) { /* A, matches D */ - event->pending_disable = smp_processor_id(); + perf_event_disable_inatomic(handle->event); perf_output_wakeup(handle); WRITE_ONCE(rb->aux_nest, 0); goto err_put; @@ -526,7 +526,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) if (wakeup) { if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) - handle->event->pending_disable = smp_processor_id(); + perf_event_disable_inatomic(handle->event); perf_output_wakeup(handle); } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8d783b5882b6..4c965ba77f9f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -29,6 +29,7 @@ #include <linux/workqueue.h> #include <linux/srcu.h> #include <linux/oom.h> /* check_stable_address_space */ +#include <linux/pagewalk.h> #include <linux/uprobes.h> @@ -152,91 +153,6 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) } /** - * __replace_page - replace page in vma by new page. - * based on replace_page in mm/ksm.c - * - * @vma: vma that holds the pte pointing to page - * @addr: address the old @page is mapped at - * @old_page: the page we are replacing by new_page - * @new_page: the modified page we replace page by - * - * If @new_page is NULL, only unmap @old_page. - * - * Returns 0 on success, negative error code otherwise. - */ -static int __replace_page(struct vm_area_struct *vma, unsigned long addr, - struct page *old_page, struct page *new_page) -{ - struct folio *old_folio = page_folio(old_page); - struct folio *new_folio; - struct mm_struct *mm = vma->vm_mm; - DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0); - int err; - struct mmu_notifier_range range; - pte_t pte; - - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, - addr + PAGE_SIZE); - - if (new_page) { - new_folio = page_folio(new_page); - err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL); - if (err) - return err; - } - - /* For folio_free_swap() below */ - folio_lock(old_folio); - - mmu_notifier_invalidate_range_start(&range); - err = -EAGAIN; - if (!page_vma_mapped_walk(&pvmw)) - goto unlock; - VM_BUG_ON_PAGE(addr != pvmw.address, old_page); - pte = ptep_get(pvmw.pte); - - /* - * Handle PFN swap PTES, such as device-exclusive ones, that actually - * map pages: simply trigger GUP again to fix it up. - */ - if (unlikely(!pte_present(pte))) { - page_vma_mapped_walk_done(&pvmw); - goto unlock; - } - - if (new_page) { - folio_get(new_folio); - folio_add_new_anon_rmap(new_folio, vma, addr, RMAP_EXCLUSIVE); - folio_add_lru_vma(new_folio, vma); - } else - /* no new page, just dec_mm_counter for old_page */ - dec_mm_counter(mm, MM_ANONPAGES); - - if (!folio_test_anon(old_folio)) { - dec_mm_counter(mm, mm_counter_file(old_folio)); - inc_mm_counter(mm, MM_ANONPAGES); - } - - flush_cache_page(vma, addr, pte_pfn(pte)); - ptep_clear_flush(vma, addr, pvmw.pte); - if (new_page) - set_pte_at(mm, addr, pvmw.pte, - mk_pte(new_page, vma->vm_page_prot)); - - folio_remove_rmap_pte(old_folio, old_page, vma); - if (!folio_mapped(old_folio)) - folio_free_swap(old_folio); - page_vma_mapped_walk_done(&pvmw); - folio_put(old_folio); - - err = 0; - unlock: - mmu_notifier_invalidate_range_end(&range); - folio_unlock(old_folio); - return err; -} - -/** * is_swbp_insn - check if instruction is breakpoint instruction. * @insn: instruction to be checked. * Default implementation of is_swbp_insn @@ -463,6 +379,95 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, return ret; } +static bool orig_page_is_identical(struct vm_area_struct *vma, + unsigned long vaddr, struct page *page, bool *pmd_mappable) +{ + const pgoff_t index = vaddr_to_offset(vma, vaddr) >> PAGE_SHIFT; + struct folio *orig_folio = filemap_get_folio(vma->vm_file->f_mapping, + index); + struct page *orig_page; + bool identical; + + if (IS_ERR(orig_folio)) + return false; + orig_page = folio_file_page(orig_folio, index); + + *pmd_mappable = folio_test_pmd_mappable(orig_folio); + identical = folio_test_uptodate(orig_folio) && + pages_identical(page, orig_page); + folio_put(orig_folio); + return identical; +} + +static int __uprobe_write_opcode(struct vm_area_struct *vma, + struct folio_walk *fw, struct folio *folio, + unsigned long opcode_vaddr, uprobe_opcode_t opcode) +{ + const unsigned long vaddr = opcode_vaddr & PAGE_MASK; + const bool is_register = !!is_swbp_insn(&opcode); + bool pmd_mappable; + + /* For now, we'll only handle PTE-mapped folios. */ + if (fw->level != FW_LEVEL_PTE) + return -EFAULT; + + /* + * See can_follow_write_pte(): we'd actually prefer a writable PTE here, + * but the VMA might not be writable. + */ + if (!pte_write(fw->pte)) { + if (!PageAnonExclusive(fw->page)) + return -EFAULT; + if (unlikely(userfaultfd_pte_wp(vma, fw->pte))) + return -EFAULT; + /* SOFTDIRTY is handled via pte_mkdirty() below. */ + } + + /* + * We'll temporarily unmap the page and flush the TLB, such that we can + * modify the page atomically. + */ + flush_cache_page(vma, vaddr, pte_pfn(fw->pte)); + fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep); + copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + + /* + * When unregistering, we may only zap a PTE if uffd is disabled and + * there are no unexpected folio references ... + */ + if (is_register || userfaultfd_missing(vma) || + (folio_ref_count(folio) != folio_mapcount(folio) + 1 + + folio_test_swapcache(folio) * folio_nr_pages(folio))) + goto remap; + + /* + * ... and the mapped page is identical to the original page that + * would get faulted in on next access. + */ + if (!orig_page_is_identical(vma, vaddr, fw->page, &pmd_mappable)) + goto remap; + + dec_mm_counter(vma->vm_mm, MM_ANONPAGES); + folio_remove_rmap_pte(folio, fw->page, vma); + if (!folio_mapped(folio) && folio_test_swapcache(folio) && + folio_trylock(folio)) { + folio_free_swap(folio); + folio_unlock(folio); + } + folio_put(folio); + + return pmd_mappable; +remap: + /* + * Make sure that our copy_to_page() changes become visible before the + * set_pte_at() write. + */ + smp_wmb(); + /* We modified the page. Make sure to mark the PTE dirty. */ + set_pte_at(vma->vm_mm, vaddr, fw->ptep, pte_mkdirty(fw->pte)); + return 0; +} + /* * NOTE: * Expect the breakpoint instruction to be the smallest size instruction for @@ -474,146 +479,146 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, * * uprobe_write_opcode - write the opcode at a given virtual address. * @auprobe: arch specific probepoint information. - * @mm: the probed process address space. - * @vaddr: the virtual address to store the opcode. - * @opcode: opcode to be written at @vaddr. + * @vma: the probed virtual memory area. + * @opcode_vaddr: the virtual address to store the opcode. + * @opcode: opcode to be written at @opcode_vaddr. * * Called with mm->mmap_lock held for read or write. * Return 0 (success) or a negative errno. */ -int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, - unsigned long vaddr, uprobe_opcode_t opcode) +int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + const unsigned long opcode_vaddr, uprobe_opcode_t opcode) { + const unsigned long vaddr = opcode_vaddr & PAGE_MASK; + struct mm_struct *mm = vma->vm_mm; struct uprobe *uprobe; - struct page *old_page, *new_page; - struct vm_area_struct *vma; int ret, is_register, ref_ctr_updated = 0; - bool orig_page_huge = false; unsigned int gup_flags = FOLL_FORCE; + struct mmu_notifier_range range; + struct folio_walk fw; + struct folio *folio; + struct page *page; is_register = is_swbp_insn(&opcode); uprobe = container_of(auprobe, struct uprobe, arch); -retry: + if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags))) + return -EINVAL; + + /* + * When registering, we have to break COW to get an exclusive anonymous + * page that we can safely modify. Use FOLL_WRITE to trigger a write + * fault if required. When unregistering, we might be lucky and the + * anon page is already gone. So defer write faults until really + * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode() + * cannot deal with PMDs yet. + */ if (is_register) - gup_flags |= FOLL_SPLIT_PMD; - /* Read the page with vaddr into memory */ - old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); - if (IS_ERR(old_page)) - return PTR_ERR(old_page); + gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD; - ret = verify_opcode(old_page, vaddr, &opcode); +retry: + ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &page, NULL); if (ret <= 0) - goto put_old; - - if (is_zero_page(old_page)) { - ret = -EINVAL; - goto put_old; - } + goto out; + folio = page_folio(page); - if (WARN(!is_register && PageCompound(old_page), - "uprobe unregister should never work on compound page\n")) { - ret = -EINVAL; - goto put_old; + ret = verify_opcode(page, opcode_vaddr, &opcode); + if (ret <= 0) { + folio_put(folio); + goto out; } /* We are going to replace instruction, update ref_ctr. */ if (!ref_ctr_updated && uprobe->ref_ctr_offset) { ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); - if (ret) - goto put_old; + if (ret) { + folio_put(folio); + goto out; + } ref_ctr_updated = 1; } ret = 0; - if (!is_register && !PageAnon(old_page)) - goto put_old; - - ret = anon_vma_prepare(vma); - if (ret) - goto put_old; - - ret = -ENOMEM; - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); - if (!new_page) - goto put_old; - - __SetPageUptodate(new_page); - copy_highpage(new_page, old_page); - copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + if (unlikely(!folio_test_anon(folio))) { + VM_WARN_ON_ONCE(is_register); + folio_put(folio); + goto out; + } if (!is_register) { - struct page *orig_page; - pgoff_t index; - - VM_BUG_ON_PAGE(!PageAnon(old_page), old_page); - - index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT; - orig_page = find_get_page(vma->vm_file->f_inode->i_mapping, - index); - - if (orig_page) { - if (PageUptodate(orig_page) && - pages_identical(new_page, orig_page)) { - /* let go new_page */ - put_page(new_page); - new_page = NULL; - - if (PageCompound(orig_page)) - orig_page_huge = true; - } - put_page(orig_page); - } + /* + * In the common case, we'll be able to zap the page when + * unregistering. So trigger MMU notifiers now, as we won't + * be able to do it under PTL. + */ + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, + vaddr, vaddr + PAGE_SIZE); + mmu_notifier_invalidate_range_start(&range); } - ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page); - if (new_page) - put_page(new_page); -put_old: - put_page(old_page); + ret = -EAGAIN; + /* Walk the page tables again, to perform the actual update. */ + if (folio_walk_start(&fw, vma, vaddr, 0)) { + if (fw.page == page) + ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode); + folio_walk_end(&fw, vma); + } + + if (!is_register) + mmu_notifier_invalidate_range_end(&range); - if (unlikely(ret == -EAGAIN)) + folio_put(folio); + switch (ret) { + case -EFAULT: + gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD; + fallthrough; + case -EAGAIN: goto retry; + default: + break; + } +out: /* Revert back reference counter if instruction update failed. */ - if (ret && is_register && ref_ctr_updated) + if (ret < 0 && is_register && ref_ctr_updated) update_ref_ctr(uprobe, mm, -1); /* try collapse pmd for compound page */ - if (!ret && orig_page_huge) + if (ret > 0) collapse_pte_mapped_thp(mm, vaddr, false); - return ret; + return ret < 0 ? ret : 0; } /** * set_swbp - store breakpoint at a given address. * @auprobe: arch specific probepoint information. - * @mm: the probed process address space. + * @vma: the probed virtual memory area. * @vaddr: the virtual address to insert the opcode. * * For mm @mm, store the breakpoint instruction at @vaddr. * Return 0 (success) or a negative errno. */ -int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) +int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) { - return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN); + return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN); } /** * set_orig_insn - Restore the original instruction. - * @mm: the probed process address space. + * @vma: the probed virtual memory area. * @auprobe: arch specific probepoint information. * @vaddr: the virtual address to insert the opcode. * * For mm @mm, restore the original opcode (opcode) at @vaddr. * Return 0 (success) or a negative errno. */ -int __weak -set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) +int __weak set_orig_insn(struct arch_uprobe *auprobe, + struct vm_area_struct *vma, unsigned long vaddr) { - return uprobe_write_opcode(auprobe, mm, vaddr, + return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn); } @@ -1134,10 +1139,10 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm) return ret; } -static int -install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long vaddr) +static int install_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, + unsigned long vaddr) { + struct mm_struct *mm = vma->vm_mm; bool first_uprobe; int ret; @@ -1153,7 +1158,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (first_uprobe) set_bit(MMF_HAS_UPROBES, &mm->flags); - ret = set_swbp(&uprobe->arch, mm, vaddr); + ret = set_swbp(&uprobe->arch, vma, vaddr); if (!ret) clear_bit(MMF_RECALC_UPROBES, &mm->flags); else if (first_uprobe) @@ -1162,11 +1167,13 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, return ret; } -static int -remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) +static int remove_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, + unsigned long vaddr) { + struct mm_struct *mm = vma->vm_mm; + set_bit(MMF_RECALC_UPROBES, &mm->flags); - return set_orig_insn(&uprobe->arch, mm, vaddr); + return set_orig_insn(&uprobe->arch, vma, vaddr); } struct map_info { @@ -1296,10 +1303,10 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) if (is_register) { /* consult only the "caller", new consumer. */ if (consumer_filter(new, mm)) - err = install_breakpoint(uprobe, mm, vma, info->vaddr); + err = install_breakpoint(uprobe, vma, info->vaddr); } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { if (!filter_chain(uprobe, mm)) - err |= remove_breakpoint(uprobe, mm, info->vaddr); + err |= remove_breakpoint(uprobe, vma, info->vaddr); } unlock: @@ -1472,7 +1479,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) continue; vaddr = offset_to_vaddr(vma, uprobe->offset); - err |= remove_breakpoint(uprobe, mm, vaddr); + err |= remove_breakpoint(uprobe, vma, vaddr); } mmap_read_unlock(mm); @@ -1610,7 +1617,7 @@ int uprobe_mmap(struct vm_area_struct *vma) if (!fatal_signal_pending(current) && filter_chain(uprobe, vma->vm_mm)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); - install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); + install_breakpoint(uprobe, vma, vaddr); } put_uprobe(uprobe); } |