summaryrefslogtreecommitdiff
path: root/kernel/events
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/events')
-rw-r--r--kernel/events/core.c857
-rw-r--r--kernel/events/ring_buffer.c33
-rw-r--r--kernel/events/uprobes.c376
3 files changed, 779 insertions, 487 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0bb21659e252..8060c2857bb2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -207,6 +207,19 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
__perf_ctx_unlock(&cpuctx->ctx);
}
+typedef struct {
+ struct perf_cpu_context *cpuctx;
+ struct perf_event_context *ctx;
+} class_perf_ctx_lock_t;
+
+static inline void class_perf_ctx_lock_destructor(class_perf_ctx_lock_t *_T)
+{ perf_ctx_unlock(_T->cpuctx, _T->ctx); }
+
+static inline class_perf_ctx_lock_t
+class_perf_ctx_lock_constructor(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{ perf_ctx_lock(cpuctx, ctx); return (class_perf_ctx_lock_t){ cpuctx, ctx }; }
+
#define TASK_TOMBSTONE ((void *)-1L)
static bool is_kernel_event(struct perf_event *event)
@@ -938,13 +951,19 @@ static void perf_cgroup_switch(struct task_struct *task)
if (READ_ONCE(cpuctx->cgrp) == NULL)
return;
- WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
-
cgrp = perf_cgroup_from_task(task, NULL);
if (READ_ONCE(cpuctx->cgrp) == cgrp)
return;
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx);
+ /*
+ * Re-check, could've raced vs perf_remove_from_context().
+ */
+ if (READ_ONCE(cpuctx->cgrp) == NULL)
+ return;
+
+ WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
+
perf_ctx_disable(&cpuctx->ctx, true);
ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
@@ -962,7 +981,6 @@ static void perf_cgroup_switch(struct task_struct *task)
ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
perf_ctx_enable(&cpuctx->ctx, true);
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
static int perf_cgroup_ensure_storage(struct perf_event *event,
@@ -1270,6 +1288,10 @@ static void put_ctx(struct perf_event_context *ctx)
if (ctx->task && ctx->task != TASK_TOMBSTONE)
put_task_struct(ctx->task);
call_rcu(&ctx->rcu_head, free_ctx);
+ } else {
+ smp_mb__after_atomic(); /* pairs with wait_var_event() */
+ if (ctx->task == TASK_TOMBSTONE)
+ wake_up_var(&ctx->refcount);
}
}
@@ -2116,18 +2138,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
if (event->group_leader == event)
del_event_from_groups(event, ctx);
- /*
- * If event was in error state, then keep it
- * that way, otherwise bogus counts will be
- * returned on read(). The only way to get out
- * of error state is by explicit re-enabling
- * of the event
- */
- if (event->state > PERF_EVENT_STATE_OFF) {
- perf_cgroup_event_disable(event, ctx);
- perf_event_set_state(event, PERF_EVENT_STATE_OFF);
- }
-
ctx->generation++;
event->pmu_ctx->nr_events--;
}
@@ -2145,8 +2155,9 @@ perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
}
static void put_event(struct perf_event *event);
-static void event_sched_out(struct perf_event *event,
- struct perf_event_context *ctx);
+static void __event_disable(struct perf_event *event,
+ struct perf_event_context *ctx,
+ enum perf_event_state state);
static void perf_put_aux_event(struct perf_event *event)
{
@@ -2167,7 +2178,7 @@ static void perf_put_aux_event(struct perf_event *event)
* If the event is an aux_event, tear down all links to
* it from other events.
*/
- for_each_sibling_event(iter, event->group_leader) {
+ for_each_sibling_event(iter, event) {
if (iter->aux_event != event)
continue;
@@ -2179,8 +2190,7 @@ static void perf_put_aux_event(struct perf_event *event)
* state so that we don't try to schedule it again. Note
* that perf_event_enable() will clear the ERROR status.
*/
- event_sched_out(iter, ctx);
- perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+ __event_disable(iter, ctx, PERF_EVENT_STATE_ERROR);
}
}
@@ -2238,18 +2248,6 @@ static inline struct list_head *get_event_list(struct perf_event *event)
&event->pmu_ctx->flexible_active;
}
-/*
- * Events that have PERF_EV_CAP_SIBLING require being part of a group and
- * cannot exist on their own, schedule them out and move them into the ERROR
- * state. Also see _perf_event_enable(), it will not be able to recover
- * this ERROR state.
- */
-static inline void perf_remove_sibling_event(struct perf_event *event)
-{
- event_sched_out(event, event->ctx);
- perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
-}
-
static void perf_group_detach(struct perf_event *event)
{
struct perf_event *leader = event->group_leader;
@@ -2285,8 +2283,15 @@ static void perf_group_detach(struct perf_event *event)
*/
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
+ /*
+ * Events that have PERF_EV_CAP_SIBLING require being part of
+ * a group and cannot exist on their own, schedule them out
+ * and move them into the ERROR state. Also see
+ * _perf_event_enable(), it will not be able to recover this
+ * ERROR state.
+ */
if (sibling->event_caps & PERF_EV_CAP_SIBLING)
- perf_remove_sibling_event(sibling);
+ __event_disable(sibling, ctx, PERF_EVENT_STATE_ERROR);
sibling->group_leader = sibling;
list_del_init(&sibling->sibling_list);
@@ -2325,7 +2330,11 @@ static void perf_child_detach(struct perf_event *event)
if (WARN_ON_ONCE(!parent_event))
return;
+ /*
+ * Can't check this from an IPI, the holder is likey another CPU.
+ *
lockdep_assert_held(&parent_event->child_mutex);
+ */
sync_child_event(event);
list_del_init(&event->child_list);
@@ -2343,6 +2352,11 @@ event_filter_match(struct perf_event *event)
perf_cgroup_match(event);
}
+static inline bool is_event_in_freq_mode(struct perf_event *event)
+{
+ return event->attr.freq && event->attr.sample_freq;
+}
+
static void
event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
{
@@ -2380,7 +2394,7 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
if (!is_software_event(event))
cpc->active_oncpu--;
- if (event->attr.freq && event->attr.sample_freq) {
+ if (is_event_in_freq_mode(event)) {
ctx->nr_freq--;
epc->nr_freq--;
}
@@ -2450,7 +2464,9 @@ ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
#define DETACH_GROUP 0x01UL
#define DETACH_CHILD 0x02UL
-#define DETACH_DEAD 0x04UL
+#define DETACH_EXIT 0x04UL
+#define DETACH_REVOKE 0x08UL
+#define DETACH_DEAD 0x10UL
/*
* Cross CPU call to remove a performance event
@@ -2465,6 +2481,7 @@ __perf_remove_from_context(struct perf_event *event,
void *info)
{
struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
+ enum perf_event_state state = PERF_EVENT_STATE_OFF;
unsigned long flags = (unsigned long)info;
ctx_time_update(cpuctx, ctx);
@@ -2473,16 +2490,25 @@ __perf_remove_from_context(struct perf_event *event,
* Ensure event_sched_out() switches to OFF, at the very least
* this avoids raising perf_pending_task() at this time.
*/
+ if (flags & DETACH_EXIT)
+ state = PERF_EVENT_STATE_EXIT;
+ if (flags & DETACH_REVOKE)
+ state = PERF_EVENT_STATE_REVOKED;
if (flags & DETACH_DEAD)
- event->pending_disable = 1;
+ state = PERF_EVENT_STATE_DEAD;
+
event_sched_out(event, ctx);
+
+ if (event->state > PERF_EVENT_STATE_OFF)
+ perf_cgroup_event_disable(event, ctx);
+
+ perf_event_set_state(event, min(event->state, state));
+
if (flags & DETACH_GROUP)
perf_group_detach(event);
if (flags & DETACH_CHILD)
perf_child_detach(event);
list_del_event(event, ctx);
- if (flags & DETACH_DEAD)
- event->state = PERF_EVENT_STATE_DEAD;
if (!pmu_ctx->nr_events) {
pmu_ctx->rotate_necessary = 0;
@@ -2540,6 +2566,15 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla
event_function_call(event, __perf_remove_from_context, (void *)flags);
}
+static void __event_disable(struct perf_event *event,
+ struct perf_event_context *ctx,
+ enum perf_event_state state)
+{
+ event_sched_out(event, ctx);
+ perf_cgroup_event_disable(event, ctx);
+ perf_event_set_state(event, state);
+}
+
/*
* Cross CPU call to disable a performance event
*/
@@ -2554,13 +2589,18 @@ static void __perf_event_disable(struct perf_event *event,
perf_pmu_disable(event->pmu_ctx->pmu);
ctx_time_update_event(ctx, event);
+ /*
+ * When disabling a group leader, the whole group becomes ineligible
+ * to run, so schedule out the full group.
+ */
if (event == event->group_leader)
group_sched_out(event, ctx);
- else
- event_sched_out(event, ctx);
- perf_event_set_state(event, PERF_EVENT_STATE_OFF);
- perf_cgroup_event_disable(event, ctx);
+ /*
+ * But only mark the leader OFF; the siblings will remain
+ * INACTIVE.
+ */
+ __event_disable(event, ctx, PERF_EVENT_STATE_OFF);
perf_pmu_enable(event->pmu_ctx->pmu);
}
@@ -2623,6 +2663,41 @@ void perf_event_disable_inatomic(struct perf_event *event)
static void perf_log_throttle(struct perf_event *event, int enable);
static void perf_log_itrace_start(struct perf_event *event);
+static void perf_event_unthrottle(struct perf_event *event, bool start)
+{
+ event->hw.interrupts = 0;
+ if (start)
+ event->pmu->start(event, 0);
+ if (event == event->group_leader)
+ perf_log_throttle(event, 1);
+}
+
+static void perf_event_throttle(struct perf_event *event)
+{
+ event->hw.interrupts = MAX_INTERRUPTS;
+ event->pmu->stop(event, 0);
+ if (event == event->group_leader)
+ perf_log_throttle(event, 0);
+}
+
+static void perf_event_unthrottle_group(struct perf_event *event, bool skip_start_event)
+{
+ struct perf_event *sibling, *leader = event->group_leader;
+
+ perf_event_unthrottle(leader, skip_start_event ? leader != event : true);
+ for_each_sibling_event(sibling, leader)
+ perf_event_unthrottle(sibling, skip_start_event ? sibling != event : true);
+}
+
+static void perf_event_throttle_group(struct perf_event *event)
+{
+ struct perf_event *sibling, *leader = event->group_leader;
+
+ perf_event_throttle(leader);
+ for_each_sibling_event(sibling, leader)
+ perf_event_throttle(sibling);
+}
+
static int
event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
{
@@ -2651,10 +2726,8 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
* ticks already, also for a heavily scheduling task there is little
* guarantee it'll get a tick in a timely manner.
*/
- if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
- perf_log_throttle(event, 1);
- event->hw.interrupts = 0;
- }
+ if (unlikely(event->hw.interrupts == MAX_INTERRUPTS))
+ perf_event_unthrottle(event, false);
perf_pmu_disable(event->pmu);
@@ -2669,7 +2742,7 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
if (!is_software_event(event))
cpc->active_oncpu++;
- if (event->attr.freq && event->attr.sample_freq) {
+ if (is_event_in_freq_mode(event)) {
ctx->nr_freq++;
epc->nr_freq++;
}
@@ -3938,7 +4011,7 @@ static int merge_sched_in(struct perf_event *event, void *data)
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
if (*perf_event_fasync(event))
- event->pending_kill = POLL_HUP;
+ event->pending_kill = POLL_ERR;
perf_event_wakeup(event);
} else {
@@ -4232,14 +4305,10 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list)
hwc = &event->hw;
- if (hwc->interrupts == MAX_INTERRUPTS) {
- hwc->interrupts = 0;
- perf_log_throttle(event, 1);
- if (!event->attr.freq || !event->attr.sample_freq)
- event->pmu->start(event, 0);
- }
+ if (hwc->interrupts == MAX_INTERRUPTS)
+ perf_event_unthrottle_group(event, is_event_in_freq_mode(event));
- if (!event->attr.freq || !event->attr.sample_freq)
+ if (!is_event_in_freq_mode(event))
continue;
/*
@@ -4511,7 +4580,8 @@ out:
static void perf_remove_from_owner(struct perf_event *event);
static void perf_event_exit_event(struct perf_event *event,
- struct perf_event_context *ctx);
+ struct perf_event_context *ctx,
+ bool revoke);
/*
* Removes all events from the current task that have been marked
@@ -4538,7 +4608,7 @@ static void perf_event_remove_on_exec(struct perf_event_context *ctx)
modified = true;
- perf_event_exit_event(event, ctx);
+ perf_event_exit_event(event, ctx, false);
}
raw_spin_lock_irqsave(&ctx->lock, flags);
@@ -5120,6 +5190,7 @@ static bool is_sb_event(struct perf_event *event)
attr->context_switch || attr->text_poke ||
attr->bpf_event)
return true;
+
return false;
}
@@ -5513,33 +5584,11 @@ static bool exclusive_event_installable(struct perf_event *event,
static void perf_free_addr_filters(struct perf_event *event);
-static void perf_pending_task_sync(struct perf_event *event)
-{
- struct callback_head *head = &event->pending_task;
-
- if (!event->pending_work)
- return;
- /*
- * If the task is queued to the current task's queue, we
- * obviously can't wait for it to complete. Simply cancel it.
- */
- if (task_work_cancel(current, head)) {
- event->pending_work = 0;
- local_dec(&event->ctx->nr_no_switch_fast);
- return;
- }
-
- /*
- * All accesses related to the event are within the same RCU section in
- * perf_pending_task(). The RCU grace period before the event is freed
- * will make sure all those accesses are complete by then.
- */
- rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE);
-}
-
/* vs perf_event_alloc() error */
static void __free_event(struct perf_event *event)
{
+ struct pmu *pmu = event->pmu;
+
if (event->attach_state & PERF_ATTACH_CALLCHAIN)
put_callchain_buffers();
@@ -5569,6 +5618,7 @@ static void __free_event(struct perf_event *event)
* put_pmu_ctx() needs an event->ctx reference, because of
* epc->ctx.
*/
+ WARN_ON_ONCE(!pmu);
WARN_ON_ONCE(!event->ctx);
WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx);
put_pmu_ctx(event->pmu_ctx);
@@ -5581,8 +5631,13 @@ static void __free_event(struct perf_event *event)
if (event->ctx)
put_ctx(event->ctx);
- if (event->pmu)
- module_put(event->pmu->module);
+ if (pmu) {
+ module_put(pmu->module);
+ scoped_guard (spinlock, &pmu->events_lock) {
+ list_del(&event->pmu_list);
+ wake_up_var(pmu);
+ }
+ }
call_rcu(&event->rcu_head, free_event_rcu);
}
@@ -5594,7 +5649,6 @@ static void _free_event(struct perf_event *event)
{
irq_work_sync(&event->pending_irq);
irq_work_sync(&event->pending_disable_irq);
- perf_pending_task_sync(event);
unaccount_event(event);
@@ -5620,13 +5674,13 @@ static void _free_event(struct perf_event *event)
/*
* Used to free events which have a known refcount of 1, such as in error paths
- * where the event isn't exposed yet and inherited events.
+ * of inherited events.
*/
static void free_event(struct perf_event *event)
{
if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
- "unexpected event refcount: %ld; ptr=%p\n",
- atomic_long_read(&event->refcount), event)) {
+ "unexpected event refcount: %ld; ptr=%p\n",
+ atomic_long_read(&event->refcount), event)) {
/* leak to avoid use-after-free */
return;
}
@@ -5687,10 +5741,17 @@ static void perf_remove_from_owner(struct perf_event *event)
static void put_event(struct perf_event *event)
{
+ struct perf_event *parent;
+
if (!atomic_long_dec_and_test(&event->refcount))
return;
+ parent = event->parent;
_free_event(event);
+
+ /* Matches the refcount bump in inherit_event() */
+ if (parent)
+ put_event(parent);
}
/*
@@ -5702,7 +5763,6 @@ int perf_event_release_kernel(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
struct perf_event *child, *tmp;
- LIST_HEAD(free_list);
/*
* If we got here through err_alloc: free_event(event); we will not
@@ -5731,15 +5791,17 @@ int perf_event_release_kernel(struct perf_event *event)
* Thus this guarantees that we will in fact observe and kill _ALL_
* child events.
*/
- perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
+ if (event->state > PERF_EVENT_STATE_REVOKED) {
+ perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
+ } else {
+ event->state = PERF_EVENT_STATE_DEAD;
+ }
perf_event_ctx_unlock(event, ctx);
again:
mutex_lock(&event->child_mutex);
list_for_each_entry(child, &event->child_list, child_list) {
- void *var = NULL;
-
/*
* Cannot change, child events are not migrated, see the
* comment with perf_event_ctx_lock_nested().
@@ -5772,50 +5834,30 @@ again:
tmp = list_first_entry_or_null(&event->child_list,
struct perf_event, child_list);
if (tmp == child) {
- perf_remove_from_context(child, DETACH_GROUP);
- list_move(&child->child_list, &free_list);
- /*
- * This matches the refcount bump in inherit_event();
- * this can't be the last reference.
- */
- put_event(event);
+ perf_remove_from_context(child, DETACH_GROUP | DETACH_CHILD);
} else {
- var = &ctx->refcount;
+ child = NULL;
}
mutex_unlock(&event->child_mutex);
mutex_unlock(&ctx->mutex);
- put_ctx(ctx);
- if (var) {
- /*
- * If perf_event_free_task() has deleted all events from the
- * ctx while the child_mutex got released above, make sure to
- * notify about the preceding put_ctx().
- */
- smp_mb(); /* pairs with wait_var_event() */
- wake_up_var(var);
+ if (child) {
+ /* Last reference unless ->pending_task work is pending */
+ put_event(child);
}
+ put_ctx(ctx);
+
goto again;
}
mutex_unlock(&event->child_mutex);
- list_for_each_entry_safe(child, tmp, &free_list, child_list) {
- void *var = &child->ctx->refcount;
-
- list_del(&child->child_list);
- free_event(child);
-
- /*
- * Wake any perf_event_free_task() waiting for this event to be
- * freed.
- */
- smp_mb(); /* pairs with wait_var_event() */
- wake_up_var(var);
- }
-
no_ctx:
- put_event(event); /* Must be the 'last' reference */
+ /*
+ * Last reference unless ->pending_task work is pending on this event
+ * or any of its children.
+ */
+ put_event(event);
return 0;
}
EXPORT_SYMBOL_GPL(perf_event_release_kernel);
@@ -6081,14 +6123,20 @@ static __poll_t perf_poll(struct file *file, poll_table *wait)
struct perf_buffer *rb;
__poll_t events = EPOLLHUP;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return EPOLLERR;
+
poll_wait(file, &event->waitq, wait);
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return EPOLLERR;
+
if (is_event_hup(event))
return events;
if (unlikely(READ_ONCE(event->state) == PERF_EVENT_STATE_ERROR &&
event->attr.pinned))
- return events;
+ return EPOLLERR;
/*
* Pin the event->rb by taking event->mmap_mutex; otherwise
@@ -6180,14 +6228,6 @@ static void __perf_event_period(struct perf_event *event,
active = (event->state == PERF_EVENT_STATE_ACTIVE);
if (active) {
perf_pmu_disable(event->pmu);
- /*
- * We could be throttled; unthrottle now to avoid the tick
- * trying to unthrottle while we already re-started the event.
- */
- if (event->hw.interrupts == MAX_INTERRUPTS) {
- event->hw.interrupts = 0;
- perf_log_throttle(event, 1);
- }
event->pmu->stop(event, PERF_EF_UPDATE);
}
@@ -6195,6 +6235,14 @@ static void __perf_event_period(struct perf_event *event,
if (active) {
event->pmu->start(event, PERF_EF_RELOAD);
+ /*
+ * Once the period is force-reset, the event starts immediately.
+ * But the event/group could be throttled. Unthrottle the
+ * event/group now to avoid the next tick trying to unthrottle
+ * while we already re-started the event/group.
+ */
+ if (event->hw.interrupts == MAX_INTERRUPTS)
+ perf_event_unthrottle_group(event, true);
perf_pmu_enable(event->pmu);
}
}
@@ -6252,12 +6300,18 @@ static int perf_event_set_output(struct perf_event *event,
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
static int perf_copy_attr(struct perf_event_attr __user *uattr,
struct perf_event_attr *attr);
+static int __perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie);
static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
void (*func)(struct perf_event *);
u32 flags = arg;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
switch (cmd) {
case PERF_EVENT_IOC_ENABLE:
func = _perf_event_enable;
@@ -6314,7 +6368,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
if (IS_ERR(prog))
return PTR_ERR(prog);
- err = perf_event_set_bpf_prog(event, prog, 0);
+ err = __perf_event_set_bpf_prog(event, prog, 0);
if (err) {
bpf_prog_put(prog);
return err;
@@ -6633,9 +6687,22 @@ void ring_buffer_put(struct perf_buffer *rb)
call_rcu(&rb->rcu_head, rb_free_rcu);
}
+typedef void (*mapped_f)(struct perf_event *event, struct mm_struct *mm);
+
+#define get_mapped(event, func) \
+({ struct pmu *pmu; \
+ mapped_f f = NULL; \
+ guard(rcu)(); \
+ pmu = READ_ONCE(event->pmu); \
+ if (pmu) \
+ f = pmu->func; \
+ f; \
+})
+
static void perf_mmap_open(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
+ mapped_f mapped = get_mapped(event, event_mapped);
atomic_inc(&event->mmap_count);
atomic_inc(&event->rb->mmap_count);
@@ -6643,8 +6710,8 @@ static void perf_mmap_open(struct vm_area_struct *vma)
if (vma->vm_pgoff)
atomic_inc(&event->rb->aux_mmap_count);
- if (event->pmu->event_mapped)
- event->pmu->event_mapped(event, vma->vm_mm);
+ if (mapped)
+ mapped(event, vma->vm_mm);
}
static void perf_pmu_output_stop(struct perf_event *event);
@@ -6660,14 +6727,16 @@ static void perf_pmu_output_stop(struct perf_event *event);
static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
+ mapped_f unmapped = get_mapped(event, event_unmapped);
struct perf_buffer *rb = ring_buffer_get(event);
struct user_struct *mmap_user = rb->mmap_user;
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);
bool detach_rest = false;
- if (event->pmu->event_unmapped)
- event->pmu->event_unmapped(event, vma->vm_mm);
+ /* FIXIES vs perf_pmu_unregister() */
+ if (unmapped)
+ unmapped(event, vma->vm_mm);
/*
* The AUX buffer is strictly a sub-buffer, serialize using aux_mutex
@@ -6773,10 +6842,20 @@ static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf)
return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS;
}
+static int perf_mmap_may_split(struct vm_area_struct *vma, unsigned long addr)
+{
+ /*
+ * Forbid splitting perf mappings to prevent refcount leaks due to
+ * the resulting non-matching offsets and sizes. See open()/close().
+ */
+ return -EINVAL;
+}
+
static const struct vm_operations_struct perf_mmap_vmops = {
.open = perf_mmap_open,
.close = perf_mmap_close, /* non mergeable */
.pfn_mkwrite = perf_mmap_pfn_mkwrite,
+ .may_split = perf_mmap_may_split,
};
static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma)
@@ -6860,6 +6939,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
unsigned long nr_pages;
long user_extra = 0, extra = 0;
int ret, flags = 0;
+ mapped_f mapped;
/*
* Don't allow mmap() of inherited per-task counters. This would
@@ -6890,6 +6970,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
mutex_lock(&event->mmap_mutex);
ret = -EINVAL;
+ /*
+ * This relies on __pmu_detach_event() taking mmap_mutex after marking
+ * the event REVOKED. Either we observe the state, or __pmu_detach_event()
+ * will detach the rb created here.
+ */
+ if (event->state <= PERF_EVENT_STATE_REVOKED) {
+ ret = -ENODEV;
+ goto unlock;
+ }
+
if (vma->vm_pgoff == 0) {
nr_pages -= 1;
@@ -6971,8 +7061,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
ret = 0;
goto unlock;
}
-
- atomic_set(&rb->aux_mmap_count, 1);
}
user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
@@ -7035,15 +7123,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
perf_event_update_time(event);
perf_event_init_userpage(event);
perf_event_update_userpage(event);
+ ret = 0;
} else {
ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
event->attr.aux_watermark, flags);
- if (!ret)
+ if (!ret) {
+ atomic_set(&rb->aux_mmap_count, 1);
rb->aux_mmap_locked = extra;
+ }
}
- ret = 0;
-
unlock:
if (!ret) {
atomic_long_add(user_extra, &user->locked_vm);
@@ -7051,6 +7140,7 @@ unlock:
atomic_inc(&event->mmap_count);
} else if (rb) {
+ /* AUX allocation failed */
atomic_dec(&rb->mmap_count);
}
aux_unlock:
@@ -7058,6 +7148,9 @@ aux_unlock:
mutex_unlock(aux_mutex);
mutex_unlock(&event->mmap_mutex);
+ if (ret)
+ return ret;
+
/*
* Since pinned accounting is per vm we cannot allow fork() to copy our
* vma.
@@ -7065,11 +7158,19 @@ aux_unlock:
vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
vma->vm_ops = &perf_mmap_vmops;
- if (!ret)
- ret = map_range(rb, vma);
+ mapped = get_mapped(event, event_mapped);
+ if (mapped)
+ mapped(event, vma->vm_mm);
- if (!ret && event->pmu->event_mapped)
- event->pmu->event_mapped(event, vma->vm_mm);
+ /*
+ * Try to map it into the page table. On fail, invoke
+ * perf_mmap_close() to undo the above, as the callsite expects
+ * full cleanup in this case and therefore does not invoke
+ * vmops::close().
+ */
+ ret = map_range(rb, vma);
+ if (ret)
+ perf_mmap_close(vma);
return ret;
}
@@ -7080,6 +7181,9 @@ static int perf_fasync(int fd, struct file *filp, int on)
struct perf_event *event = filp->private_data;
int retval;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
inode_lock(inode);
retval = fasync_helper(fd, filp, on, &event->fasync);
inode_unlock(inode);
@@ -7120,18 +7224,18 @@ void perf_event_wakeup(struct perf_event *event)
static void perf_sigtrap(struct perf_event *event)
{
/*
- * We'd expect this to only occur if the irq_work is delayed and either
- * ctx->task or current has changed in the meantime. This can be the
- * case on architectures that do not implement arch_irq_work_raise().
+ * Both perf_pending_task() and perf_pending_irq() can race with the
+ * task exiting.
*/
- if (WARN_ON_ONCE(event->ctx->task != current))
+ if (current->flags & PF_EXITING)
return;
/*
- * Both perf_pending_task() and perf_pending_irq() can race with the
- * task exiting.
+ * We'd expect this to only occur if the irq_work is delayed and either
+ * ctx->task or current has changed in the meantime. This can be the
+ * case on architectures that do not implement arch_irq_work_raise().
*/
- if (current->flags & PF_EXITING)
+ if (WARN_ON_ONCE(event->ctx->task != current))
return;
send_sig_perf((void __user *)event->pending_addr,
@@ -7167,15 +7271,15 @@ static void __perf_pending_disable(struct perf_event *event)
* CPU-A CPU-B
*
* perf_event_disable_inatomic()
- * @pending_disable = CPU-A;
+ * @pending_disable = 1;
* irq_work_queue();
*
* sched-out
- * @pending_disable = -1;
+ * @pending_disable = 0;
*
* sched-in
* perf_event_disable_inatomic()
- * @pending_disable = CPU-B;
+ * @pending_disable = 1;
* irq_work_queue(); // FAILS
*
* irq_work_run()
@@ -7231,12 +7335,6 @@ static void perf_pending_task(struct callback_head *head)
int rctx;
/*
- * All accesses to the event must belong to the same implicit RCU read-side
- * critical section as the ->pending_work reset. See comment in
- * perf_pending_task_sync().
- */
- rcu_read_lock();
- /*
* If we 'fail' here, that's OK, it means recursion is already disabled
* and we won't recurse 'further'.
*/
@@ -7246,9 +7344,8 @@ static void perf_pending_task(struct callback_head *head)
event->pending_work = 0;
perf_sigtrap(event);
local_dec(&event->ctx->nr_no_switch_fast);
- rcuwait_wake_up(&event->pending_work_wait);
}
- rcu_read_unlock();
+ put_event(event);
if (rctx >= 0)
perf_swevent_put_recursion_context(rctx);
@@ -7380,6 +7477,10 @@ perf_sample_ustack_size(u16 stack_size, u16 header_size,
if (!regs)
return 0;
+ /* No mm, no stack, no dump. */
+ if (!current->mm)
+ return 0;
+
/*
* Check if we fit in with the requested stack size into the:
* - TASK_SIZE
@@ -8091,6 +8192,9 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
const u32 max_stack = event->attr.sample_max_stack;
struct perf_callchain_entry *callchain;
+ if (!current->mm)
+ user = false;
+
if (!kernel && !user)
return &__empty_callchain;
@@ -9966,7 +10070,7 @@ void perf_event_text_poke(const void *addr, const void *old_bytes,
void perf_event_itrace_started(struct perf_event *event)
{
- event->attach_state |= PERF_ATTACH_ITRACE;
+ WRITE_ONCE(event->attach_state, event->attach_state | PERF_ATTACH_ITRACE);
}
static void perf_log_itrace_start(struct perf_event *event)
@@ -10049,14 +10153,13 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle)
hwc->interrupts = 1;
} else {
hwc->interrupts++;
- if (unlikely(throttle &&
- hwc->interrupts > max_samples_per_tick)) {
- __this_cpu_inc(perf_throttled_count);
- tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
- hwc->interrupts = MAX_INTERRUPTS;
- perf_log_throttle(event, 0);
- ret = 1;
- }
+ }
+
+ if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) {
+ __this_cpu_inc(perf_throttled_count);
+ tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
+ perf_event_throttle_group(event);
+ ret = 1;
}
if (event->attr.freq) {
@@ -10243,6 +10346,7 @@ static int __perf_event_overflow(struct perf_event *event,
!task_work_add(current, &event->pending_task, notify_mode)) {
event->pending_work = pending_id;
local_inc(&event->ctx->nr_no_switch_fast);
+ WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
event->pending_addr = 0;
if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
@@ -11032,7 +11136,7 @@ static int perf_uprobe_event_init(struct perf_event *event)
if (event->attr.type != perf_uprobe.type)
return -ENOENT;
- if (!perfmon_capable())
+ if (!capable(CAP_SYS_ADMIN))
return -EACCES;
/*
@@ -11088,11 +11192,15 @@ static inline bool perf_event_is_tracing(struct perf_event *event)
return false;
}
-int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
- u64 bpf_cookie)
+static int __perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
{
bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
if (!perf_event_is_tracing(event))
return perf_event_set_bpf_handler(event, prog, bpf_cookie);
@@ -11127,6 +11235,20 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
return perf_event_attach_bpf_prog(event, prog, bpf_cookie);
}
+int perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
+{
+ struct perf_event_context *ctx;
+ int ret;
+
+ ctx = perf_event_ctx_lock(event);
+ ret = __perf_event_set_bpf_prog(event, prog, bpf_cookie);
+ perf_event_ctx_unlock(event, ctx);
+
+ return ret;
+}
+
void perf_event_free_bpf_prog(struct perf_event *event)
{
if (!event->prog)
@@ -11149,7 +11271,15 @@ static void perf_event_free_filter(struct perf_event *event)
{
}
-int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
+static int __perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
+{
+ return -ENOENT;
+}
+
+int perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
u64 bpf_cookie)
{
return -ENOENT;
@@ -11664,7 +11794,12 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- if (is_sampling_event(event)) {
+ /*
+ * The throttle can be triggered in the hrtimer handler.
+ * The HRTIMER_NORESTART should be used to stop the timer,
+ * rather than hrtimer_cancel(). See perf_swevent_hrtimer()
+ */
+ if (is_sampling_event(event) && (hwc->interrupts != MAX_INTERRUPTS)) {
ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
local64_set(&hwc->period_left, ktime_to_ns(remaining));
@@ -11719,7 +11854,8 @@ static void cpu_clock_event_start(struct perf_event *event, int flags)
static void cpu_clock_event_stop(struct perf_event *event, int flags)
{
perf_swevent_cancel_hrtimer(event);
- cpu_clock_event_update(event);
+ if (flags & PERF_EF_UPDATE)
+ cpu_clock_event_update(event);
}
static int cpu_clock_event_add(struct perf_event *event, int flags)
@@ -11797,7 +11933,8 @@ static void task_clock_event_start(struct perf_event *event, int flags)
static void task_clock_event_stop(struct perf_event *event, int flags)
{
perf_swevent_cancel_hrtimer(event);
- task_clock_event_update(event, event->ctx->time);
+ if (flags & PERF_EF_UPDATE)
+ task_clock_event_update(event, event->ctx->time);
}
static int task_clock_event_add(struct perf_event *event, int flags)
@@ -12254,6 +12391,9 @@ int perf_pmu_register(struct pmu *_pmu, const char *name, int type)
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;
+ INIT_LIST_HEAD(&pmu->events);
+ spin_lock_init(&pmu->events_lock);
+
/*
* Now that the PMU is complete, make it visible to perf_try_init_event().
*/
@@ -12267,21 +12407,143 @@ int perf_pmu_register(struct pmu *_pmu, const char *name, int type)
}
EXPORT_SYMBOL_GPL(perf_pmu_register);
-void perf_pmu_unregister(struct pmu *pmu)
+static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ /*
+ * De-schedule the event and mark it REVOKED.
+ */
+ perf_event_exit_event(event, ctx, true);
+
+ /*
+ * All _free_event() bits that rely on event->pmu:
+ *
+ * Notably, perf_mmap() relies on the ordering here.
+ */
+ scoped_guard (mutex, &event->mmap_mutex) {
+ WARN_ON_ONCE(pmu->event_unmapped);
+ /*
+ * Mostly an empty lock sequence, such that perf_mmap(), which
+ * relies on mmap_mutex, is sure to observe the state change.
+ */
+ }
+
+ perf_event_free_bpf_prog(event);
+ perf_free_addr_filters(event);
+
+ if (event->destroy) {
+ event->destroy(event);
+ event->destroy = NULL;
+ }
+
+ if (event->pmu_ctx) {
+ put_pmu_ctx(event->pmu_ctx);
+ event->pmu_ctx = NULL;
+ }
+
+ exclusive_event_destroy(event);
+ module_put(pmu->module);
+
+ event->pmu = NULL; /* force fault instead of UAF */
+}
+
+static void pmu_detach_event(struct pmu *pmu, struct perf_event *event)
+{
+ struct perf_event_context *ctx;
+
+ ctx = perf_event_ctx_lock(event);
+ __pmu_detach_event(pmu, event, ctx);
+ perf_event_ctx_unlock(event, ctx);
+
+ scoped_guard (spinlock, &pmu->events_lock)
+ list_del(&event->pmu_list);
+}
+
+static struct perf_event *pmu_get_event(struct pmu *pmu)
+{
+ struct perf_event *event;
+
+ guard(spinlock)(&pmu->events_lock);
+ list_for_each_entry(event, &pmu->events, pmu_list) {
+ if (atomic_long_inc_not_zero(&event->refcount))
+ return event;
+ }
+
+ return NULL;
+}
+
+static bool pmu_empty(struct pmu *pmu)
+{
+ guard(spinlock)(&pmu->events_lock);
+ return list_empty(&pmu->events);
+}
+
+static void pmu_detach_events(struct pmu *pmu)
+{
+ struct perf_event *event;
+
+ for (;;) {
+ event = pmu_get_event(pmu);
+ if (!event)
+ break;
+
+ pmu_detach_event(pmu, event);
+ put_event(event);
+ }
+
+ /*
+ * wait for pending _free_event()s
+ */
+ wait_var_event(pmu, pmu_empty(pmu));
+}
+
+int perf_pmu_unregister(struct pmu *pmu)
{
scoped_guard (mutex, &pmus_lock) {
+ if (!idr_cmpxchg(&pmu_idr, pmu->type, pmu, NULL))
+ return -EINVAL;
+
list_del_rcu(&pmu->entry);
- idr_remove(&pmu_idr, pmu->type);
}
/*
* We dereference the pmu list under both SRCU and regular RCU, so
* synchronize against both of those.
+ *
+ * Notably, the entirety of event creation, from perf_init_event()
+ * (which will now fail, because of the above) until
+ * perf_install_in_context() should be under SRCU such that
+ * this synchronizes against event creation. This avoids trying to
+ * detach events that are not fully formed.
*/
synchronize_srcu(&pmus_srcu);
synchronize_rcu();
+ if (pmu->event_unmapped && !pmu_empty(pmu)) {
+ /*
+ * Can't force remove events when pmu::event_unmapped()
+ * is used in perf_mmap_close().
+ */
+ guard(mutex)(&pmus_lock);
+ idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu);
+ list_add_rcu(&pmu->entry, &pmus);
+ return -EBUSY;
+ }
+
+ scoped_guard (mutex, &pmus_lock)
+ idr_remove(&pmu_idr, pmu->type);
+
+ /*
+ * PMU is removed from the pmus list, so no new events will
+ * be created, now take care of the existing ones.
+ */
+ pmu_detach_events(pmu);
+
+ /*
+ * PMU is unused, make it go away.
+ */
perf_pmu_free(pmu);
+ return 0;
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
@@ -12375,7 +12637,7 @@ static struct pmu *perf_init_event(struct perf_event *event)
struct pmu *pmu;
int type, ret;
- guard(srcu)(&pmus_srcu);
+ guard(srcu)(&pmus_srcu); /* pmu idr/list access */
/*
* Save original type before calling pmu->event_init() since certain
@@ -12599,13 +12861,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
INIT_LIST_HEAD(&event->active_entry);
INIT_LIST_HEAD(&event->addr_filters.list);
INIT_HLIST_NODE(&event->hlist_entry);
+ INIT_LIST_HEAD(&event->pmu_list);
init_waitqueue_head(&event->waitq);
init_irq_work(&event->pending_irq, perf_pending_irq);
event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable);
init_task_work(&event->pending_task, perf_pending_task);
- rcuwait_init(&event->pending_work_wait);
mutex_init(&event->mmap_mutex);
raw_spin_lock_init(&event->addr_filters.lock);
@@ -12671,7 +12933,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
hwc = &event->hw;
hwc->sample_period = attr->sample_period;
- if (attr->freq && attr->sample_freq)
+ if (is_event_in_freq_mode(event))
hwc->sample_period = 1;
hwc->last_period = hwc->sample_period;
@@ -12778,6 +13040,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
/* symmetric to unaccount_event() in _free_event() */
account_event(event);
+ /*
+ * Event creation should be under SRCU, see perf_pmu_unregister().
+ */
+ lockdep_assert_held(&pmus_srcu);
+ scoped_guard (spinlock, &pmu->events_lock)
+ list_add(&event->pmu_list, &pmu->events);
+
return_ptr(event);
}
@@ -12977,6 +13246,9 @@ set:
goto unlock;
if (output_event) {
+ if (output_event->state <= PERF_EVENT_STATE_REVOKED)
+ goto unlock;
+
/* get the rb we want to redirect to */
rb = ring_buffer_get(output_event);
if (!rb)
@@ -13158,6 +13430,11 @@ SYSCALL_DEFINE5(perf_event_open,
if (event_fd < 0)
return event_fd;
+ /*
+ * Event creation should be under SRCU, see perf_pmu_unregister().
+ */
+ guard(srcu)(&pmus_srcu);
+
CLASS(fd, group)(group_fd); // group_fd == -1 => empty
if (group_fd != -1) {
if (!is_perf_file(group)) {
@@ -13165,6 +13442,10 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_fd;
}
group_leader = fd_file(group)->private_data;
+ if (group_leader->state <= PERF_EVENT_STATE_REVOKED) {
+ err = -ENODEV;
+ goto err_fd;
+ }
if (flags & PERF_FLAG_FD_OUTPUT)
output_event = group_leader;
if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -13461,7 +13742,7 @@ err_cred:
if (task)
up_read(&task->signal->exec_update_lock);
err_alloc:
- free_event(event);
+ put_event(event);
err_task:
if (task)
put_task_struct(task);
@@ -13498,6 +13779,11 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
if (attr->aux_output || attr->aux_action)
return ERR_PTR(-EINVAL);
+ /*
+ * Event creation should be under SRCU, see perf_pmu_unregister().
+ */
+ guard(srcu)(&pmus_srcu);
+
event = perf_event_alloc(attr, cpu, task, NULL, NULL,
overflow_handler, context, -1);
if (IS_ERR(event)) {
@@ -13569,7 +13855,7 @@ err_unlock:
perf_unpin_context(ctx);
put_ctx(ctx);
err_alloc:
- free_event(event);
+ put_event(event);
err:
return ERR_PTR(err);
}
@@ -13709,10 +13995,12 @@ static void sync_child_event(struct perf_event *child_event)
}
static void
-perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
+perf_event_exit_event(struct perf_event *event,
+ struct perf_event_context *ctx, bool revoke)
{
struct perf_event *parent_event = event->parent;
- unsigned long detach_flags = 0;
+ unsigned long detach_flags = DETACH_EXIT;
+ unsigned int attach_state;
if (parent_event) {
/*
@@ -13727,28 +14015,38 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
* Do destroy all inherited groups, we don't care about those
* and being thorough is better.
*/
- detach_flags = DETACH_GROUP | DETACH_CHILD;
+ detach_flags |= DETACH_GROUP | DETACH_CHILD;
mutex_lock(&parent_event->child_mutex);
+ /* PERF_ATTACH_ITRACE might be set concurrently */
+ attach_state = READ_ONCE(event->attach_state);
}
- perf_remove_from_context(event, detach_flags);
-
- raw_spin_lock_irq(&ctx->lock);
- if (event->state > PERF_EVENT_STATE_EXIT)
- perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
- raw_spin_unlock_irq(&ctx->lock);
+ if (revoke)
+ detach_flags |= DETACH_GROUP | DETACH_REVOKE;
+ perf_remove_from_context(event, detach_flags);
/*
* Child events can be freed.
*/
if (parent_event) {
mutex_unlock(&parent_event->child_mutex);
+
/*
- * Kick perf_poll() for is_event_hup();
+ * Match the refcount initialization. Make sure it doesn't happen
+ * twice if pmu_detach_event() calls it on an already exited task.
*/
- perf_event_wakeup(parent_event);
- free_event(event);
- put_event(parent_event);
+ if (attach_state & PERF_ATTACH_CHILD) {
+ /*
+ * Kick perf_poll() for is_event_hup();
+ */
+ perf_event_wakeup(parent_event);
+ /*
+ * pmu_detach_event() will have an extra refcount.
+ * perf_pending_task() might have one too.
+ */
+ put_event(event);
+ }
+
return;
}
@@ -13758,15 +14056,13 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
perf_event_wakeup(event);
}
-static void perf_event_exit_task_context(struct task_struct *child)
+static void perf_event_exit_task_context(struct task_struct *task, bool exit)
{
- struct perf_event_context *child_ctx, *clone_ctx = NULL;
+ struct perf_event_context *ctx, *clone_ctx = NULL;
struct perf_event *child_event, *next;
- WARN_ON_ONCE(child != current);
-
- child_ctx = perf_pin_task_context(child);
- if (!child_ctx)
+ ctx = perf_pin_task_context(task);
+ if (!ctx)
return;
/*
@@ -13779,27 +14075,28 @@ static void perf_event_exit_task_context(struct task_struct *child)
* without ctx::mutex (it cannot because of the move_group double mutex
* lock thing). See the comments in perf_install_in_context().
*/
- mutex_lock(&child_ctx->mutex);
+ mutex_lock(&ctx->mutex);
/*
* In a single ctx::lock section, de-schedule the events and detach the
* context from the task such that we cannot ever get it scheduled back
* in.
*/
- raw_spin_lock_irq(&child_ctx->lock);
- task_ctx_sched_out(child_ctx, NULL, EVENT_ALL);
+ raw_spin_lock_irq(&ctx->lock);
+ if (exit)
+ task_ctx_sched_out(ctx, NULL, EVENT_ALL);
/*
* Now that the context is inactive, destroy the task <-> ctx relation
* and mark the context dead.
*/
- RCU_INIT_POINTER(child->perf_event_ctxp, NULL);
- put_ctx(child_ctx); /* cannot be last */
- WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
- put_task_struct(current); /* cannot be last */
+ RCU_INIT_POINTER(task->perf_event_ctxp, NULL);
+ put_ctx(ctx); /* cannot be last */
+ WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
+ put_task_struct(task); /* cannot be last */
- clone_ctx = unclone_ctx(child_ctx);
- raw_spin_unlock_irq(&child_ctx->lock);
+ clone_ctx = unclone_ctx(ctx);
+ raw_spin_unlock_irq(&ctx->lock);
if (clone_ctx)
put_ctx(clone_ctx);
@@ -13809,28 +14106,48 @@ static void perf_event_exit_task_context(struct task_struct *child)
* won't get any samples after PERF_RECORD_EXIT. We can however still
* get a few PERF_RECORD_READ events.
*/
- perf_event_task(child, child_ctx, 0);
+ if (exit)
+ perf_event_task(task, ctx, 0);
- list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
- perf_event_exit_event(child_event, child_ctx);
+ list_for_each_entry_safe(child_event, next, &ctx->event_list, event_entry)
+ perf_event_exit_event(child_event, ctx, false);
- mutex_unlock(&child_ctx->mutex);
+ mutex_unlock(&ctx->mutex);
- put_ctx(child_ctx);
+ if (!exit) {
+ /*
+ * perf_event_release_kernel() could still have a reference on
+ * this context. In that case we must wait for these events to
+ * have been freed (in particular all their references to this
+ * task must've been dropped).
+ *
+ * Without this copy_process() will unconditionally free this
+ * task (irrespective of its reference count) and
+ * _free_event()'s put_task_struct(event->hw.target) will be a
+ * use-after-free.
+ *
+ * Wait for all events to drop their context reference.
+ */
+ wait_var_event(&ctx->refcount,
+ refcount_read(&ctx->refcount) == 1);
+ }
+ put_ctx(ctx);
}
/*
- * When a child task exits, feed back event values to parent events.
+ * When a task exits, feed back event values to parent events.
*
* Can be called with exec_update_lock held when called from
* setup_new_exec().
*/
-void perf_event_exit_task(struct task_struct *child)
+void perf_event_exit_task(struct task_struct *task)
{
struct perf_event *event, *tmp;
- mutex_lock(&child->perf_event_mutex);
- list_for_each_entry_safe(event, tmp, &child->perf_event_list,
+ WARN_ON_ONCE(task != current);
+
+ mutex_lock(&task->perf_event_mutex);
+ list_for_each_entry_safe(event, tmp, &task->perf_event_list,
owner_entry) {
list_del_init(&event->owner_entry);
@@ -13841,44 +14158,23 @@ void perf_event_exit_task(struct task_struct *child)
*/
smp_store_release(&event->owner, NULL);
}
- mutex_unlock(&child->perf_event_mutex);
+ mutex_unlock(&task->perf_event_mutex);
- perf_event_exit_task_context(child);
+ perf_event_exit_task_context(task, true);
/*
* The perf_event_exit_task_context calls perf_event_task
- * with child's task_ctx, which generates EXIT events for
- * child contexts and sets child->perf_event_ctxp[] to NULL.
+ * with task's task_ctx, which generates EXIT events for
+ * task contexts and sets task->perf_event_ctxp[] to NULL.
* At this point we need to send EXIT events to cpu contexts.
*/
- perf_event_task(child, NULL, 0);
+ perf_event_task(task, NULL, 0);
/*
* Detach the perf_ctx_data for the system-wide event.
*/
guard(percpu_read)(&global_ctx_data_rwsem);
- detach_task_ctx_data(child);
-}
-
-static void perf_free_event(struct perf_event *event,
- struct perf_event_context *ctx)
-{
- struct perf_event *parent = event->parent;
-
- if (WARN_ON_ONCE(!parent))
- return;
-
- mutex_lock(&parent->child_mutex);
- list_del_init(&event->child_list);
- mutex_unlock(&parent->child_mutex);
-
- put_event(parent);
-
- raw_spin_lock_irq(&ctx->lock);
- perf_group_detach(event);
- list_del_event(event, ctx);
- raw_spin_unlock_irq(&ctx->lock);
- free_event(event);
+ detach_task_ctx_data(task);
}
/*
@@ -13890,48 +14186,7 @@ static void perf_free_event(struct perf_event *event,
*/
void perf_event_free_task(struct task_struct *task)
{
- struct perf_event_context *ctx;
- struct perf_event *event, *tmp;
-
- ctx = rcu_access_pointer(task->perf_event_ctxp);
- if (!ctx)
- return;
-
- mutex_lock(&ctx->mutex);
- raw_spin_lock_irq(&ctx->lock);
- /*
- * Destroy the task <-> ctx relation and mark the context dead.
- *
- * This is important because even though the task hasn't been
- * exposed yet the context has been (through child_list).
- */
- RCU_INIT_POINTER(task->perf_event_ctxp, NULL);
- WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
- put_task_struct(task); /* cannot be last */
- raw_spin_unlock_irq(&ctx->lock);
-
-
- list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
- perf_free_event(event, ctx);
-
- mutex_unlock(&ctx->mutex);
-
- /*
- * perf_event_release_kernel() could've stolen some of our
- * child events and still have them on its free_list. In that
- * case we must wait for these events to have been freed (in
- * particular all their references to this task must've been
- * dropped).
- *
- * Without this copy_process() will unconditionally free this
- * task (irrespective of its reference count) and
- * _free_event()'s put_task_struct(event->hw.target) will be a
- * use-after-free.
- *
- * Wait for all events to drop their context reference.
- */
- wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
- put_ctx(ctx); /* must be last */
+ perf_event_exit_task_context(task, false);
}
void perf_event_delayed_put(struct task_struct *task)
@@ -14008,6 +14263,14 @@ inherit_event(struct perf_event *parent_event,
if (parent_event->parent)
parent_event = parent_event->parent;
+ if (parent_event->state <= PERF_EVENT_STATE_REVOKED)
+ return NULL;
+
+ /*
+ * Event creation should be under SRCU, see perf_pmu_unregister().
+ */
+ guard(srcu)(&pmus_srcu);
+
child_event = perf_event_alloc(&parent_event->attr,
parent_event->cpu,
child,
@@ -14016,6 +14279,9 @@ inherit_event(struct perf_event *parent_event,
if (IS_ERR(child_event))
return child_event;
+ get_ctx(child_ctx);
+ child_event->ctx = child_ctx;
+
pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event);
if (IS_ERR(pmu_ctx)) {
free_event(child_event);
@@ -14037,8 +14303,6 @@ inherit_event(struct perf_event *parent_event,
return NULL;
}
- get_ctx(child_ctx);
-
/*
* Make the child state follow the state of the parent event,
* not its attr.disabled bit. We hold the parent's mutex,
@@ -14059,7 +14323,6 @@ inherit_event(struct perf_event *parent_event,
local64_set(&hwc->period_left, sample_period);
}
- child_event->ctx = child_ctx;
child_event->overflow_handler = parent_event->overflow_handler;
child_event->overflow_handler_context
= parent_event->overflow_handler_context;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 5130b119d0ae..aa9a759e824f 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -441,7 +441,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
* store that will be enabled on successful return
*/
if (!handle->size) { /* A, matches D */
- event->pending_disable = smp_processor_id();
+ perf_event_disable_inatomic(handle->event);
perf_output_wakeup(handle);
WRITE_ONCE(rb->aux_nest, 0);
goto err_put;
@@ -526,7 +526,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
if (wakeup) {
if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
- handle->event->pending_disable = smp_processor_id();
+ perf_event_disable_inatomic(handle->event);
perf_output_wakeup(handle);
}
@@ -679,7 +679,15 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
{
bool overwrite = !(flags & RING_BUFFER_WRITABLE);
int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
- int ret = -ENOMEM, max_order;
+ bool use_contiguous_pages = event->pmu->capabilities & (
+ PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_PREFER_LARGE);
+ /*
+ * Initialize max_order to 0 for page allocation. This allocates single
+ * pages to minimize memory fragmentation. This is overridden if the
+ * PMU needs or prefers contiguous pages (use_contiguous_pages = true).
+ */
+ int max_order = 0;
+ int ret = -ENOMEM;
if (!has_aux(event))
return -EOPNOTSUPP;
@@ -689,8 +697,8 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
if (!overwrite) {
/*
- * Watermark defaults to half the buffer, and so does the
- * max_order, to aid PMU drivers in double buffering.
+ * Watermark defaults to half the buffer, to aid PMU drivers
+ * in double buffering.
*/
if (!watermark)
watermark = min_t(unsigned long,
@@ -698,16 +706,19 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
(unsigned long)nr_pages << (PAGE_SHIFT - 1));
/*
- * Use aux_watermark as the basis for chunking to
- * help PMU drivers honor the watermark.
+ * If using contiguous pages, use aux_watermark as the basis
+ * for chunking to help PMU drivers honor the watermark.
*/
- max_order = get_order(watermark);
+ if (use_contiguous_pages)
+ max_order = get_order(watermark);
} else {
/*
- * We need to start with the max_order that fits in nr_pages,
- * not the other way around, hence ilog2() and not get_order.
+ * If using contiguous pages, we need to start with the
+ * max_order that fits in nr_pages, not the other way around,
+ * hence ilog2() and not get_order.
*/
- max_order = ilog2(nr_pages);
+ if (use_contiguous_pages)
+ max_order = ilog2(nr_pages);
watermark = 0;
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 2746791ce1e2..7ca1940607bd 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -29,6 +29,7 @@
#include <linux/workqueue.h>
#include <linux/srcu.h>
#include <linux/oom.h> /* check_stable_address_space */
+#include <linux/pagewalk.h>
#include <linux/uprobes.h>
@@ -152,91 +153,6 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
}
/**
- * __replace_page - replace page in vma by new page.
- * based on replace_page in mm/ksm.c
- *
- * @vma: vma that holds the pte pointing to page
- * @addr: address the old @page is mapped at
- * @old_page: the page we are replacing by new_page
- * @new_page: the modified page we replace page by
- *
- * If @new_page is NULL, only unmap @old_page.
- *
- * Returns 0 on success, negative error code otherwise.
- */
-static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
- struct page *old_page, struct page *new_page)
-{
- struct folio *old_folio = page_folio(old_page);
- struct folio *new_folio;
- struct mm_struct *mm = vma->vm_mm;
- DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0);
- int err;
- struct mmu_notifier_range range;
- pte_t pte;
-
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
- addr + PAGE_SIZE);
-
- if (new_page) {
- new_folio = page_folio(new_page);
- err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL);
- if (err)
- return err;
- }
-
- /* For folio_free_swap() below */
- folio_lock(old_folio);
-
- mmu_notifier_invalidate_range_start(&range);
- err = -EAGAIN;
- if (!page_vma_mapped_walk(&pvmw))
- goto unlock;
- VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
- pte = ptep_get(pvmw.pte);
-
- /*
- * Handle PFN swap PTES, such as device-exclusive ones, that actually
- * map pages: simply trigger GUP again to fix it up.
- */
- if (unlikely(!pte_present(pte))) {
- page_vma_mapped_walk_done(&pvmw);
- goto unlock;
- }
-
- if (new_page) {
- folio_get(new_folio);
- folio_add_new_anon_rmap(new_folio, vma, addr, RMAP_EXCLUSIVE);
- folio_add_lru_vma(new_folio, vma);
- } else
- /* no new page, just dec_mm_counter for old_page */
- dec_mm_counter(mm, MM_ANONPAGES);
-
- if (!folio_test_anon(old_folio)) {
- dec_mm_counter(mm, mm_counter_file(old_folio));
- inc_mm_counter(mm, MM_ANONPAGES);
- }
-
- flush_cache_page(vma, addr, pte_pfn(pte));
- ptep_clear_flush(vma, addr, pvmw.pte);
- if (new_page)
- set_pte_at(mm, addr, pvmw.pte,
- mk_pte(new_page, vma->vm_page_prot));
-
- folio_remove_rmap_pte(old_folio, old_page, vma);
- if (!folio_mapped(old_folio))
- folio_free_swap(old_folio);
- page_vma_mapped_walk_done(&pvmw);
- folio_put(old_folio);
-
- err = 0;
- unlock:
- mmu_notifier_invalidate_range_end(&range);
- folio_unlock(old_folio);
- return err;
-}
-
-/**
* is_swbp_insn - check if instruction is breakpoint instruction.
* @insn: instruction to be checked.
* Default implementation of is_swbp_insn
@@ -463,6 +379,94 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
return ret;
}
+static bool orig_page_is_identical(struct vm_area_struct *vma,
+ unsigned long vaddr, struct page *page, bool *pmd_mappable)
+{
+ const pgoff_t index = vaddr_to_offset(vma, vaddr) >> PAGE_SHIFT;
+ struct folio *orig_folio = filemap_get_folio(vma->vm_file->f_mapping,
+ index);
+ struct page *orig_page;
+ bool identical;
+
+ if (IS_ERR(orig_folio))
+ return false;
+ orig_page = folio_file_page(orig_folio, index);
+
+ *pmd_mappable = folio_test_pmd_mappable(orig_folio);
+ identical = folio_test_uptodate(orig_folio) &&
+ pages_identical(page, orig_page);
+ folio_put(orig_folio);
+ return identical;
+}
+
+static int __uprobe_write_opcode(struct vm_area_struct *vma,
+ struct folio_walk *fw, struct folio *folio,
+ unsigned long opcode_vaddr, uprobe_opcode_t opcode)
+{
+ const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
+ const bool is_register = !!is_swbp_insn(&opcode);
+ bool pmd_mappable;
+
+ /* For now, we'll only handle PTE-mapped folios. */
+ if (fw->level != FW_LEVEL_PTE)
+ return -EFAULT;
+
+ /*
+ * See can_follow_write_pte(): we'd actually prefer a writable PTE here,
+ * but the VMA might not be writable.
+ */
+ if (!pte_write(fw->pte)) {
+ if (!PageAnonExclusive(fw->page))
+ return -EFAULT;
+ if (unlikely(userfaultfd_pte_wp(vma, fw->pte)))
+ return -EFAULT;
+ /* SOFTDIRTY is handled via pte_mkdirty() below. */
+ }
+
+ /*
+ * We'll temporarily unmap the page and flush the TLB, such that we can
+ * modify the page atomically.
+ */
+ flush_cache_page(vma, vaddr, pte_pfn(fw->pte));
+ fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep);
+ copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+
+ /*
+ * When unregistering, we may only zap a PTE if uffd is disabled and
+ * there are no unexpected folio references ...
+ */
+ if (is_register || userfaultfd_missing(vma) ||
+ (folio_ref_count(folio) != folio_expected_ref_count(folio) + 1))
+ goto remap;
+
+ /*
+ * ... and the mapped page is identical to the original page that
+ * would get faulted in on next access.
+ */
+ if (!orig_page_is_identical(vma, vaddr, fw->page, &pmd_mappable))
+ goto remap;
+
+ dec_mm_counter(vma->vm_mm, MM_ANONPAGES);
+ folio_remove_rmap_pte(folio, fw->page, vma);
+ if (!folio_mapped(folio) && folio_test_swapcache(folio) &&
+ folio_trylock(folio)) {
+ folio_free_swap(folio);
+ folio_unlock(folio);
+ }
+ folio_put(folio);
+
+ return pmd_mappable;
+remap:
+ /*
+ * Make sure that our copy_to_page() changes become visible before the
+ * set_pte_at() write.
+ */
+ smp_wmb();
+ /* We modified the page. Make sure to mark the PTE dirty. */
+ set_pte_at(vma->vm_mm, vaddr, fw->ptep, pte_mkdirty(fw->pte));
+ return 0;
+}
+
/*
* NOTE:
* Expect the breakpoint instruction to be the smallest size instruction for
@@ -474,146 +478,146 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
*
* uprobe_write_opcode - write the opcode at a given virtual address.
* @auprobe: arch specific probepoint information.
- * @mm: the probed process address space.
- * @vaddr: the virtual address to store the opcode.
- * @opcode: opcode to be written at @vaddr.
+ * @vma: the probed virtual memory area.
+ * @opcode_vaddr: the virtual address to store the opcode.
+ * @opcode: opcode to be written at @opcode_vaddr.
*
* Called with mm->mmap_lock held for read or write.
* Return 0 (success) or a negative errno.
*/
-int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
- unsigned long vaddr, uprobe_opcode_t opcode)
+int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ const unsigned long opcode_vaddr, uprobe_opcode_t opcode)
{
+ const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
+ struct mm_struct *mm = vma->vm_mm;
struct uprobe *uprobe;
- struct page *old_page, *new_page;
- struct vm_area_struct *vma;
int ret, is_register, ref_ctr_updated = 0;
- bool orig_page_huge = false;
unsigned int gup_flags = FOLL_FORCE;
+ struct mmu_notifier_range range;
+ struct folio_walk fw;
+ struct folio *folio;
+ struct page *page;
is_register = is_swbp_insn(&opcode);
uprobe = container_of(auprobe, struct uprobe, arch);
-retry:
+ if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
+ return -EINVAL;
+
+ /*
+ * When registering, we have to break COW to get an exclusive anonymous
+ * page that we can safely modify. Use FOLL_WRITE to trigger a write
+ * fault if required. When unregistering, we might be lucky and the
+ * anon page is already gone. So defer write faults until really
+ * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode()
+ * cannot deal with PMDs yet.
+ */
if (is_register)
- gup_flags |= FOLL_SPLIT_PMD;
- /* Read the page with vaddr into memory */
- old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
- if (IS_ERR(old_page))
- return PTR_ERR(old_page);
+ gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD;
- ret = verify_opcode(old_page, vaddr, &opcode);
+retry:
+ ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &page, NULL);
if (ret <= 0)
- goto put_old;
-
- if (is_zero_page(old_page)) {
- ret = -EINVAL;
- goto put_old;
- }
+ goto out;
+ folio = page_folio(page);
- if (WARN(!is_register && PageCompound(old_page),
- "uprobe unregister should never work on compound page\n")) {
- ret = -EINVAL;
- goto put_old;
+ ret = verify_opcode(page, opcode_vaddr, &opcode);
+ if (ret <= 0) {
+ folio_put(folio);
+ goto out;
}
/* We are going to replace instruction, update ref_ctr. */
if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
- if (ret)
- goto put_old;
+ if (ret) {
+ folio_put(folio);
+ goto out;
+ }
ref_ctr_updated = 1;
}
ret = 0;
- if (!is_register && !PageAnon(old_page))
- goto put_old;
-
- ret = anon_vma_prepare(vma);
- if (ret)
- goto put_old;
-
- ret = -ENOMEM;
- new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
- if (!new_page)
- goto put_old;
-
- __SetPageUptodate(new_page);
- copy_highpage(new_page, old_page);
- copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ if (unlikely(!folio_test_anon(folio) || folio_is_zone_device(folio))) {
+ VM_WARN_ON_ONCE(is_register);
+ folio_put(folio);
+ goto out;
+ }
if (!is_register) {
- struct page *orig_page;
- pgoff_t index;
-
- VM_BUG_ON_PAGE(!PageAnon(old_page), old_page);
-
- index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
- orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
- index);
-
- if (orig_page) {
- if (PageUptodate(orig_page) &&
- pages_identical(new_page, orig_page)) {
- /* let go new_page */
- put_page(new_page);
- new_page = NULL;
-
- if (PageCompound(orig_page))
- orig_page_huge = true;
- }
- put_page(orig_page);
- }
+ /*
+ * In the common case, we'll be able to zap the page when
+ * unregistering. So trigger MMU notifiers now, as we won't
+ * be able to do it under PTL.
+ */
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+ vaddr, vaddr + PAGE_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+ }
+
+ ret = -EAGAIN;
+ /* Walk the page tables again, to perform the actual update. */
+ if (folio_walk_start(&fw, vma, vaddr, 0)) {
+ if (fw.page == page)
+ ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode);
+ folio_walk_end(&fw, vma);
}
- ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page);
- if (new_page)
- put_page(new_page);
-put_old:
- put_page(old_page);
+ if (!is_register)
+ mmu_notifier_invalidate_range_end(&range);
- if (unlikely(ret == -EAGAIN))
+ folio_put(folio);
+ switch (ret) {
+ case -EFAULT:
+ gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD;
+ fallthrough;
+ case -EAGAIN:
goto retry;
+ default:
+ break;
+ }
+out:
/* Revert back reference counter if instruction update failed. */
- if (ret && is_register && ref_ctr_updated)
- update_ref_ctr(uprobe, mm, -1);
+ if (ret < 0 && ref_ctr_updated)
+ update_ref_ctr(uprobe, mm, is_register ? -1 : 1);
/* try collapse pmd for compound page */
- if (!ret && orig_page_huge)
+ if (ret > 0)
collapse_pte_mapped_thp(mm, vaddr, false);
- return ret;
+ return ret < 0 ? ret : 0;
}
/**
* set_swbp - store breakpoint at a given address.
* @auprobe: arch specific probepoint information.
- * @mm: the probed process address space.
+ * @vma: the probed virtual memory area.
* @vaddr: the virtual address to insert the opcode.
*
* For mm @mm, store the breakpoint instruction at @vaddr.
* Return 0 (success) or a negative errno.
*/
-int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
+int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
{
- return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN);
+ return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN);
}
/**
* set_orig_insn - Restore the original instruction.
- * @mm: the probed process address space.
+ * @vma: the probed virtual memory area.
* @auprobe: arch specific probepoint information.
* @vaddr: the virtual address to insert the opcode.
*
* For mm @mm, restore the original opcode (opcode) at @vaddr.
* Return 0 (success) or a negative errno.
*/
-int __weak
-set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
+int __weak set_orig_insn(struct arch_uprobe *auprobe,
+ struct vm_area_struct *vma, unsigned long vaddr)
{
- return uprobe_write_opcode(auprobe, mm, vaddr,
+ return uprobe_write_opcode(auprobe, vma, vaddr,
*(uprobe_opcode_t *)&auprobe->insn);
}
@@ -1134,10 +1138,10 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
return ret;
}
-static int
-install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long vaddr)
+static int install_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
{
+ struct mm_struct *mm = vma->vm_mm;
bool first_uprobe;
int ret;
@@ -1153,7 +1157,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
if (first_uprobe)
set_bit(MMF_HAS_UPROBES, &mm->flags);
- ret = set_swbp(&uprobe->arch, mm, vaddr);
+ ret = set_swbp(&uprobe->arch, vma, vaddr);
if (!ret)
clear_bit(MMF_RECALC_UPROBES, &mm->flags);
else if (first_uprobe)
@@ -1162,11 +1166,13 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
return ret;
}
-static int
-remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
+static int remove_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
{
+ struct mm_struct *mm = vma->vm_mm;
+
set_bit(MMF_RECALC_UPROBES, &mm->flags);
- return set_orig_insn(&uprobe->arch, mm, vaddr);
+ return set_orig_insn(&uprobe->arch, vma, vaddr);
}
struct map_info {
@@ -1296,10 +1302,10 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
if (is_register) {
/* consult only the "caller", new consumer. */
if (consumer_filter(new, mm))
- err = install_breakpoint(uprobe, mm, vma, info->vaddr);
+ err = install_breakpoint(uprobe, vma, info->vaddr);
} else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
if (!filter_chain(uprobe, mm))
- err |= remove_breakpoint(uprobe, mm, info->vaddr);
+ err |= remove_breakpoint(uprobe, vma, info->vaddr);
}
unlock:
@@ -1472,7 +1478,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
continue;
vaddr = offset_to_vaddr(vma, uprobe->offset);
- err |= remove_breakpoint(uprobe, mm, vaddr);
+ err |= remove_breakpoint(uprobe, vma, vaddr);
}
mmap_read_unlock(mm);
@@ -1610,7 +1616,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
if (!fatal_signal_pending(current) &&
filter_chain(uprobe, vma->vm_mm)) {
unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
- install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
+ install_breakpoint(uprobe, vma, vaddr);
}
put_uprobe(uprobe);
}
@@ -1703,7 +1709,8 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
}
vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
- VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO,
+ VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO|
+ VM_SEALED_SYSMAP,
&xol_mapping);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
@@ -1955,6 +1962,9 @@ static void free_ret_instance(struct uprobe_task *utask,
* to-be-reused return instances for future uretprobes. If ri_timer()
* happens to be running right now, though, we fallback to safety and
* just perform RCU-delated freeing of ri.
+ * Admittedly, this is a rather simple use of seqcount, but it nicely
+ * abstracts away all the necessary memory barriers, so we use
+ * a well-supported kernel primitive here.
*/
if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) {
/* immediate reuse of ri without RCU GP is OK */
@@ -2015,12 +2025,20 @@ static void ri_timer(struct timer_list *timer)
/* RCU protects return_instance from freeing. */
guard(rcu)();
- write_seqcount_begin(&utask->ri_seqcount);
+ /*
+ * See free_ret_instance() for notes on seqcount use.
+ * We also employ raw API variants to avoid lockdep false-positive
+ * warning complaining about enabled preemption. The timer can only be
+ * invoked once for a uprobe_task. Therefore there can only be one
+ * writer. The reader does not require an even sequence count to make
+ * progress, so it is OK to remain preemptible on PREEMPT_RT.
+ */
+ raw_write_seqcount_begin(&utask->ri_seqcount);
for_each_ret_instance_rcu(ri, utask->return_instances)
hprobe_expire(&ri->hprobe, false);
- write_seqcount_end(&utask->ri_seqcount);
+ raw_write_seqcount_end(&utask->ri_seqcount);
}
static struct uprobe_task *alloc_utask(void)