From 06d0713904e508f765e0d7146c14b67bbd248fe7 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 27 Jun 2014 19:01:40 +0200
Subject: uprobes: Change unregister/apply to WARN() if uprobe/consumer is gone

Add WARN_ON's into uprobe_unregister() and uprobe_apply() to ensure
that nobody tries to play with the dead uprobe/consumer. This helps
to catch the bugs like the one fixed by the previous patch.

In the longer term we should fix this poorly designed interface.
uprobe_register() should return "struct uprobe *" which should be
passed to apply/unregister. Plus other semantic changes, see the
changelog in commit 41ccba029e94.

Link: http://lkml.kernel.org/p/20140627170140.GA18322@redhat.com

Acked-by: Namhyung Kim <namhyung@kernel.org>
Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/events/uprobes.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/events')

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c445e392e93f..6f3254e8c137 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -846,7 +846,7 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u
 {
 	int err;
 
-	if (!consumer_del(uprobe, uc))	/* WARN? */
+	if (WARN_ON(!consumer_del(uprobe, uc)))
 		return;
 
 	err = register_for_each_vma(uprobe, NULL);
@@ -927,7 +927,7 @@ int uprobe_apply(struct inode *inode, loff_t offset,
 	int ret = -ENOENT;
 
 	uprobe = find_uprobe(inode, offset);
-	if (!uprobe)
+	if (WARN_ON(!uprobe))
 		return ret;
 
 	down_write(&uprobe->register_rwsem);
@@ -952,7 +952,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
 	struct uprobe *uprobe;
 
 	uprobe = find_uprobe(inode, offset);
-	if (!uprobe)
+	if (WARN_ON(!uprobe))
 		return;
 
 	down_write(&uprobe->register_rwsem);
-- 
cgit 


From 1f9a7268c67f0290837aada443d28fd953ddca90 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Tue, 24 Jun 2014 10:20:25 +0200
Subject: perf: Do not allow optimized switch for non-cloned events

The context check in perf_event_context_sched_out allows
non-cloned context to be part of the optimized schedule
out switch.

This could move non-cloned context into another workload
child. Once this child exits, the context is closed and
leaves all original (parent) events in closed state.

Any other new cloned event will have closed state and not
measure anything. And probably causing other odd bugs.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1403598026-2310-2-git-send-email-jolsa@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/events')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index a33d9a2bcbd7..b0c95f0f06fd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2320,7 +2320,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 	next_parent = rcu_dereference(next_ctx->parent_ctx);
 
 	/* If neither context have a parent context; they cannot be clones. */
-	if (!parent && !next_parent)
+	if (!parent || !next_parent)
 		goto unlock;
 
 	if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
-- 
cgit 


From 1903d50cba54261a6562a476c05085f3d7a54097 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 15 Jul 2014 17:27:27 +0200
Subject: perf: Revert ("perf: Always destroy groups on exit")

Vince reported that commit 15a2d4de0eab5 ("perf: Always destroy groups
on exit") causes a regression with grouped events. In particular his
read_group_attached.c test fails.

  https://github.com/deater/perf_event_tests/blob/master/tests/bugs/read_group_attached.c

Because of the context switch optimization in
perf_event_context_sched_out() the 'original' event may end up in the
child process and when that exits the change in the patch in question
destroys the actual grouping.

Therefore revert that change and only destroy inherited groups.

Reported-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-zedy3uktcp753q8fw8dagx7a@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'kernel/events')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index b0c95f0f06fd..c46b02bfe179 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7458,7 +7458,19 @@ __perf_event_exit_task(struct perf_event *child_event,
 			 struct perf_event_context *child_ctx,
 			 struct task_struct *child)
 {
-	perf_remove_from_context(child_event, true);
+	/*
+	 * Do not destroy the 'original' grouping; because of the context
+	 * switch optimization the original events could've ended up in a
+	 * random child task.
+	 *
+	 * If we were to destroy the original group, all group related
+	 * operations would cease to function properly after this random
+	 * child dies.
+	 *
+	 * Do destroy all inherited groups, we don't care about those
+	 * and being thorough is better.
+	 */
+	perf_remove_from_context(child_event, !!child_event->parent);
 
 	/*
 	 * It can happen that the parent exits first, and has events
-- 
cgit 


From 4a1c0f262f88e2676fda80a6bf80e7dbccae1dcb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 23 Jun 2014 16:12:42 +0200
Subject: perf: Fix lockdep warning on process exit

Sasha Levin reported:

> While fuzzing with trinity inside a KVM tools guest running the latest -next
> kernel I've stumbled on the following spew:
>
> ======================================================
> [ INFO: possible circular locking dependency detected ]
> 3.15.0-next-20140613-sasha-00026-g6dd125d-dirty #654 Not tainted
> -------------------------------------------------------
> trinity-c578/9725 is trying to acquire lock:
> (&(&pool->lock)->rlock){-.-...}, at: __queue_work (kernel/workqueue.c:1346)
>
> but task is already holding lock:
> (&ctx->lock){-.....}, at: perf_event_exit_task (kernel/events/core.c:7471 kernel/events/core.c:7533)
>
> which lock already depends on the new lock.

> 1 lock held by trinity-c578/9725:
> #0: (&ctx->lock){-.....}, at: perf_event_exit_task (kernel/events/core.c:7471 kernel/events/core.c:7533)
>
>  Call Trace:
>  dump_stack (lib/dump_stack.c:52)
>  print_circular_bug (kernel/locking/lockdep.c:1216)
>  __lock_acquire (kernel/locking/lockdep.c:1840 kernel/locking/lockdep.c:1945 kernel/locking/lockdep.c:2131 kernel/locking/lockdep.c:3182)
>  lock_acquire (./arch/x86/include/asm/current.h:14 kernel/locking/lockdep.c:3602)
>  _raw_spin_lock (include/linux/spinlock_api_smp.h:143 kernel/locking/spinlock.c:151)
>  __queue_work (kernel/workqueue.c:1346)
>  queue_work_on (kernel/workqueue.c:1424)
>  free_object (lib/debugobjects.c:209)
>  __debug_check_no_obj_freed (lib/debugobjects.c:715)
>  debug_check_no_obj_freed (lib/debugobjects.c:727)
>  kmem_cache_free (mm/slub.c:2683 mm/slub.c:2711)
>  free_task (kernel/fork.c:221)
>  __put_task_struct (kernel/fork.c:250)
>  put_ctx (include/linux/sched.h:1855 kernel/events/core.c:898)
>  perf_event_exit_task (kernel/events/core.c:907 kernel/events/core.c:7478 kernel/events/core.c:7533)
>  do_exit (kernel/exit.c:766)
>  do_group_exit (kernel/exit.c:884)
>  get_signal_to_deliver (kernel/signal.c:2347)
>  do_signal (arch/x86/kernel/signal.c:698)
>  do_notify_resume (arch/x86/kernel/signal.c:751)
>  int_signal (arch/x86/kernel/entry_64.S:600)

Urgh.. so the only way I can make that happen is through:

  perf_event_exit_task_context()
    raw_spin_lock(&child_ctx->lock);
    unclone_ctx(child_ctx)
      put_ctx(ctx->parent_ctx);
    raw_spin_unlock_irqrestore(&child_ctx->lock);

And we can avoid this by doing the change below.

I can't immediately see how this changed recently, but given that you
say it's easy to reproduce, lets fix this.

Reported-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Dave Jones <davej@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/20140623141242.GB19860@laptop.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'kernel/events')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index c46b02bfe179..6b17ac1b0c2a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7486,7 +7486,7 @@ __perf_event_exit_task(struct perf_event *child_event,
 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 {
 	struct perf_event *child_event, *next;
-	struct perf_event_context *child_ctx;
+	struct perf_event_context *child_ctx, *parent_ctx;
 	unsigned long flags;
 
 	if (likely(!child->perf_event_ctxp[ctxn])) {
@@ -7511,6 +7511,15 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 	raw_spin_lock(&child_ctx->lock);
 	task_ctx_sched_out(child_ctx);
 	child->perf_event_ctxp[ctxn] = NULL;
+
+	/*
+	 * In order to avoid freeing: child_ctx->parent_ctx->task
+	 * under perf_event_context::lock, grab another reference.
+	 */
+	parent_ctx = child_ctx->parent_ctx;
+	if (parent_ctx)
+		get_ctx(parent_ctx);
+
 	/*
 	 * If this context is a clone; unclone it so it can't get
 	 * swapped to another process while we're removing all
@@ -7520,6 +7529,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 	update_context_time(child_ctx);
 	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
 
+	/*
+	 * Now that we no longer hold perf_event_context::lock, drop
+	 * our extra child_ctx->parent_ctx reference.
+	 */
+	if (parent_ctx)
+		put_ctx(parent_ctx);
+
 	/*
 	 * Report the task dead after unscheduling the events so that we
 	 * won't get any samples after PERF_RECORD_EXIT. We can however still
-- 
cgit