From 7d613f9f72ec8f90ddefcae038fdae5adb8404b3 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 1 Sep 2021 13:21:34 -0500
Subject: signal: Remove the bogus sigkill_pending in ptrace_stop

The existence of sigkill_pending is a little silly as it is
functionally a duplicate of fatal_signal_pending that is used in
exactly one place.

Checking for pending fatal signals and returning early in ptrace_stop
is actively harmful.  It casues the ptrace_stop called by
ptrace_signal to return early before setting current->exit_code.
Later when ptrace_signal reads the signal number from
current->exit_code is undefined, making it unpredictable what will
happen.

Instead rely on the fact that schedule will not sleep if there is a
pending signal that can awaken a task.

Removing the explict sigkill_pending test fixes fixes ptrace_signal
when ptrace_stop does not stop because current->exit_code is always
set to to signr.

Cc: stable@vger.kernel.org
Fixes: 3d749b9e676b ("ptrace: simplify ptrace_stop()->sigkill_pending() path")
Fixes: 1a669c2f16d4 ("Add arch_ptrace_stop")
Link: https://lkml.kernel.org/r/87pmsyx29t.fsf@disp2133
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/signal.c | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index 952741f6d0f9..9f2dc9cf3208 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2182,15 +2182,6 @@ static inline bool may_ptrace_stop(void)
 	return true;
 }
 
-/*
- * Return non-zero if there is a SIGKILL that should be waking us up.
- * Called with the siglock held.
- */
-static bool sigkill_pending(struct task_struct *tsk)
-{
-	return sigismember(&tsk->pending.signal, SIGKILL) ||
-	       sigismember(&tsk->signal->shared_pending.signal, SIGKILL);
-}
 
 /*
  * This must be called with current->sighand->siglock held.
@@ -2217,17 +2208,16 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
 		 * calling arch_ptrace_stop, so we must release it now.
 		 * To preserve proper semantics, we must do this before
 		 * any signal bookkeeping like checking group_stop_count.
-		 * Meanwhile, a SIGKILL could come in before we retake the
-		 * siglock.  That must prevent us from sleeping in TASK_TRACED.
-		 * So after regaining the lock, we must check for SIGKILL.
 		 */
 		spin_unlock_irq(&current->sighand->siglock);
 		arch_ptrace_stop(exit_code, info);
 		spin_lock_irq(&current->sighand->siglock);
-		if (sigkill_pending(current))
-			return;
 	}
 
+	/*
+	 * schedule() will not sleep if there is a pending signal that
+	 * can awaken the task.
+	 */
 	set_special_state(TASK_TRACED);
 
 	/*
-- 
cgit 


From 4f627af8e6068892cafe031df6c14e8a0aaaa426 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 2 Sep 2021 16:10:11 -0500
Subject: ptrace: Remove the unnecessary arguments from arch_ptrace_stop

Both arch_ptrace_stop_needed and arch_ptrace_stop are called with an
exit_code and a siginfo structure.  Neither argument is used by any of
the implementations so just remove the unneeded arguments.

The two arechitectures that implement arch_ptrace_stop are ia64 and
sparc.  Both architectures flush their register stacks before a
ptrace_stack so that all of the register information can be accessed
by debuggers.

As the question of if a register stack needs to be flushed is
independent of why ptrace is stopping not needing arguments make sense.

Cc: David Miller <davem@davemloft.net>
Cc: sparclinux@vger.kernel.org
Link: https://lkml.kernel.org/r/87lf3mx290.fsf@disp2133
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 arch/ia64/include/asm/ptrace.h  |  4 ++--
 arch/sparc/include/asm/ptrace.h |  8 ++++----
 include/linux/ptrace.h          | 22 +++++++++-------------
 kernel/signal.c                 |  4 ++--
 4 files changed, 17 insertions(+), 21 deletions(-)

diff --git a/arch/ia64/include/asm/ptrace.h b/arch/ia64/include/asm/ptrace.h
index 08179135905c..f15504f75f10 100644
--- a/arch/ia64/include/asm/ptrace.h
+++ b/arch/ia64/include/asm/ptrace.h
@@ -129,9 +129,9 @@ static inline long regs_return_value(struct pt_regs *regs)
   extern void ia64_decrement_ip (struct pt_regs *pt);
 
   extern void ia64_ptrace_stop(void);
-  #define arch_ptrace_stop(code, info) \
+  #define arch_ptrace_stop() \
 	ia64_ptrace_stop()
-  #define arch_ptrace_stop_needed(code, info) \
+  #define arch_ptrace_stop_needed() \
 	(!test_thread_flag(TIF_RESTORE_RSE))
 
   extern void ptrace_attach_sync_user_rbs (struct task_struct *);
diff --git a/arch/sparc/include/asm/ptrace.h b/arch/sparc/include/asm/ptrace.h
index 71dd82b43cc5..d1419e669027 100644
--- a/arch/sparc/include/asm/ptrace.h
+++ b/arch/sparc/include/asm/ptrace.h
@@ -26,12 +26,12 @@ static inline bool pt_regs_clear_syscall(struct pt_regs *regs)
 	return (regs->tstate &= ~TSTATE_SYSCALL);
 }
 
-#define arch_ptrace_stop_needed(exit_code, info) \
+#define arch_ptrace_stop_needed() \
 ({	flush_user_windows(); \
 	get_thread_wsaved() != 0; \
 })
 
-#define arch_ptrace_stop(exit_code, info) \
+#define arch_ptrace_stop() \
 	synchronize_user_stack()
 
 #define current_pt_regs() \
@@ -129,12 +129,12 @@ static inline bool pt_regs_clear_syscall(struct pt_regs *regs)
 	return (regs->psr &= ~PSR_SYSCALL);
 }
 
-#define arch_ptrace_stop_needed(exit_code, info) \
+#define arch_ptrace_stop_needed() \
 ({	flush_user_windows(); \
 	current_thread_info()->w_saved != 0;	\
 })
 
-#define arch_ptrace_stop(exit_code, info) \
+#define arch_ptrace_stop() \
 	synchronize_user_stack()
 
 #define current_pt_regs() \
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index b5ebf6c01292..8aee2945ff08 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -362,29 +362,25 @@ static inline void user_single_step_report(struct pt_regs *regs)
 #ifndef arch_ptrace_stop_needed
 /**
  * arch_ptrace_stop_needed - Decide whether arch_ptrace_stop() should be called
- * @code:	current->exit_code value ptrace will stop with
- * @info:	siginfo_t pointer (or %NULL) for signal ptrace will stop with
  *
  * This is called with the siglock held, to decide whether or not it's
- * necessary to release the siglock and call arch_ptrace_stop() with the
- * same @code and @info arguments.  It can be defined to a constant if
- * arch_ptrace_stop() is never required, or always is.  On machines where
- * this makes sense, it should be defined to a quick test to optimize out
- * calling arch_ptrace_stop() when it would be superfluous.  For example,
- * if the thread has not been back to user mode since the last stop, the
- * thread state might indicate that nothing needs to be done.
+ * necessary to release the siglock and call arch_ptrace_stop().  It can be
+ * defined to a constant if arch_ptrace_stop() is never required, or always
+ * is.  On machines where this makes sense, it should be defined to a quick
+ * test to optimize out calling arch_ptrace_stop() when it would be
+ * superfluous.  For example, if the thread has not been back to user mode
+ * since the last stop, the thread state might indicate that nothing needs
+ * to be done.
  *
  * This is guaranteed to be invoked once before a task stops for ptrace and
  * may include arch-specific operations necessary prior to a ptrace stop.
  */
-#define arch_ptrace_stop_needed(code, info)	(0)
+#define arch_ptrace_stop_needed()	(0)
 #endif
 
 #ifndef arch_ptrace_stop
 /**
  * arch_ptrace_stop - Do machine-specific work before stopping for ptrace
- * @code:	current->exit_code value ptrace will stop with
- * @info:	siginfo_t pointer (or %NULL) for signal ptrace will stop with
  *
  * This is called with no locks held when arch_ptrace_stop_needed() has
  * just returned nonzero.  It is allowed to block, e.g. for user memory
@@ -394,7 +390,7 @@ static inline void user_single_step_report(struct pt_regs *regs)
  * we only do it when the arch requires it for this particular stop, as
  * indicated by arch_ptrace_stop_needed().
  */
-#define arch_ptrace_stop(code, info)		do { } while (0)
+#define arch_ptrace_stop()		do { } while (0)
 #endif
 
 #ifndef current_pt_regs
diff --git a/kernel/signal.c b/kernel/signal.c
index 9f2dc9cf3208..c9759ff2cb43 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2200,7 +2200,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
 {
 	bool gstop_done = false;
 
-	if (arch_ptrace_stop_needed(exit_code, info)) {
+	if (arch_ptrace_stop_needed()) {
 		/*
 		 * The arch code has something special to do before a
 		 * ptrace stop.  This is allowed to block, e.g. for faults
@@ -2210,7 +2210,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
 		 * any signal bookkeeping like checking group_stop_count.
 		 */
 		spin_unlock_irq(&current->sighand->siglock);
-		arch_ptrace_stop(exit_code, info);
+		arch_ptrace_stop();
 		spin_lock_irq(&current->sighand->siglock);
 	}
 
-- 
cgit 


From 7e3c4fb7fc19bcf20657de3edb718ec1b26c7df3 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 3 Sep 2021 10:26:05 -0500
Subject: exec: Check for a pending fatal signal instead of core_state

Prevent exec continuing when a fatal signal is pending by replacing
mmap_read_lock with mmap_read_lock_killable.  This is always the right
thing to do as userspace will never observe an exec complete when
there is a fatal signal pending.

With that change it becomes unnecessary to explicitly test for a core
dump in progress.  In coredump_wait zap_threads arranges under
mmap_write_lock for all tasks that use a mm to also have SIGKILL
pending, which means mmap_read_lock_killable will always return -EINTR
when old_mm->core_state is present.

Link: https://lkml.kernel.org/r/87fstux27w.fsf@disp2133
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/exec.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index a098c133d8d7..b6079f1a098e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -987,16 +987,14 @@ static int exec_mmap(struct mm_struct *mm)
 
 	if (old_mm) {
 		/*
-		 * Make sure that if there is a core dump in progress
-		 * for the old mm, we get out and die instead of going
-		 * through with the exec.  We must hold mmap_lock around
-		 * checking core_state and changing tsk->mm.
+		 * If there is a pending fatal signal perhaps a signal
+		 * whose default action is to create a coredump get
+		 * out and die instead of going through with the exec.
 		 */
-		mmap_read_lock(old_mm);
-		if (unlikely(old_mm->core_state)) {
-			mmap_read_unlock(old_mm);
+		ret = mmap_read_lock_killable(old_mm);
+		if (ret) {
 			up_write(&tsk->signal->exec_update_lock);
-			return -EINTR;
+			return ret;
 		}
 	}
 
-- 
cgit 


From d67e03e361619b20c51aaef3b7dd1497617c371d Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 1 Sep 2021 11:23:38 -0500
Subject: exit: Factor coredump_exit_mm out of exit_mm

Separate the coredump logic from the ordinary exit_mm logic
by moving the coredump logic out of exit_mm into it's own
function coredump_exit_mm.

Link: https://lkml.kernel.org/r/87a6k2x277.fsf@disp2133
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/coredump.c |  6 ++---
 kernel/exit.c | 76 ++++++++++++++++++++++++++++++++---------------------------
 mm/oom_kill.c |  6 ++---
 3 files changed, 47 insertions(+), 41 deletions(-)

diff --git a/fs/coredump.c b/fs/coredump.c
index 3224dee44d30..5e0e08a7fb9b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -404,8 +404,8 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
 	 *
 	 * do_exit:
 	 *	The caller holds mm->mmap_lock. This means that the task which
-	 *	uses this mm can't pass exit_mm(), so it can't exit or clear
-	 *	its ->mm.
+	 *	uses this mm can't pass coredump_exit_mm(), so it can't exit or
+	 *	clear its ->mm.
 	 *
 	 * de_thread:
 	 *	It does list_replace_rcu(&leader->tasks, &current->tasks),
@@ -500,7 +500,7 @@ static void coredump_finish(struct mm_struct *mm, bool core_dumped)
 		next = curr->next;
 		task = curr->task;
 		/*
-		 * see exit_mm(), curr->task must not see
+		 * see coredump_exit_mm(), curr->task must not see
 		 * ->task == NULL before we read ->next.
 		 */
 		smp_mb();
diff --git a/kernel/exit.c b/kernel/exit.c
index 91a43e57a32e..cb1619d8fd64 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -339,6 +339,46 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
 	}
 }
 
+static void coredump_exit_mm(struct mm_struct *mm)
+{
+	struct core_state *core_state;
+
+	/*
+	 * Serialize with any possible pending coredump.
+	 * We must hold mmap_lock around checking core_state
+	 * and clearing tsk->mm.  The core-inducing thread
+	 * will increment ->nr_threads for each thread in the
+	 * group with ->mm != NULL.
+	 */
+	core_state = mm->core_state;
+	if (core_state) {
+		struct core_thread self;
+
+		mmap_read_unlock(mm);
+
+		self.task = current;
+		if (self.task->flags & PF_SIGNALED)
+			self.next = xchg(&core_state->dumper.next, &self);
+		else
+			self.task = NULL;
+		/*
+		 * Implies mb(), the result of xchg() must be visible
+		 * to core_state->dumper.
+		 */
+		if (atomic_dec_and_test(&core_state->nr_threads))
+			complete(&core_state->startup);
+
+		for (;;) {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			if (!self.task) /* see coredump_finish() */
+				break;
+			freezable_schedule();
+		}
+		__set_current_state(TASK_RUNNING);
+		mmap_read_lock(mm);
+	}
+}
+
 #ifdef CONFIG_MEMCG
 /*
  * A task is exiting.   If it owned this mm, find a new owner for the mm.
@@ -434,47 +474,13 @@ assign_new_owner:
 static void exit_mm(void)
 {
 	struct mm_struct *mm = current->mm;
-	struct core_state *core_state;
 
 	exit_mm_release(current, mm);
 	if (!mm)
 		return;
 	sync_mm_rss(mm);
-	/*
-	 * Serialize with any possible pending coredump.
-	 * We must hold mmap_lock around checking core_state
-	 * and clearing tsk->mm.  The core-inducing thread
-	 * will increment ->nr_threads for each thread in the
-	 * group with ->mm != NULL.
-	 */
 	mmap_read_lock(mm);
-	core_state = mm->core_state;
-	if (core_state) {
-		struct core_thread self;
-
-		mmap_read_unlock(mm);
-
-		self.task = current;
-		if (self.task->flags & PF_SIGNALED)
-			self.next = xchg(&core_state->dumper.next, &self);
-		else
-			self.task = NULL;
-		/*
-		 * Implies mb(), the result of xchg() must be visible
-		 * to core_state->dumper.
-		 */
-		if (atomic_dec_and_test(&core_state->nr_threads))
-			complete(&core_state->startup);
-
-		for (;;) {
-			set_current_state(TASK_UNINTERRUPTIBLE);
-			if (!self.task) /* see coredump_finish() */
-				break;
-			freezable_schedule();
-		}
-		__set_current_state(TASK_RUNNING);
-		mmap_read_lock(mm);
-	}
+	coredump_exit_mm(mm);
 	mmgrab(mm);
 	BUG_ON(mm != current->active_mm);
 	/* more a memory barrier than a real lock */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 831340e7ad8b..295c8bdfd6c8 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -787,9 +787,9 @@ static inline bool __task_will_free_mem(struct task_struct *task)
 	struct signal_struct *sig = task->signal;
 
 	/*
-	 * A coredumping process may sleep for an extended period in exit_mm(),
-	 * so the oom killer cannot assume that the process will promptly exit
-	 * and release memory.
+	 * A coredumping process may sleep for an extended period in
+	 * coredump_exit_mm(), so the oom killer cannot assume that
+	 * the process will promptly exit and release memory.
 	 */
 	if (sig->flags & SIGNAL_GROUP_COREDUMP)
 		return false;
-- 
cgit 


From 92307383082daff5df884a25df9e283efb7ef261 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 1 Sep 2021 11:33:50 -0500
Subject: coredump:  Don't perform any cleanups before dumping core

Rename coredump_exit_mm to coredump_task_exit and call it from do_exit
before PTRACE_EVENT_EXIT, and before any cleanup work for a task
happens.  This ensures that an accurate copy of the process can be
captured in the coredump as no cleanup for the process happens before
the coredump completes.  This also ensures that PTRACE_EVENT_EXIT
will not be visited by any thread until the coredump is complete.

Add a new flag PF_POSTCOREDUMP so that tasks that have passed through
coredump_task_exit can be recognized and ignored in zap_process.

Now that all of the coredumping happens before exit_mm remove code to
test for a coredump in progress from mm_release.

Replace "may_ptrace_stop()" with a simple test of "current->ptrace".
The other tests in may_ptrace_stop all concern avoiding stopping
during a coredump.  These tests are no longer necessary as it is now
guaranteed that fatal_signal_pending will be set if the code enters
ptrace_stop during a coredump.  The code in ptrace_stop is guaranteed
not to stop if fatal_signal_pending returns true.

Until this change "ptrace_event(PTRACE_EVENT_EXIT)" could call
ptrace_stop without fatal_signal_pending being true, as signals are
dequeued in get_signal before calling do_exit.  This is no longer
an issue as "ptrace_event(PTRACE_EVENT_EXIT)" is no longer reached
until after the coredump completes.

Link: https://lkml.kernel.org/r/874kaax26c.fsf@disp2133
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/coredump.c         |  8 ++++----
 include/linux/sched.h |  1 +
 kernel/exit.c         | 19 ++++++++++++-------
 kernel/fork.c         |  3 +--
 kernel/signal.c       | 27 +--------------------------
 mm/oom_kill.c         |  2 +-
 6 files changed, 20 insertions(+), 40 deletions(-)

diff --git a/fs/coredump.c b/fs/coredump.c
index 5e0e08a7fb9b..d576287fb88b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -359,7 +359,7 @@ static int zap_process(struct task_struct *start, int exit_code, int flags)
 
 	for_each_thread(start, t) {
 		task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
-		if (t != current && t->mm) {
+		if (t != current && !(t->flags & PF_POSTCOREDUMP)) {
 			sigaddset(&t->pending.signal, SIGKILL);
 			signal_wake_up(t, 1);
 			nr++;
@@ -404,8 +404,8 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
 	 *
 	 * do_exit:
 	 *	The caller holds mm->mmap_lock. This means that the task which
-	 *	uses this mm can't pass coredump_exit_mm(), so it can't exit or
-	 *	clear its ->mm.
+	 *	uses this mm can't pass coredump_task_exit(), so it can't exit
+	 *	or clear its ->mm.
 	 *
 	 * de_thread:
 	 *	It does list_replace_rcu(&leader->tasks, &current->tasks),
@@ -500,7 +500,7 @@ static void coredump_finish(struct mm_struct *mm, bool core_dumped)
 		next = curr->next;
 		task = curr->task;
 		/*
-		 * see coredump_exit_mm(), curr->task must not see
+		 * see coredump_task_exit(), curr->task must not see
 		 * ->task == NULL before we read ->next.
 		 */
 		smp_mb();
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e12b524426b0..f3741f23935e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1664,6 +1664,7 @@ extern struct pid *cad_pid;
 #define PF_VCPU			0x00000001	/* I'm a virtual CPU */
 #define PF_IDLE			0x00000002	/* I am an IDLE thread */
 #define PF_EXITING		0x00000004	/* Getting shut down */
+#define PF_POSTCOREDUMP		0x00000008	/* Coredumps should ignore this task */
 #define PF_IO_WORKER		0x00000010	/* Task is an IO worker */
 #define PF_WQ_WORKER		0x00000020	/* I'm a workqueue worker */
 #define PF_FORKNOEXEC		0x00000040	/* Forked but didn't exec */
diff --git a/kernel/exit.c b/kernel/exit.c
index cb1619d8fd64..774e6b5061b8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -339,23 +339,29 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
 	}
 }
 
-static void coredump_exit_mm(struct mm_struct *mm)
+static void coredump_task_exit(struct task_struct *tsk)
 {
 	struct core_state *core_state;
+	struct mm_struct *mm;
+
+	mm = tsk->mm;
+	if (!mm)
+		return;
 
 	/*
 	 * Serialize with any possible pending coredump.
 	 * We must hold mmap_lock around checking core_state
-	 * and clearing tsk->mm.  The core-inducing thread
+	 * and setting PF_POSTCOREDUMP.  The core-inducing thread
 	 * will increment ->nr_threads for each thread in the
-	 * group with ->mm != NULL.
+	 * group without PF_POSTCOREDUMP set.
 	 */
+	mmap_read_lock(mm);
+	tsk->flags |= PF_POSTCOREDUMP;
 	core_state = mm->core_state;
+	mmap_read_unlock(mm);
 	if (core_state) {
 		struct core_thread self;
 
-		mmap_read_unlock(mm);
-
 		self.task = current;
 		if (self.task->flags & PF_SIGNALED)
 			self.next = xchg(&core_state->dumper.next, &self);
@@ -375,7 +381,6 @@ static void coredump_exit_mm(struct mm_struct *mm)
 			freezable_schedule();
 		}
 		__set_current_state(TASK_RUNNING);
-		mmap_read_lock(mm);
 	}
 }
 
@@ -480,7 +485,6 @@ static void exit_mm(void)
 		return;
 	sync_mm_rss(mm);
 	mmap_read_lock(mm);
-	coredump_exit_mm(mm);
 	mmgrab(mm);
 	BUG_ON(mm != current->active_mm);
 	/* more a memory barrier than a real lock */
@@ -768,6 +772,7 @@ void __noreturn do_exit(long code)
 	profile_task_exit(tsk);
 	kcov_task_exit(tsk);
 
+	coredump_task_exit(tsk);
 	ptrace_event(PTRACE_EVENT_EXIT, code);
 
 	validate_creds_for_do_exit(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 38681ad44c76..9bd9f2da9e41 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1392,8 +1392,7 @@ static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 	 * purposes.
 	 */
 	if (tsk->clear_child_tid) {
-		if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
-		    atomic_read(&mm->mm_users) > 1) {
+		if (atomic_read(&mm->mm_users) > 1) {
 			/*
 			 * We don't check the error code - if userspace has
 			 * not set up a proper pointer then tough luck.
diff --git a/kernel/signal.c b/kernel/signal.c
index c9759ff2cb43..b0db80acc6ef 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2158,31 +2158,6 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
 	spin_unlock_irqrestore(&sighand->siglock, flags);
 }
 
-static inline bool may_ptrace_stop(void)
-{
-	if (!likely(current->ptrace))
-		return false;
-	/*
-	 * Are we in the middle of do_coredump?
-	 * If so and our tracer is also part of the coredump stopping
-	 * is a deadlock situation, and pointless because our tracer
-	 * is dead so don't allow us to stop.
-	 * If SIGKILL was already sent before the caller unlocked
-	 * ->siglock we must see ->core_state != NULL. Otherwise it
-	 * is safe to enter schedule().
-	 *
-	 * This is almost outdated, a task with the pending SIGKILL can't
-	 * block in TASK_TRACED. But PTRACE_EVENT_EXIT can be reported
-	 * after SIGKILL was already dequeued.
-	 */
-	if (unlikely(current->mm->core_state) &&
-	    unlikely(current->mm == current->parent->mm))
-		return false;
-
-	return true;
-}
-
-
 /*
  * This must be called with current->sighand->siglock held.
  *
@@ -2263,7 +2238,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
 
 	spin_unlock_irq(&current->sighand->siglock);
 	read_lock(&tasklist_lock);
-	if (may_ptrace_stop()) {
+	if (likely(current->ptrace)) {
 		/*
 		 * Notify parents of the stop.
 		 *
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 295c8bdfd6c8..7877c755ab37 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -788,7 +788,7 @@ static inline bool __task_will_free_mem(struct task_struct *task)
 
 	/*
 	 * A coredumping process may sleep for an extended period in
-	 * coredump_exit_mm(), so the oom killer cannot assume that
+	 * coredump_task_exit(), so the oom killer cannot assume that
 	 * the process will promptly exit and release memory.
 	 */
 	if (sig->flags & SIGNAL_GROUP_COREDUMP)
-- 
cgit 


From 0258b5fd7c7124b87e185a1a9322d2c66b1876b7 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 22 Sep 2021 11:24:02 -0500
Subject: coredump: Limit coredumps to a single thread group

Today when a signal is delivered with a handler of SIG_DFL whose
default behavior is to generate a core dump not only that process but
every process that shares the mm is killed.

In the case of vfork this looks like a real world problem.  Consider
the following well defined sequence.

	if (vfork() == 0) {
		execve(...);
		_exit(EXIT_FAILURE);
	}

If a signal that generates a core dump is received after vfork but
before the execve changes the mm the process that called vfork will
also be killed (as the mm is shared).

Similarly if the execve fails after the point of no return the kernel
delivers SIGSEGV which will kill both the exec'ing process and because
the mm is shared the process that called vfork as well.

As far as I can tell this behavior is a violation of people's
reasonable expectations, POSIX, and is unnecessarily fragile when the
system is low on memory.

Solve this by making a userspace visible change to only kill a single
process/thread group.  This is possible because Jann Horn recently
modified[1] the coredump code so that the mm can safely be modified
while the coredump is happening.  With LinuxThreads long gone I don't
expect anyone to have a notice this behavior change in practice.

To accomplish this move the core_state pointer from mm_struct to
signal_struct, which allows different thread groups to coredump
simultatenously.

In zap_threads remove the work to kill anything except for the current
thread group.

v2: Remove core_state from the VM_BUG_ON_MM print to fix
    compile failure when CONFIG_DEBUG_VM is enabled.
    Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>

[1] a07279c9a8cd ("binfmt_elf, binfmt_elf_fdpic: use a VMA list snapshot")
Fixes: d89f3847def4 ("[PATCH] thread-aware coredumps, 2.5.43-C3")
History-tree: git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
Link: https://lkml.kernel.org/r/87y27mvnke.fsf@disp2133
Link: https://lkml.kernel.org/r/20211007144701.67592574@canb.auug.org.au
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/binfmt_elf.c              |  4 +--
 fs/binfmt_elf_fdpic.c        |  2 +-
 fs/coredump.c                | 84 +++++---------------------------------------
 fs/proc/array.c              |  6 ++--
 include/linux/mm_types.h     | 13 -------
 include/linux/sched/signal.h | 13 +++++++
 kernel/exit.c                | 13 +++----
 kernel/fork.c                |  1 -
 mm/debug.c                   |  4 +--
 9 files changed, 34 insertions(+), 106 deletions(-)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 69d900a8473d..796e5327ee7d 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1834,7 +1834,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
 	/*
 	 * Allocate a structure for each thread.
 	 */
-	for (ct = &dump_task->mm->core_state->dumper; ct; ct = ct->next) {
+	for (ct = &dump_task->signal->core_state->dumper; ct; ct = ct->next) {
 		t = kzalloc(offsetof(struct elf_thread_core_info,
 				     notes[info->thread_notes]),
 			    GFP_KERNEL);
@@ -2024,7 +2024,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
 	if (!elf_note_info_init(info))
 		return 0;
 
-	for (ct = current->mm->core_state->dumper.next;
+	for (ct = current->signal->core_state->dumper.next;
 					ct; ct = ct->next) {
 		ets = kzalloc(sizeof(*ets), GFP_KERNEL);
 		if (!ets)
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 6d8fd6030cbb..c6f588dc4a9d 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1494,7 +1494,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 	if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
 		goto end_coredump;
 
-	for (ct = current->mm->core_state->dumper.next;
+	for (ct = current->signal->core_state->dumper.next;
 					ct; ct = ct->next) {
 		tmp = elf_dump_thread_status(cprm->siginfo->si_signo,
 					     ct->task, &thread_status_size);
diff --git a/fs/coredump.c b/fs/coredump.c
index d576287fb88b..a6b3c196cdef 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -369,99 +369,34 @@ static int zap_process(struct task_struct *start, int exit_code, int flags)
 	return nr;
 }
 
-static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
+static int zap_threads(struct task_struct *tsk,
 			struct core_state *core_state, int exit_code)
 {
-	struct task_struct *g, *p;
-	unsigned long flags;
 	int nr = -EAGAIN;
 
 	spin_lock_irq(&tsk->sighand->siglock);
 	if (!signal_group_exit(tsk->signal)) {
-		mm->core_state = core_state;
+		tsk->signal->core_state = core_state;
 		tsk->signal->group_exit_task = tsk;
 		nr = zap_process(tsk, exit_code, 0);
 		clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
+		tsk->flags |= PF_DUMPCORE;
+		atomic_set(&core_state->nr_threads, nr);
 	}
 	spin_unlock_irq(&tsk->sighand->siglock);
-	if (unlikely(nr < 0))
-		return nr;
-
-	tsk->flags |= PF_DUMPCORE;
-	if (atomic_read(&mm->mm_users) == nr + 1)
-		goto done;
-	/*
-	 * We should find and kill all tasks which use this mm, and we should
-	 * count them correctly into ->nr_threads. We don't take tasklist
-	 * lock, but this is safe wrt:
-	 *
-	 * fork:
-	 *	None of sub-threads can fork after zap_process(leader). All
-	 *	processes which were created before this point should be
-	 *	visible to zap_threads() because copy_process() adds the new
-	 *	process to the tail of init_task.tasks list, and lock/unlock
-	 *	of ->siglock provides a memory barrier.
-	 *
-	 * do_exit:
-	 *	The caller holds mm->mmap_lock. This means that the task which
-	 *	uses this mm can't pass coredump_task_exit(), so it can't exit
-	 *	or clear its ->mm.
-	 *
-	 * de_thread:
-	 *	It does list_replace_rcu(&leader->tasks, &current->tasks),
-	 *	we must see either old or new leader, this does not matter.
-	 *	However, it can change p->sighand, so lock_task_sighand(p)
-	 *	must be used. Since p->mm != NULL and we hold ->mmap_lock
-	 *	it can't fail.
-	 *
-	 *	Note also that "g" can be the old leader with ->mm == NULL
-	 *	and already unhashed and thus removed from ->thread_group.
-	 *	This is OK, __unhash_process()->list_del_rcu() does not
-	 *	clear the ->next pointer, we will find the new leader via
-	 *	next_thread().
-	 */
-	rcu_read_lock();
-	for_each_process(g) {
-		if (g == tsk->group_leader)
-			continue;
-		if (g->flags & PF_KTHREAD)
-			continue;
-
-		for_each_thread(g, p) {
-			if (unlikely(!p->mm))
-				continue;
-			if (unlikely(p->mm == mm)) {
-				lock_task_sighand(p, &flags);
-				nr += zap_process(p, exit_code,
-							SIGNAL_GROUP_EXIT);
-				unlock_task_sighand(p, &flags);
-			}
-			break;
-		}
-	}
-	rcu_read_unlock();
-done:
-	atomic_set(&core_state->nr_threads, nr);
 	return nr;
 }
 
 static int coredump_wait(int exit_code, struct core_state *core_state)
 {
 	struct task_struct *tsk = current;
-	struct mm_struct *mm = tsk->mm;
 	int core_waiters = -EBUSY;
 
 	init_completion(&core_state->startup);
 	core_state->dumper.task = tsk;
 	core_state->dumper.next = NULL;
 
-	if (mmap_write_lock_killable(mm))
-		return -EINTR;
-
-	if (!mm->core_state)
-		core_waiters = zap_threads(tsk, mm, core_state, exit_code);
-	mmap_write_unlock(mm);
-
+	core_waiters = zap_threads(tsk, core_state, exit_code);
 	if (core_waiters > 0) {
 		struct core_thread *ptr;
 
@@ -483,7 +418,7 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
 	return core_waiters;
 }
 
-static void coredump_finish(struct mm_struct *mm, bool core_dumped)
+static void coredump_finish(bool core_dumped)
 {
 	struct core_thread *curr, *next;
 	struct task_struct *task;
@@ -493,9 +428,10 @@ static void coredump_finish(struct mm_struct *mm, bool core_dumped)
 		current->signal->group_exit_code |= 0x80;
 	current->signal->group_exit_task = NULL;
 	current->signal->flags = SIGNAL_GROUP_EXIT;
+	next = current->signal->core_state->dumper.next;
+	current->signal->core_state = NULL;
 	spin_unlock_irq(&current->sighand->siglock);
 
-	next = mm->core_state->dumper.next;
 	while ((curr = next) != NULL) {
 		next = curr->next;
 		task = curr->task;
@@ -507,8 +443,6 @@ static void coredump_finish(struct mm_struct *mm, bool core_dumped)
 		curr->task = NULL;
 		wake_up_process(task);
 	}
-
-	mm->core_state = NULL;
 }
 
 static bool dump_interrupted(void)
@@ -839,7 +773,7 @@ fail_dropcount:
 fail_unlock:
 	kfree(argv);
 	kfree(cn.corename);
-	coredump_finish(mm, core_dumped);
+	coredump_finish(core_dumped);
 	revert_creds(old_cred);
 fail_creds:
 	put_cred(cred);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 49be8c8ef555..520c51be1e57 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -408,9 +408,9 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
 		   cpumask_pr_args(&task->cpus_mask));
 }
 
-static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
+static inline void task_core_dumping(struct seq_file *m, struct task_struct *task)
 {
-	seq_put_decimal_ull(m, "CoreDumping:\t", !!mm->core_state);
+	seq_put_decimal_ull(m, "CoreDumping:\t", !!task->signal->core_state);
 	seq_putc(m, '\n');
 }
 
@@ -436,7 +436,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 
 	if (mm) {
 		task_mem(m, mm);
-		task_core_dumping(m, mm);
+		task_core_dumping(m, task);
 		task_thp_status(m, mm);
 		mmput(mm);
 	}
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7f8ee09c711f..1039f6ae922c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -387,17 +387,6 @@ struct vm_area_struct {
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
 } __randomize_layout;
 
-struct core_thread {
-	struct task_struct *task;
-	struct core_thread *next;
-};
-
-struct core_state {
-	atomic_t nr_threads;
-	struct core_thread dumper;
-	struct completion startup;
-};
-
 struct kioctx_table;
 struct mm_struct {
 	struct {
@@ -518,8 +507,6 @@ struct mm_struct {
 
 		unsigned long flags; /* Must use atomic bitops to access */
 
-		struct core_state *core_state; /* coredumping support */
-
 #ifdef CONFIG_AIO
 		spinlock_t			ioctx_lock;
 		struct kioctx_table __rcu	*ioctx_table;
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index e5f4ce622ee6..a8fe2a593a3a 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -72,6 +72,17 @@ struct multiprocess_signals {
 	struct hlist_node node;
 };
 
+struct core_thread {
+	struct task_struct *task;
+	struct core_thread *next;
+};
+
+struct core_state {
+	atomic_t nr_threads;
+	struct core_thread dumper;
+	struct completion startup;
+};
+
 /*
  * NOTE! "signal_struct" does not have its own
  * locking, because a shared signal_struct always
@@ -110,6 +121,8 @@ struct signal_struct {
 	int			group_stop_count;
 	unsigned int		flags; /* see SIGNAL_* flags below */
 
+	struct core_state *core_state; /* coredumping support */
+
 	/*
 	 * PR_SET_CHILD_SUBREAPER marks a process, like a service
 	 * manager, to re-parent orphan (double-forking) child processes
diff --git a/kernel/exit.c b/kernel/exit.c
index 774e6b5061b8..2b355e926c13 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -342,23 +342,18 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
 static void coredump_task_exit(struct task_struct *tsk)
 {
 	struct core_state *core_state;
-	struct mm_struct *mm;
-
-	mm = tsk->mm;
-	if (!mm)
-		return;
 
 	/*
 	 * Serialize with any possible pending coredump.
-	 * We must hold mmap_lock around checking core_state
+	 * We must hold siglock around checking core_state
 	 * and setting PF_POSTCOREDUMP.  The core-inducing thread
 	 * will increment ->nr_threads for each thread in the
 	 * group without PF_POSTCOREDUMP set.
 	 */
-	mmap_read_lock(mm);
+	spin_lock_irq(&tsk->sighand->siglock);
 	tsk->flags |= PF_POSTCOREDUMP;
-	core_state = mm->core_state;
-	mmap_read_unlock(mm);
+	core_state = tsk->signal->core_state;
+	spin_unlock_irq(&tsk->sighand->siglock);
 	if (core_state) {
 		struct core_thread self;
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 9bd9f2da9e41..c8adb76982f7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1044,7 +1044,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	seqcount_init(&mm->write_protect_seq);
 	mmap_init_lock(mm);
 	INIT_LIST_HEAD(&mm->mmlist);
-	mm->core_state = NULL;
 	mm_pgtables_bytes_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;
diff --git a/mm/debug.c b/mm/debug.c
index e73fe0a8ec3d..aa5fe4d5c4b4 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -214,7 +214,7 @@ void dump_mm(const struct mm_struct *mm)
 		"start_code %lx end_code %lx start_data %lx end_data %lx\n"
 		"start_brk %lx brk %lx start_stack %lx\n"
 		"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
-		"binfmt %px flags %lx core_state %px\n"
+		"binfmt %px flags %lx\n"
 #ifdef CONFIG_AIO
 		"ioctx_table %px\n"
 #endif
@@ -246,7 +246,7 @@ void dump_mm(const struct mm_struct *mm)
 		mm->start_code, mm->end_code, mm->start_data, mm->end_data,
 		mm->start_brk, mm->brk, mm->start_stack,
 		mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
-		mm->binfmt, mm->flags, mm->core_state,
+		mm->binfmt, mm->flags,
 #ifdef CONFIG_AIO
 		mm->ioctx_table,
 #endif
-- 
cgit