From 1fb53567a3633740aac8761eb7023dc5671f0edb Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 5 May 2017 13:45:14 -0500
Subject: pids: Move task_pid_type into sched/signal.h

The function is general and inline so there is no need
to hide it inside of exit.c

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/exit.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index c3c7ac560114..16432428fc6c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1001,14 +1001,6 @@ struct wait_opts {
 	int			notask_error;
 };
 
-static inline
-struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
-{
-	if (type != PIDTYPE_PID)
-		task = task->group_leader;
-	return task->pids[type].pid;
-}
-
 static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
 {
 	return	wo->wo_type == PIDTYPE_MAX ||
-- 
cgit 


From 7a36094d61bfe9843de5484ff0140227983ac5d5 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 26 Sep 2017 12:45:33 -0500
Subject: pids: Compute task_tgid using signal->leader_pid

The cost is the the same and this removes the need
to worry about complications that come from de_thread
and group_leader changing.

__task_pid_nr_ns has been updated to take advantage of this change.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/pid.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index 157fe4b19971..d0de2b59f86f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -421,13 +421,14 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
 	if (!ns)
 		ns = task_active_pid_ns(current);
 	if (likely(pid_alive(task))) {
-		if (type != PIDTYPE_PID) {
-			if (type == __PIDTYPE_TGID)
-				type = PIDTYPE_PID;
-
-			task = task->group_leader;
-		}
-		nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
+		struct pid *pid;
+		if (type == PIDTYPE_PID)
+			pid = task_pid(task);
+		else if (type == __PIDTYPE_TGID)
+			pid = task_tgid(task);
+		else
+			pid = rcu_dereference(task->group_leader->pids[type].pid);
+		nr = pid_nr_ns(pid, ns);
 	}
 	rcu_read_unlock();
 
-- 
cgit 


From 2c4704756cab7cfa031ada4dab361562f0e357c0 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 26 Sep 2017 13:06:43 -0500
Subject: pids: Move the pgrp and session pid pointers from task_struct to
 signal_struct

To access these fields the code always has to go to group leader so
going to signal struct is no loss and is actually a fundamental simplification.

This saves a little bit of memory by only allocating the pid pointer array
once instead of once for every thread, and even better this removes a
few potential races caused by the fact that group_leader can be changed
by de_thread, while signal_struct can not.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/fork.c | 23 ++++++++++++++++++-----
 kernel/pid.c  | 45 ++++++++++++++++++++++-----------------------
 2 files changed, 40 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 9440d61b925c..d2952162399b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1549,10 +1549,22 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
 static inline void posix_cpu_timers_init(struct task_struct *tsk) { }
 #endif
 
+static inline void init_task_pid_links(struct task_struct *task)
+{
+	enum pid_type type;
+
+	for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
+		INIT_HLIST_NODE(&task->pid_links[type]);
+	}
+}
+
 static inline void
 init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
 {
-	 task->pids[type].pid = pid;
+	if (type == PIDTYPE_PID)
+		task->thread_pid = pid;
+	else
+		task->signal->pids[type] = pid;
 }
 
 static inline void rcu_copy_process(struct task_struct *p)
@@ -1928,6 +1940,7 @@ static __latent_entropy struct task_struct *copy_process(
 		goto bad_fork_cancel_cgroup;
 	}
 
+	init_task_pid_links(p);
 	if (likely(p->pid)) {
 		ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
 
@@ -2036,13 +2049,13 @@ fork_out:
 	return ERR_PTR(retval);
 }
 
-static inline void init_idle_pids(struct pid_link *links)
+static inline void init_idle_pids(struct task_struct *idle)
 {
 	enum pid_type type;
 
 	for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
-		INIT_HLIST_NODE(&links[type].node); /* not really needed */
-		links[type].pid = &init_struct_pid;
+		INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */
+		init_task_pid(idle, type, &init_struct_pid);
 	}
 }
 
@@ -2052,7 +2065,7 @@ struct task_struct *fork_idle(int cpu)
 	task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0,
 			    cpu_to_node(cpu));
 	if (!IS_ERR(task)) {
-		init_idle_pids(task->pids);
+		init_idle_pids(task);
 		init_idle(task, cpu);
 	}
 
diff --git a/kernel/pid.c b/kernel/pid.c
index d0de2b59f86f..f8486d2e2346 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -265,27 +265,35 @@ struct pid *find_vpid(int nr)
 }
 EXPORT_SYMBOL_GPL(find_vpid);
 
+static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
+{
+	return (type == PIDTYPE_PID) ?
+		&task->thread_pid :
+		(type == __PIDTYPE_TGID) ?
+		&task->signal->leader_pid :
+		&task->signal->pids[type];
+}
+
 /*
  * attach_pid() must be called with the tasklist_lock write-held.
  */
 void attach_pid(struct task_struct *task, enum pid_type type)
 {
-	struct pid_link *link = &task->pids[type];
-	hlist_add_head_rcu(&link->node, &link->pid->tasks[type]);
+	struct pid *pid = *task_pid_ptr(task, type);
+	hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
 }
 
 static void __change_pid(struct task_struct *task, enum pid_type type,
 			struct pid *new)
 {
-	struct pid_link *link;
+	struct pid **pid_ptr = task_pid_ptr(task, type);
 	struct pid *pid;
 	int tmp;
 
-	link = &task->pids[type];
-	pid = link->pid;
+	pid = *pid_ptr;
 
-	hlist_del_rcu(&link->node);
-	link->pid = new;
+	hlist_del_rcu(&task->pid_links[type]);
+	*pid_ptr = new;
 
 	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
 		if (!hlist_empty(&pid->tasks[tmp]))
@@ -310,8 +318,9 @@ void change_pid(struct task_struct *task, enum pid_type type,
 void transfer_pid(struct task_struct *old, struct task_struct *new,
 			   enum pid_type type)
 {
-	new->pids[type].pid = old->pids[type].pid;
-	hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node);
+	if (type == PIDTYPE_PID)
+		new->thread_pid = old->thread_pid;
+	hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
 }
 
 struct task_struct *pid_task(struct pid *pid, enum pid_type type)
@@ -322,7 +331,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
 		first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
 					      lockdep_tasklist_lock_is_held());
 		if (first)
-			result = hlist_entry(first, struct task_struct, pids[(type)].node);
+			result = hlist_entry(first, struct task_struct, pid_links[(type)]);
 	}
 	return result;
 }
@@ -360,9 +369,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
 {
 	struct pid *pid;
 	rcu_read_lock();
-	if (type != PIDTYPE_PID)
-		task = task->group_leader;
-	pid = get_pid(rcu_dereference(task->pids[type].pid));
+	pid = get_pid(rcu_dereference(*task_pid_ptr(task, type)));
 	rcu_read_unlock();
 	return pid;
 }
@@ -420,16 +427,8 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
 	rcu_read_lock();
 	if (!ns)
 		ns = task_active_pid_ns(current);
-	if (likely(pid_alive(task))) {
-		struct pid *pid;
-		if (type == PIDTYPE_PID)
-			pid = task_pid(task);
-		else if (type == __PIDTYPE_TGID)
-			pid = task_tgid(task);
-		else
-			pid = rcu_dereference(task->group_leader->pids[type].pid);
-		nr = pid_nr_ns(pid, ns);
-	}
+	if (likely(pid_alive(task)))
+		nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
 	rcu_read_unlock();
 
 	return nr;
-- 
cgit 


From 6883f81aac6f44e7df70a6af189b3689ff52cbfb Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 4 Jun 2017 04:32:13 -0500
Subject: pid: Implement PIDTYPE_TGID

Everywhere except in the pid array we distinguish between a tasks pid and
a tasks tgid (thread group id).  Even in the enumeration we want that
distinction sometimes so we have added __PIDTYPE_TGID.  With leader_pid
we almost have an implementation of PIDTYPE_TGID in struct signal_struct.

Add PIDTYPE_TGID as a first class member of the pid_type enumeration and
into the pids array.  Then remove the __PIDTYPE_TGID special case and the
leader_pid in signal_struct.

The net size increase is just an extra pointer added to struct pid and
an extra pair of pointers of an hlist_node added to task_struct.

The effect on code maintenance is the removal of a number of special
cases today and the potential to remove many more special cases as
PIDTYPE_TGID gets used to it's fullest.  The long term potential
is allowing zombie thread group leaders to exit, which will remove
a lot more special cases in the code.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/events/core.c           | 2 +-
 kernel/exit.c                  | 1 +
 kernel/fork.c                  | 3 ++-
 kernel/pid.c                   | 2 --
 kernel/time/itimer.c           | 5 +++--
 kernel/time/posix-cpu-timers.c | 2 +-
 6 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 80cca2b30c4f..9025b1796ca8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1334,7 +1334,7 @@ static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
 
 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
 {
-	return perf_event_pid_type(event, p, __PIDTYPE_TGID);
+	return perf_event_pid_type(event, p, PIDTYPE_TGID);
 }
 
 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
diff --git a/kernel/exit.c b/kernel/exit.c
index 16432428fc6c..25582b442955 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -73,6 +73,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 	nr_threads--;
 	detach_pid(p, PIDTYPE_PID);
 	if (group_dead) {
+		detach_pid(p, PIDTYPE_TGID);
 		detach_pid(p, PIDTYPE_PGID);
 		detach_pid(p, PIDTYPE_SID);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index d2952162399b..cc5be0d01ce6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1946,6 +1946,7 @@ static __latent_entropy struct task_struct *copy_process(
 
 		init_task_pid(p, PIDTYPE_PID, pid);
 		if (thread_group_leader(p)) {
+			init_task_pid(p, PIDTYPE_TGID, pid);
 			init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
 			init_task_pid(p, PIDTYPE_SID, task_session(current));
 
@@ -1954,7 +1955,6 @@ static __latent_entropy struct task_struct *copy_process(
 				p->signal->flags |= SIGNAL_UNKILLABLE;
 			}
 
-			p->signal->leader_pid = pid;
 			p->signal->tty = tty_kref_get(current->signal->tty);
 			/*
 			 * Inherit has_child_subreaper flag under the same
@@ -1965,6 +1965,7 @@ static __latent_entropy struct task_struct *copy_process(
 							 p->real_parent->signal->is_child_subreaper;
 			list_add_tail(&p->sibling, &p->real_parent->children);
 			list_add_tail_rcu(&p->tasks, &init_task.tasks);
+			attach_pid(p, PIDTYPE_TGID);
 			attach_pid(p, PIDTYPE_PGID);
 			attach_pid(p, PIDTYPE_SID);
 			__this_cpu_inc(process_counts);
diff --git a/kernel/pid.c b/kernel/pid.c
index f8486d2e2346..de1cfc4f75a2 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -269,8 +269,6 @@ static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
 {
 	return (type == PIDTYPE_PID) ?
 		&task->thread_pid :
-		(type == __PIDTYPE_TGID) ?
-		&task->signal->leader_pid :
 		&task->signal->pids[type];
 }
 
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index f26acef5d7b4..9a65713c8309 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -139,9 +139,10 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
 {
 	struct signal_struct *sig =
 		container_of(timer, struct signal_struct, real_timer);
+	struct pid *leader_pid = sig->pids[PIDTYPE_TGID];
 
-	trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0);
-	kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid);
+	trace_itimer_expire(ITIMER_REAL, leader_pid, 0);
+	kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid);
 
 	return HRTIMER_NORESTART;
 }
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 5a6251ac6f7a..40e6fae46cec 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -895,7 +895,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
 
 		trace_itimer_expire(signo == SIGPROF ?
 				    ITIMER_PROF : ITIMER_VIRTUAL,
-				    tsk->signal->leader_pid, cur_time);
+				    task_tgid(tsk), cur_time);
 		__group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
 	}
 
-- 
cgit 


From 2118e1f53f6f0973a1d9a6a7dc9296959bf39ec0 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 21 Jul 2018 00:00:29 -0500
Subject: posix-timers: Noralize good_sigevent

In good_sigevent directly compute the default return value as
"task_tgid(current)".  This is exactly the same as
"task_pid(current->group_leader)" but written more clearly.

In the thread case first compute the thread's pid.  Then veify that
attached to that pid is a thread of the current thread group.

This has the net effect of making the code a little clearer, and
making it obvious that posix timers never look up a process by a the
pid of a thread.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/time/posix-timers.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index e08ce3f27447..2bdf08a2bae9 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -433,11 +433,13 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
 
 static struct pid *good_sigevent(sigevent_t * event)
 {
-	struct task_struct *rtn = current->group_leader;
+	struct pid *pid = task_tgid(current);
+	struct task_struct *rtn;
 
 	switch (event->sigev_notify) {
 	case SIGEV_SIGNAL | SIGEV_THREAD_ID:
-		rtn = find_task_by_vpid(event->sigev_notify_thread_id);
+		pid = find_vpid(event->sigev_notify_thread_id);
+		rtn = pid_task(pid, PIDTYPE_PID);
 		if (!rtn || !same_thread_group(rtn, current))
 			return NULL;
 		/* FALLTHRU */
@@ -447,7 +449,7 @@ static struct pid *good_sigevent(sigevent_t * event)
 			return NULL;
 		/* FALLTHRU */
 	case SIGEV_NONE:
-		return task_pid(rtn);
+		return pid;
 	default:
 		return NULL;
 	}
-- 
cgit 


From 24122c7f4969adeeaeca3fb1656a31569e9aa59b Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 20 Jul 2018 14:30:23 -0500
Subject: signal: Pass pid and pid type into send_sigqueue

Make the code more maintainable by performing more of the signal
related work in send_sigqueue.

A quick inspection of do_timer_create will show that this code path
does not lookup a thread group by a thread's pid.  Making it safe
to find the task pointed to by it_pid with "pid_task(it_pid, type)";

This supports the changes needed in fork to tell if a signal was sent
to a single process or a group of processes.

Having the pid to task transition in signal.c will also make it easier
to sort out races with de_thread and and the thread group leader
exiting when it comes time to address that.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/signal.c            | 14 +++++++++-----
 kernel/time/posix-timers.c | 13 ++++---------
 2 files changed, 13 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 8d8a940422a8..40feb14e276d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1664,17 +1664,20 @@ void sigqueue_free(struct sigqueue *q)
 		__sigqueue_free(q);
 }
 
-int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
+int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
 {
 	int sig = q->info.si_signo;
 	struct sigpending *pending;
+	struct task_struct *t;
 	unsigned long flags;
 	int ret, result;
 
 	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
 
 	ret = -1;
-	if (!likely(lock_task_sighand(t, &flags)))
+	rcu_read_lock();
+	t = pid_task(pid, type);
+	if (!t || !likely(lock_task_sighand(t, &flags)))
 		goto ret;
 
 	ret = 1; /* the signal is ignored */
@@ -1696,15 +1699,16 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
 	q->info.si_overrun = 0;
 
 	signalfd_notify(t, sig);
-	pending = group ? &t->signal->shared_pending : &t->pending;
+	pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending;
 	list_add_tail(&q->list, &pending->list);
 	sigaddset(&pending->signal, sig);
-	complete_signal(sig, t, group);
+	complete_signal(sig, t, type != PIDTYPE_PID);
 	result = TRACE_SIGNAL_DELIVERED;
 out:
-	trace_signal_generate(sig, &q->info, t, group, result);
+	trace_signal_generate(sig, &q->info, t, type != PIDTYPE_PID, result);
 	unlock_task_sighand(t, &flags);
 ret:
+	rcu_read_unlock();
 	return ret;
 }
 
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 2bdf08a2bae9..2d2e739fbc57 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -332,8 +332,8 @@ void posixtimer_rearm(struct siginfo *info)
 
 int posix_timer_event(struct k_itimer *timr, int si_private)
 {
-	struct task_struct *task;
-	int shared, ret = -1;
+	enum pid_type type;
+	int ret = -1;
 	/*
 	 * FIXME: if ->sigq is queued we can race with
 	 * dequeue_signal()->posixtimer_rearm().
@@ -347,13 +347,8 @@ int posix_timer_event(struct k_itimer *timr, int si_private)
 	 */
 	timr->sigq->info.si_sys_private = si_private;
 
-	rcu_read_lock();
-	task = pid_task(timr->it_pid, PIDTYPE_PID);
-	if (task) {
-		shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
-		ret = send_sigqueue(timr->sigq, task, shared);
-	}
-	rcu_read_unlock();
+	type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID;
+	ret = send_sigqueue(timr->sigq, timr->it_pid, type);
 	/* If we failed to send the signal the timer stops. */
 	return ret > 0;
 }
-- 
cgit 


From 0102498083d58d8b17759642c602b525215e1a54 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 13 Jul 2018 18:40:57 -0500
Subject: signal: Pass pid type into group_send_sig_info

This passes the information we already have at the call sight
into group_send_sig_info.  Ultimatelly allowing for to better handle
signals sent to a group of processes.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/exit.c   |  3 ++-
 kernel/signal.c | 10 ++++++----
 2 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 25582b442955..0e21e6d21f35 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -681,7 +681,8 @@ static void forget_original_parent(struct task_struct *father,
 				t->parent = t->real_parent;
 			if (t->pdeath_signal)
 				group_send_sig_info(t->pdeath_signal,
-						    SEND_SIG_NOINFO, t);
+						    SEND_SIG_NOINFO, t,
+						    PIDTYPE_TGID);
 		}
 		/*
 		 * If this is a threaded reparent there is no need to
diff --git a/kernel/signal.c b/kernel/signal.c
index 40feb14e276d..c7527338fe9d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1274,7 +1274,8 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
 /*
  * send signal info to all the members of a group
  */
-int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
+int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
+			enum pid_type type)
 {
 	int ret;
 
@@ -1301,7 +1302,7 @@ int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
 	success = 0;
 	retval = -ESRCH;
 	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
-		int err = group_send_sig_info(sig, info, p);
+		int err = group_send_sig_info(sig, info, p, PIDTYPE_PGID);
 		success |= !err;
 		retval = err;
 	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
@@ -1317,7 +1318,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
 		rcu_read_lock();
 		p = pid_task(pid, PIDTYPE_PID);
 		if (p)
-			error = group_send_sig_info(sig, info, p);
+			error = group_send_sig_info(sig, info, p, PIDTYPE_TGID);
 		rcu_read_unlock();
 		if (likely(!p || error != -ESRCH))
 			return error;
@@ -1420,7 +1421,8 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
 		for_each_process(p) {
 			if (task_pid_vnr(p) > 1 &&
 					!same_thread_group(p, current)) {
-				int err = group_send_sig_info(sig, info, p);
+				int err = group_send_sig_info(sig, info, p,
+							      PIDTYPE_MAX);
 				++count;
 				if (err != -EPERM)
 					retval = err;
-- 
cgit 


From 40b3b02535621027f56d248139e0e467573c3098 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 21 Jul 2018 10:45:15 -0500
Subject: signal: Pass pid type into do_send_sig_info

This passes the information we already have at the call sight into
do_send_sig_info.  Ultimately allowing for better handling of signals
sent to a group of processes during fork.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/signal.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index c7527338fe9d..2c09e6143dd8 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1161,13 +1161,13 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 }
 
 int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
-			bool group)
+			enum pid_type type)
 {
 	unsigned long flags;
 	int ret = -ESRCH;
 
 	if (lock_task_sighand(p, &flags)) {
-		ret = send_signal(sig, info, p, group);
+		ret = send_signal(sig, info, p, type != PIDTYPE_PID);
 		unlock_task_sighand(p, &flags);
 	}
 
@@ -1284,7 +1284,7 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
 	rcu_read_unlock();
 
 	if (!ret && sig)
-		ret = do_send_sig_info(sig, info, p, true);
+		ret = do_send_sig_info(sig, info, p, type);
 
 	return ret;
 }
@@ -1448,7 +1448,7 @@ int send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 	if (!valid_signal(sig))
 		return -EINVAL;
 
-	return do_send_sig_info(sig, info, p, false);
+	return do_send_sig_info(sig, info, p, PIDTYPE_PID);
 }
 
 #define __si_special(priv) \
@@ -3199,7 +3199,7 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
 		 * probe.  No signal is actually delivered.
 		 */
 		if (!error && sig) {
-			error = do_send_sig_info(sig, info, p, false);
+			error = do_send_sig_info(sig, info, p, PIDTYPE_PID);
 			/*
 			 * If lock_task_sighand() failed we pretend the task
 			 * dies after receiving the signal. The window is tiny,
-- 
cgit 


From b213984bd3987e6f5bf96b7847d151b7b42e1efd Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 20 Jul 2018 15:49:17 -0500
Subject: signal: Push pid type down into send_signal

This information is already available in the callers and by pushing it
down it makes the code a little clearer, and allows better group
signal behavior in fork.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/signal.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 2c09e6143dd8..8decc70c1dc2 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1103,7 +1103,7 @@ ret:
 }
 
 static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
-			int group)
+			enum pid_type type)
 {
 	int from_ancestor_ns = 0;
 
@@ -1112,7 +1112,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			   !task_pid_nr_ns(current, task_active_pid_ns(t));
 #endif
 
-	return __send_signal(sig, info, t, group, from_ancestor_ns);
+	return __send_signal(sig, info, t, type != PIDTYPE_PID, from_ancestor_ns);
 }
 
 static void print_fatal_signal(int signr)
@@ -1151,13 +1151,13 @@ __setup("print-fatal-signals=", setup_print_fatal_signals);
 int
 __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 {
-	return send_signal(sig, info, p, 1);
+	return send_signal(sig, info, p, PIDTYPE_TGID);
 }
 
 static int
 specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 {
-	return send_signal(sig, info, t, 0);
+	return send_signal(sig, info, t, PIDTYPE_PID);
 }
 
 int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
@@ -1167,7 +1167,7 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
 	int ret = -ESRCH;
 
 	if (lock_task_sighand(p, &flags)) {
-		ret = send_signal(sig, info, p, type != PIDTYPE_PID);
+		ret = send_signal(sig, info, p, type);
 		unlock_task_sighand(p, &flags);
 	}
 
@@ -3966,7 +3966,7 @@ void kdb_send_sig(struct task_struct *t, int sig)
 			   "the deadlock.\n");
 		return;
 	}
-	ret = send_signal(sig, SEND_SIG_PRIV, t, false);
+	ret = send_signal(sig, SEND_SIG_PRIV, t, PIDTYPE_PID);
 	spin_unlock(&t->sighand->siglock);
 	if (ret)
 		kdb_printf("Fail to deliver Signal %d to process %d.\n",
-- 
cgit 


From 5a883cee7442a5fd72e18eae895113fbd7b16110 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 13 Jul 2018 19:26:27 -0500
Subject: signal: Push pid type down into __send_signal

This information is already available in the callers and by pushing it
down it makes the code a little clearer, and allows implementing
better handling of signales set to a group of processes in fork.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/signal.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 8decc70c1dc2..1ef94303d87a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -998,7 +998,7 @@ static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_str
 #endif
 
 static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
-			int group, int from_ancestor_ns)
+			enum pid_type type, int from_ancestor_ns)
 {
 	struct sigpending *pending;
 	struct sigqueue *q;
@@ -1012,7 +1012,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			from_ancestor_ns || (info == SEND_SIG_FORCED)))
 		goto ret;
 
-	pending = group ? &t->signal->shared_pending : &t->pending;
+	pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending;
 	/*
 	 * Short-circuit ignored signals and support queuing
 	 * exactly one non-rt signal, so that we can get more
@@ -1096,9 +1096,9 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 out_set:
 	signalfd_notify(t, sig);
 	sigaddset(&pending->signal, sig);
-	complete_signal(sig, t, group);
+	complete_signal(sig, t, type != PIDTYPE_PID);
 ret:
-	trace_signal_generate(sig, info, t, group, result);
+	trace_signal_generate(sig, info, t, type != PIDTYPE_PID, result);
 	return ret;
 }
 
@@ -1112,7 +1112,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			   !task_pid_nr_ns(current, task_active_pid_ns(t));
 #endif
 
-	return __send_signal(sig, info, t, type != PIDTYPE_PID, from_ancestor_ns);
+	return __send_signal(sig, info, t, type, from_ancestor_ns);
 }
 
 static void print_fatal_signal(int signr)
@@ -1377,7 +1377,7 @@ int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid,
 
 	if (sig) {
 		if (lock_task_sighand(p, &flags)) {
-			ret = __send_signal(sig, info, p, 1, 0);
+			ret = __send_signal(sig, info, p, PIDTYPE_TGID, 0);
 			unlock_task_sighand(p, &flags);
 		} else
 			ret = -ESRCH;
-- 
cgit 


From 0729614992c946f6e8ccb9ef260aea1f06993df0 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 13 Jul 2018 21:39:13 -0500
Subject: signal: Push pid type down into complete_signal.

This is the bottom and by pushing this down it simplifies the callers
and otherwise leaves things as is.  This is in preparation for allowing
fork to implement better handling of signals set to groups of processes.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/signal.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 1ef94303d87a..dddbea558455 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -895,7 +895,7 @@ static inline int wants_signal(int sig, struct task_struct *p)
 	return task_curr(p) || !signal_pending(p);
 }
 
-static void complete_signal(int sig, struct task_struct *p, int group)
+static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
 {
 	struct signal_struct *signal = p->signal;
 	struct task_struct *t;
@@ -908,7 +908,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
 	 */
 	if (wants_signal(sig, p))
 		t = p;
-	else if (!group || thread_group_empty(p))
+	else if ((type == PIDTYPE_PID) || thread_group_empty(p))
 		/*
 		 * There is just one thread and it does not need to be woken.
 		 * It will dequeue unblocked signals before it runs again.
@@ -1096,7 +1096,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 out_set:
 	signalfd_notify(t, sig);
 	sigaddset(&pending->signal, sig);
-	complete_signal(sig, t, type != PIDTYPE_PID);
+	complete_signal(sig, t, type);
 ret:
 	trace_signal_generate(sig, info, t, type != PIDTYPE_PID, result);
 	return ret;
@@ -1704,7 +1704,7 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
 	pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending;
 	list_add_tail(&q->list, &pending->list);
 	sigaddset(&pending->signal, sig);
-	complete_signal(sig, t, type != PIDTYPE_PID);
+	complete_signal(sig, t, type);
 	result = TRACE_SIGNAL_DELIVERED;
 out:
 	trace_signal_generate(sig, &q->info, t, type != PIDTYPE_PID, result);
-- 
cgit 


From 4ca1d3ee46130e9b939c02a93e3970dad151fed6 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 13 Jul 2018 15:30:33 -0500
Subject: fork: Move and describe why the code examines PIDNS_ADDING

Normally this would be something that would be handled by handling
signals that are sent to a group of processes but in this case the
forking process is not a member of the group being signaled.  Thus
special code is needed to prevent a race with pid namespaces exiting,
and fork adding new processes within them.

Move this test up before the signal restart just in case signals are
also pending.  Fatal conditions should take presedence over restarts.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/fork.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index cc5be0d01ce6..b9c54318a292 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1922,6 +1922,12 @@ static __latent_entropy struct task_struct *copy_process(
 
 	rseq_fork(p, clone_flags);
 
+	/* Don't start children in a dying pid namespace */
+	if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
+		retval = -ENOMEM;
+		goto bad_fork_cancel_cgroup;
+	}
+
 	/*
 	 * Process group and session signals need to be delivered to just the
 	 * parent before the fork or both the parent and the child after the
@@ -1935,10 +1941,7 @@ static __latent_entropy struct task_struct *copy_process(
 		retval = -ERESTARTNOINTR;
 		goto bad_fork_cancel_cgroup;
 	}
-	if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
-		retval = -ENOMEM;
-		goto bad_fork_cancel_cgroup;
-	}
+
 
 	init_task_pid_links(p);
 	if (likely(p->pid)) {
-- 
cgit 


From 7673bf553b2732a00f7644fb2adadda69389ab37 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 23 Jul 2018 08:01:10 -0500
Subject: fork: Unconditionally exit if a fatal signal is pending

In practice this does not change anything as testing for fatal_signal_pending
and exiting for with an error code duplicates the work of the next clause
which recalculates pending signals and then exits fork if any are pending.
In both cases the pending signal will trigger the slow path when existing
to userspace, and the fatal signal will cause do_exit to be called.

The advantage of making this a separate test is that it makes it clear
processing the fatal signal will terminate the fork, and it allows the
rest of the signal logic to be updated without fear that this important
case will be lost.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/fork.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index b9c54318a292..22d4cdb9a7ca 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1928,6 +1928,12 @@ static __latent_entropy struct task_struct *copy_process(
 		goto bad_fork_cancel_cgroup;
 	}
 
+	/* Let kill terminate clone/fork in the middle */
+	if (fatal_signal_pending(current)) {
+		retval = -EINTR;
+		goto bad_fork_cancel_cgroup;
+	}
+
 	/*
 	 * Process group and session signals need to be delivered to just the
 	 * parent before the fork or both the parent and the child after the
-- 
cgit 


From 088fe47ce952542389e604e83f533811750aaf7c Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 23 Jul 2018 17:26:49 -0500
Subject: signal: Add calculate_sigpending()

Add a function calculate_sigpending to test to see if any signals are
pending for a new task immediately following fork.  Signals have to
happen either before or after fork.  Today our practice is to push
all of the signals to before the fork, but that has the downside that
frequent or periodic signals can make fork take much much longer than
normal or prevent fork from completing entirely.

So we need move signals that we can after the fork to prevent that.

This updates the code to set TIF_SIGPENDING on a new task if there
are signals or other activities that have moved so that they appear
to happen after the fork.

As the code today restarts if it sees any such activity this won't
immediately have an effect, as there will be no reason for it
to set TIF_SIGPENDING immediately after the fork.

Adding calculate_sigpending means the code in fork can safely be
changed to not always restart if a signal is pending.

The new calculate_sigpending function sets sigpending if there
are pending bits in jobctl, pending signals, the freezer needs
to freeze the new task or the live kernel patching framework
need the new thread to take the slow path to userspace.

I have verified that setting TIF_SIGPENDING does make a new process
take the slow path to userspace before it executes it's first userspace
instruction.

I have looked at the callers of signal_wake_up and the code paths
setting TIF_SIGPENDING and I don't see anything else that needs to be
handled.  The code probably doesn't need to set TIF_SIGPENDING for the
kernel live patching as it uses a separate thread flag as well.  But
at this point it seems safer reuse the recalc_sigpending logic and get
the kernel live patching folks to sort out their story later.

V2: I have moved the test into schedule_tail where siglock can
    be grabbed and recalc_sigpending can be reused directly.
    Further as the last action of setting up a new task this
    guarantees that TIF_SIGPENDING will be properly set in the
    new process.

    The helper calculate_sigpending takes the siglock and
    uncontitionally sets TIF_SIGPENDING and let's recalc_sigpending
    clear TIF_SIGPENDING if it is unnecessary.  This allows reusing
    the existing code and keeps maintenance of the conditions simple.

    Oleg Nesterov <oleg@redhat.com>  suggested the movement
    and pointed out the need to take siglock if this code
    was going to be called while the new task is discoverable.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/sched/core.c |  2 ++
 kernel/signal.c     | 11 +++++++++++
 2 files changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 78d8facba456..3e4ed4b7aa2d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2813,6 +2813,8 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 
 	if (current->set_child_tid)
 		put_user(task_pid_vnr(current), current->set_child_tid);
+
+	calculate_sigpending();
 }
 
 /*
diff --git a/kernel/signal.c b/kernel/signal.c
index dddbea558455..1e06f1eba363 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -172,6 +172,17 @@ void recalc_sigpending(void)
 
 }
 
+void calculate_sigpending(void)
+{
+	/* Have any signals or users of TIF_SIGPENDING been delayed
+	 * until after fork?
+	 */
+	spin_lock_irq(&current->sighand->siglock);
+	set_tsk_thread_flag(current, TIF_SIGPENDING);
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+}
+
 /* Given the mask, find the first available signal that should be serviced. */
 
 #define SYNCHRONOUS_MASK \
-- 
cgit 


From 924de3b8c9410c404c6eda7abffd282b97b3ff7f Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 23 Jul 2018 13:38:00 -0500
Subject: fork: Have new threads join on-going signal group stops

There are only two signals that are delivered to every member of a
signal group: SIGSTOP and SIGKILL.  Signal delivery requires every
signal appear to be delivered either before or after a clone syscall.
SIGKILL terminates the clone so does not need to be considered.  Which
leaves only SIGSTOP that needs to be considered when creating new
threads.

Today in the event of a group stop TIF_SIGPENDING will get set and the
fork will restart ensuring the fork syscall participates in the group
stop.

A fork (especially of a process with a lot of memory) is one of the
most expensive system so we really only want to restart a fork when
necessary.

It is easy so check to see if a SIGSTOP is ongoing and have the new
thread join it immediate after the clone completes.  Making it appear
the clone completed happened just before the SIGSTOP.

The calculate_sigpending function will see the bits set in jobctl and
set TIF_SIGPENDING to ensure the new task takes the slow path to userspace.

V2: The call to task_join_group_stop was moved before the new task is
    added to the thread group list.  This should not matter as
    sighand->siglock is held over both the addition of the threads,
    the call to task_join_group_stop and do_signal_stop.  But the change
    is trivial and it is one less thing to worry about when reading
    the code.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/fork.c   | 27 +++++++++++++++------------
 kernel/signal.c | 14 ++++++++++++++
 2 files changed, 29 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 22d4cdb9a7ca..ab731e15a600 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1934,18 +1934,20 @@ static __latent_entropy struct task_struct *copy_process(
 		goto bad_fork_cancel_cgroup;
 	}
 
-	/*
-	 * Process group and session signals need to be delivered to just the
-	 * parent before the fork or both the parent and the child after the
-	 * fork. Restart if a signal comes in before we add the new process to
-	 * it's process group.
-	 * A fatal signal pending means that current will exit, so the new
-	 * thread can't slip out of an OOM kill (or normal SIGKILL).
-	*/
-	recalc_sigpending();
-	if (signal_pending(current)) {
-		retval = -ERESTARTNOINTR;
-		goto bad_fork_cancel_cgroup;
+	if (!(clone_flags & CLONE_THREAD)) {
+		/*
+		 * Process group and session signals need to be delivered to just the
+		 * parent before the fork or both the parent and the child after the
+		 * fork. Restart if a signal comes in before we add the new process to
+		 * it's process group.
+		 * A fatal signal pending means that current will exit, so the new
+		 * thread can't slip out of an OOM kill (or normal SIGKILL).
+		 */
+		recalc_sigpending();
+		if (signal_pending(current)) {
+			retval = -ERESTARTNOINTR;
+			goto bad_fork_cancel_cgroup;
+		}
 	}
 
 
@@ -1982,6 +1984,7 @@ static __latent_entropy struct task_struct *copy_process(
 			current->signal->nr_threads++;
 			atomic_inc(&current->signal->live);
 			atomic_inc(&current->signal->sigcnt);
+			task_join_group_stop(p);
 			list_add_tail_rcu(&p->thread_group,
 					  &p->group_leader->thread_group);
 			list_add_tail_rcu(&p->thread_node,
diff --git a/kernel/signal.c b/kernel/signal.c
index 1e06f1eba363..9f0eafb6d474 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -373,6 +373,20 @@ static bool task_participate_group_stop(struct task_struct *task)
 	return false;
 }
 
+void task_join_group_stop(struct task_struct *task)
+{
+	/* Have the new thread join an on-going signal group stop */
+	unsigned long jobctl = current->jobctl;
+	if (jobctl & JOBCTL_STOP_PENDING) {
+		struct signal_struct *sig = current->signal;
+		unsigned long signr = jobctl & JOBCTL_STOP_SIGMASK;
+		unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
+		if (task_set_jobctl_pending(task, signr | gstop)) {
+			sig->group_stop_count++;
+		}
+	}
+}
+
 /*
  * allocate a new signal queue record
  * - this may be called without locks if and only if t == current, otherwise an
-- 
cgit 


From c3ad2c3b02e953ead2b8d52a0c9e70312930c3d0 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 23 Jul 2018 15:20:37 -0500
Subject: signal: Don't restart fork when signals come in.

Wen Yang <wen.yang99@zte.com.cn> and majiang <ma.jiang@zte.com.cn>
report that a periodic signal received during fork can cause fork to
continually restart preventing an application from making progress.

The code was being overly pessimistic.  Fork needs to guarantee that a
signal sent to multiple processes is logically delivered before the
fork and just to the forking process or logically delivered after the
fork to both the forking process and it's newly spawned child.  For
signals like periodic timers that are always delivered to a single
process fork can safely complete and let them appear to logically
delivered after the fork().

While examining this issue I also discovered that fork today will miss
signals delivered to multiple processes during the fork and handled by
another thread.  Similarly the current code will also miss blocked
signals that are delivered to multiple process, as those signals will
not appear pending during fork.

Add a list of each thread that is currently forking, and keep on that
list a signal set that records all of the signals sent to multiple
processes.  When fork completes initialize the new processes
shared_pending signal set with it.  The calculate_sigpending function
will see those signals and set TIF_SIGPENDING causing the new task to
take the slow path to userspace to handle those signals.  Making it
appear as if those signals were received immediately after the fork.

It is not possible to send real time signals to multiple processes and
exceptions don't go to multiple processes, which means that that are
no signals sent to multiple processes that require siginfo.  This
means it is safe to not bother collecting siginfo on signals sent
during fork.

The sigaction of a child of fork is initially the same as the
sigaction of the parent process.  So a signal the parent ignores the
child will also initially ignore.  Therefore it is safe to ignore
signals sent to multiple processes and ignored by the forking process.

Signals sent to only a single process or only a single thread and delivered
during fork are treated as if they are received after the fork, and generally
not dealt with.  They won't cause any problems.

V2: Added removal from the multiprocess list on failure.
V3: Use -ERESTARTNOINTR directly
V4: - Don't queue both SIGCONT and SIGSTOP
    - Initialize signal_struct.multiprocess in init_task
    - Move setting of shared_pending to before the new task
      is visible to signals.  This prevents signals from comming
      in before shared_pending.signal is set to delayed.signal
      and being lost.
V5: - rework list add and delete to account for idle threads
v6: - Use sigdelsetmask when removing stop signals

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=200447
Reported-by: Wen Yang <wen.yang99@zte.com.cn> and
Reported-by: majiang <ma.jiang@zte.com.cn>
Fixes: 4a2c7a7837da ("[PATCH] make fork() atomic wrt pgrp/session signals")
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/fork.c   | 43 +++++++++++++++++++++++++------------------
 kernel/signal.c | 15 +++++++++++++++
 2 files changed, 40 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index ab731e15a600..411e34acace7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1456,6 +1456,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	init_waitqueue_head(&sig->wait_chldexit);
 	sig->curr_target = tsk;
 	init_sigpending(&sig->shared_pending);
+	INIT_HLIST_HEAD(&sig->multiprocess);
 	seqlock_init(&sig->stats_lock);
 	prev_cputime_init(&sig->prev_cputime);
 
@@ -1602,6 +1603,7 @@ static __latent_entropy struct task_struct *copy_process(
 {
 	int retval;
 	struct task_struct *p;
+	struct multiprocess_signals delayed;
 
 	/*
 	 * Don't allow sharing the root directory with processes in a different
@@ -1649,6 +1651,24 @@ static __latent_entropy struct task_struct *copy_process(
 			return ERR_PTR(-EINVAL);
 	}
 
+	/*
+	 * Force any signals received before this point to be delivered
+	 * before the fork happens.  Collect up signals sent to multiple
+	 * processes that happen during the fork and delay them so that
+	 * they appear to happen after the fork.
+	 */
+	sigemptyset(&delayed.signal);
+	INIT_HLIST_NODE(&delayed.node);
+
+	spin_lock_irq(&current->sighand->siglock);
+	if (!(clone_flags & CLONE_THREAD))
+		hlist_add_head(&delayed.node, &current->signal->multiprocess);
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+	retval = -ERESTARTNOINTR;
+	if (signal_pending(current))
+		goto fork_out;
+
 	retval = -ENOMEM;
 	p = dup_task_struct(current, node);
 	if (!p)
@@ -1934,22 +1954,6 @@ static __latent_entropy struct task_struct *copy_process(
 		goto bad_fork_cancel_cgroup;
 	}
 
-	if (!(clone_flags & CLONE_THREAD)) {
-		/*
-		 * Process group and session signals need to be delivered to just the
-		 * parent before the fork or both the parent and the child after the
-		 * fork. Restart if a signal comes in before we add the new process to
-		 * it's process group.
-		 * A fatal signal pending means that current will exit, so the new
-		 * thread can't slip out of an OOM kill (or normal SIGKILL).
-		 */
-		recalc_sigpending();
-		if (signal_pending(current)) {
-			retval = -ERESTARTNOINTR;
-			goto bad_fork_cancel_cgroup;
-		}
-	}
-
 
 	init_task_pid_links(p);
 	if (likely(p->pid)) {
@@ -1965,7 +1969,7 @@ static __latent_entropy struct task_struct *copy_process(
 				ns_of_pid(pid)->child_reaper = p;
 				p->signal->flags |= SIGNAL_UNKILLABLE;
 			}
-
+			p->signal->shared_pending.signal = delayed.signal;
 			p->signal->tty = tty_kref_get(current->signal->tty);
 			/*
 			 * Inherit has_child_subreaper flag under the same
@@ -1993,8 +1997,8 @@ static __latent_entropy struct task_struct *copy_process(
 		attach_pid(p, PIDTYPE_PID);
 		nr_threads++;
 	}
-
 	total_forks++;
+	hlist_del_init(&delayed.node);
 	spin_unlock(&current->sighand->siglock);
 	syscall_tracepoint_update(p);
 	write_unlock_irq(&tasklist_lock);
@@ -2059,6 +2063,9 @@ bad_fork_free:
 	put_task_stack(p);
 	free_task(p);
 fork_out:
+	spin_lock_irq(&current->sighand->siglock);
+	hlist_del_init(&delayed.node);
+	spin_unlock_irq(&current->sighand->siglock);
 	return ERR_PTR(retval);
 }
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 9f0eafb6d474..cfa9d10e731a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1121,6 +1121,21 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 out_set:
 	signalfd_notify(t, sig);
 	sigaddset(&pending->signal, sig);
+
+	/* Let multiprocess signals appear after on-going forks */
+	if (type > PIDTYPE_TGID) {
+		struct multiprocess_signals *delayed;
+		hlist_for_each_entry(delayed, &t->signal->multiprocess, node) {
+			sigset_t *signal = &delayed->signal;
+			/* Can't queue both a stop and a continue signal */
+			if (sig == SIGCONT)
+				sigdelsetmask(signal, SIG_KERNEL_STOP_MASK);
+			else if (sig_kernel_stop(sig))
+				sigdelset(signal, SIGCONT);
+			sigaddset(signal, sig);
+		}
+	}
+
 	complete_signal(sig, t, type);
 ret:
 	trace_signal_generate(sig, info, t, type != PIDTYPE_PID, result);
-- 
cgit