diff options
Diffstat (limited to 'kernel/exit.c')
| -rw-r--r-- | kernel/exit.c | 1780 |
1 files changed, 1053 insertions, 727 deletions
diff --git a/kernel/exit.c b/kernel/exit.c index a949819055d5..8a87021211ae 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/exit.c * @@ -6,6 +7,12 @@ #include <linux/mm.h> #include <linux/slab.h> +#include <linux/sched/autogroup.h> +#include <linux/sched/mm.h> +#include <linux/sched/stat.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> +#include <linux/sched/cputime.h> #include <linux/interrupt.h> #include <linux/module.h> #include <linux/capability.h> @@ -14,12 +21,10 @@ #include <linux/tty.h> #include <linux/iocontext.h> #include <linux/key.h> -#include <linux/security.h> #include <linux/cpu.h> #include <linux/acct.h> #include <linux/tsacct_kern.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/freezer.h> #include <linux/binfmts.h> #include <linux/nsproxy.h> @@ -42,9 +47,9 @@ #include <linux/pipe_fs_i.h> #include <linux/audit.h> /* for audit_free() */ #include <linux/resource.h> -#include <linux/blkdev.h> #include <linux/task_io_accounting_ops.h> -#include <linux/tracehook.h> +#include <linux/blkdev.h> +#include <linux/task_work.h> #include <linux/fs_struct.h> #include <linux/init_task.h> #include <linux/perf_event.h> @@ -53,136 +58,211 @@ #include <linux/oom.h> #include <linux/writeback.h> #include <linux/shm.h> +#include <linux/kcov.h> +#include <linux/kmsan.h> +#include <linux/random.h> +#include <linux/rcuwait.h> +#include <linux/compat.h> +#include <linux/io_uring.h> +#include <linux/kprobes.h> +#include <linux/rethook.h> +#include <linux/sysfs.h> +#include <linux/user_events.h> +#include <linux/unwind_deferred.h> +#include <linux/uaccess.h> +#include <linux/pidfs.h> + +#include <uapi/linux/wait.h> -#include <asm/uaccess.h> #include <asm/unistd.h> -#include <asm/pgtable.h> #include <asm/mmu_context.h> -static void exit_mm(struct task_struct * tsk); +#include "exit.h" -static void __unhash_process(struct task_struct *p, bool group_dead) +/* + * The default value should be high enough to not crash a system that randomly + * crashes its kernel from time to time, but low enough to at least not permit + * overflowing 32-bit refcounts or the ldsem writer count. + */ +static unsigned int oops_limit = 10000; + +#ifdef CONFIG_SYSCTL +static const struct ctl_table kern_exit_table[] = { + { + .procname = "oops_limit", + .data = &oops_limit, + .maxlen = sizeof(oops_limit), + .mode = 0644, + .proc_handler = proc_douintvec, + }, +}; + +static __init int kernel_exit_sysctls_init(void) { + register_sysctl_init("kernel", kern_exit_table); + return 0; +} +late_initcall(kernel_exit_sysctls_init); +#endif + +static atomic_t oops_count = ATOMIC_INIT(0); + +#ifdef CONFIG_SYSFS +static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%d\n", atomic_read(&oops_count)); +} + +static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count); + +static __init int kernel_exit_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL); + return 0; +} +late_initcall(kernel_exit_sysfs_init); +#endif + +/* + * For things release_task() would like to do *after* tasklist_lock is released. + */ +struct release_task_post { + struct pid *pids[PIDTYPE_MAX]; +}; + +static void __unhash_process(struct release_task_post *post, struct task_struct *p, + bool group_dead) +{ + struct pid *pid = task_pid(p); + nr_threads--; - detach_pid(p, PIDTYPE_PID); + + detach_pid(post->pids, p, PIDTYPE_PID); + wake_up_all(&pid->wait_pidfd); + if (group_dead) { - detach_pid(p, PIDTYPE_PGID); - detach_pid(p, PIDTYPE_SID); + detach_pid(post->pids, p, PIDTYPE_TGID); + detach_pid(post->pids, p, PIDTYPE_PGID); + detach_pid(post->pids, p, PIDTYPE_SID); list_del_rcu(&p->tasks); list_del_init(&p->sibling); __this_cpu_dec(process_counts); } - list_del_rcu(&p->thread_group); + list_del_rcu(&p->thread_node); } /* * This function expects the tasklist_lock write-locked. */ -static void __exit_signal(struct task_struct *tsk) +static void __exit_signal(struct release_task_post *post, struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; - struct tty_struct *uninitialized_var(tty); - cputime_t utime, stime; + struct tty_struct *tty; + u64 utime, stime; sighand = rcu_dereference_check(tsk->sighand, lockdep_tasklist_lock_is_held()); spin_lock(&sighand->siglock); +#ifdef CONFIG_POSIX_TIMERS posix_cpu_timers_exit(tsk); - if (group_dead) { + if (group_dead) posix_cpu_timers_exit_group(tsk); +#endif + + if (group_dead) { tty = sig->tty; sig->tty = NULL; } else { /* - * This can only happen if the caller is de_thread(). - * FIXME: this is the temporary hack, we should teach - * posix-cpu-timers to handle this case correctly. - */ - if (unlikely(has_group_leader_pid(tsk))) - posix_cpu_timers_exit_group(tsk); - - /* * If there is any task waiting for the group exit * then notify it: */ if (sig->notify_count > 0 && !--sig->notify_count) - wake_up_process(sig->group_exit_task); + wake_up_process(sig->group_exec_task); if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); - /* - * Accumulate here the counters for all threads but the - * group leader as they die, so they can be added into - * the process-wide totals when those are taken. - * The group leader stays around as a zombie as long - * as there are other threads. When it gets reaped, - * the exit.c code will add its counts into these totals. - * We won't ever get here for the group leader, since it - * will have been the last reference on the signal_struct. - */ - task_cputime(tsk, &utime, &stime); - sig->utime += utime; - sig->stime += stime; - sig->gtime += task_gtime(tsk); - sig->min_flt += tsk->min_flt; - sig->maj_flt += tsk->maj_flt; - sig->nvcsw += tsk->nvcsw; - sig->nivcsw += tsk->nivcsw; - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); - sig->sum_sched_runtime += tsk->se.sum_exec_runtime; } - sig->nr_threads--; - __unhash_process(tsk, group_dead); - /* - * Do this under ->siglock, we can race with another thread - * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. + * Accumulate here the counters for all threads as they die. We could + * skip the group leader because it is the last user of signal_struct, + * but we want to avoid the race with thread_group_cputime() which can + * see the empty ->thread_head list. */ - flush_sigqueue(&tsk->pending); + task_cputime(tsk, &utime, &stime); + write_seqlock(&sig->stats_lock); + sig->utime += utime; + sig->stime += stime; + sig->gtime += task_gtime(tsk); + sig->min_flt += tsk->min_flt; + sig->maj_flt += tsk->maj_flt; + sig->nvcsw += tsk->nvcsw; + sig->nivcsw += tsk->nivcsw; + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); + sig->sum_sched_runtime += tsk->se.sum_exec_runtime; + sig->nr_threads--; + __unhash_process(post, tsk, group_dead); + write_sequnlock(&sig->stats_lock); + tsk->sighand = NULL; spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); - clear_tsk_thread_flag(tsk,TIF_SIGPENDING); - if (group_dead) { - flush_sigqueue(&sig->shared_pending); + if (group_dead) tty_kref_put(tty); - } } static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + kprobe_flush_task(tsk); + rethook_flush_task(tsk); perf_event_delayed_put(tsk); trace_sched_process_free(tsk); put_task_struct(tsk); } +void put_task_struct_rcu_user(struct task_struct *task) +{ + if (refcount_dec_and_test(&task->rcu_users)) + call_rcu(&task->rcu, delayed_put_task_struct); +} + +void __weak release_thread(struct task_struct *dead_task) +{ +} -void release_task(struct task_struct * p) +void release_task(struct task_struct *p) { + struct release_task_post post; struct task_struct *leader; + struct pid *thread_pid; int zap_leader; repeat: + memset(&post, 0, sizeof(post)); + /* don't need to get the RCU readlock here - the process is dead and - * can't be modifying its own credentials. But shut RCU-lockdep up */ - rcu_read_lock(); - atomic_dec(&__task_cred(p)->user->processes); - rcu_read_unlock(); + * can't be modifying its own credentials. */ + dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); + + pidfs_exit(p); + cgroup_task_release(p); - proc_flush_task(p); + /* Retrieve @thread_pid before __unhash_process() may set it to NULL. */ + thread_pid = task_pid(p); write_lock_irq(&tasklist_lock); ptrace_release_task(p); - __exit_signal(p); + __exit_signal(&post, p); /* * If we are the last non-leader member of the thread @@ -191,7 +271,11 @@ repeat: */ zap_leader = 0; leader = p->group_leader; - if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { + if (leader != p && thread_group_empty(leader) + && leader->exit_state == EXIT_ZOMBIE) { + /* for pidfs_exit() and do_notify_parent() */ + if (leader->signal->flags & SIGNAL_GROUP_EXIT) + leader->exit_code = leader->signal->group_exit_code; /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, @@ -203,34 +287,58 @@ repeat: } write_unlock_irq(&tasklist_lock); + /* @thread_pid can't go away until free_pids() below */ + proc_flush_pid(thread_pid); + exit_cred_namespaces(p); + add_device_randomness(&p->se.sum_exec_runtime, + sizeof(p->se.sum_exec_runtime)); + free_pids(post.pids); release_thread(p); - call_rcu(&p->rcu, delayed_put_task_struct); + /* + * This task was already removed from the process/thread/pid lists + * and lock_task_sighand(p) can't succeed. Nobody else can touch + * ->pending or, if group dead, signal->shared_pending. We can call + * flush_sigqueue() lockless. + */ + flush_sigqueue(&p->pending); + if (thread_group_leader(p)) + flush_sigqueue(&p->signal->shared_pending); + + put_task_struct_rcu_user(p); p = leader; if (unlikely(zap_leader)) goto repeat; } -/* - * This checks not only the pgrp, but falls back on the pid if no - * satisfactory pgrp is found. I dunno - gdb doesn't work correctly - * without this... - * - * The caller must hold rcu lock or the tasklist lock. - */ -struct pid *session_of_pgrp(struct pid *pgrp) +int rcuwait_wake_up(struct rcuwait *w) { - struct task_struct *p; - struct pid *sid = NULL; + int ret = 0; + struct task_struct *task; + + rcu_read_lock(); - p = pid_task(pgrp, PIDTYPE_PGID); - if (p == NULL) - p = pid_task(pgrp, PIDTYPE_PID); - if (p != NULL) - sid = task_session(p); + /* + * Order condition vs @task, such that everything prior to the load + * of @task is visible. This is the condition as to why the user called + * rcuwait_wake() in the first place. Pairs with set_current_state() + * barrier (A) in rcuwait_wait_event(). + * + * WAIT WAKE + * [S] tsk = current [S] cond = true + * MB (A) MB (B) + * [L] cond [L] tsk + */ + smp_mb(); /* (B) */ - return sid; + task = rcu_dereference(w->task); + if (task) + ret = wake_up_process(task); + rcu_read_unlock(); + + return ret; } +EXPORT_SYMBOL_GPL(rcuwait_wake_up); /* * Determine if a process group is "orphaned", according to the POSIX @@ -240,7 +348,8 @@ struct pid *session_of_pgrp(struct pid *pgrp) * * "I ask you, have you ever known what it is to be an orphan?" */ -static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) +static int will_become_orphaned_pgrp(struct pid *pgrp, + struct task_struct *ignored_task) { struct task_struct *p; @@ -293,9 +402,9 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) struct task_struct *ignored_task = tsk; if (!parent) - /* exit: our father is in a different pgrp than - * we are and we were the only connection outside. - */ + /* exit: our father is in a different pgrp than + * we are and we were the only connection outside. + */ parent = tsk->real_parent; else /* reparent: our child is in a different pgrp than @@ -312,54 +421,73 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) } } -/* - * Let kernel threads use this to say that they allow a certain signal. - * Must not be used if kthread was cloned with CLONE_SIGHAND. - */ -int allow_signal(int sig) +static void coredump_task_exit(struct task_struct *tsk, + struct core_state *core_state) { - if (!valid_signal(sig) || sig < 1) - return -EINVAL; + struct core_thread self; - spin_lock_irq(¤t->sighand->siglock); - /* This is only needed for daemonize()'ed kthreads */ - sigdelset(¤t->blocked, sig); + self.task = tsk; + if (self.task->flags & PF_SIGNALED) + self.next = xchg(&core_state->dumper.next, &self); + else + self.task = NULL; /* - * Kernel threads handle their own signals. Let the signal code - * know it'll be handled, so that they don't get converted to - * SIGKILL or just silently dropped. + * Implies mb(), the result of xchg() must be visible + * to core_state->dumper. */ - current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - return 0; -} + if (atomic_dec_and_test(&core_state->nr_threads)) + complete(&core_state->startup); -EXPORT_SYMBOL(allow_signal); + for (;;) { + set_current_state(TASK_IDLE|TASK_FREEZABLE); + if (!self.task) /* see coredump_finish() */ + break; + schedule(); + } + __set_current_state(TASK_RUNNING); +} -int disallow_signal(int sig) +#ifdef CONFIG_MEMCG +/* drops tasklist_lock if succeeds */ +static bool __try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm) { - if (!valid_signal(sig) || sig < 1) - return -EINVAL; + bool ret = false; - spin_lock_irq(¤t->sighand->siglock); - current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - return 0; + task_lock(tsk); + if (likely(tsk->mm == mm)) { + /* tsk can't pass exit_mm/exec_mmap and exit */ + read_unlock(&tasklist_lock); + WRITE_ONCE(mm->owner, tsk); + lru_gen_migrate_mm(mm); + ret = true; + } + task_unlock(tsk); + return ret; } -EXPORT_SYMBOL(disallow_signal); +static bool try_to_set_owner(struct task_struct *g, struct mm_struct *mm) +{ + struct task_struct *t; + + for_each_thread(g, t) { + struct mm_struct *t_mm = READ_ONCE(t->mm); + if (t_mm == mm) { + if (__try_to_set_owner(t, mm)) + return true; + } else if (t_mm) + break; + } + + return false; +} -#ifdef CONFIG_MM_OWNER /* * A task is exiting. If it owned this mm, find a new owner for the mm. */ void mm_update_next_owner(struct mm_struct *mm) { - struct task_struct *c, *g, *p = current; + struct task_struct *g, *p = current; -retry: /* * If the exiting or execing task is not the owner, it's * someone else's problem. @@ -372,7 +500,7 @@ retry: * freed task structure. */ if (atomic_read(&mm->mm_users) <= 1) { - mm->owner = NULL; + WRITE_ONCE(mm->owner, NULL); return; } @@ -380,115 +508,121 @@ retry: /* * Search in the children */ - list_for_each_entry(c, &p->children, sibling) { - if (c->mm == mm) - goto assign_new_owner; + list_for_each_entry(g, &p->children, sibling) { + if (try_to_set_owner(g, mm)) + goto ret; } - /* * Search in the siblings */ - list_for_each_entry(c, &p->real_parent->children, sibling) { - if (c->mm == mm) - goto assign_new_owner; + list_for_each_entry(g, &p->real_parent->children, sibling) { + if (try_to_set_owner(g, mm)) + goto ret; } - /* - * Search through everything else. We should not get - * here often + * Search through everything else, we should not get here often. */ - do_each_thread(g, c) { - if (c->mm == mm) - goto assign_new_owner; - } while_each_thread(g, c); - + for_each_process(g) { + if (atomic_read(&mm->mm_users) <= 1) + break; + if (g->flags & PF_KTHREAD) + continue; + if (try_to_set_owner(g, mm)) + goto ret; + } read_unlock(&tasklist_lock); /* * We found no owner yet mm_users > 1: this implies that we are * most likely racing with swapoff (try_to_unuse()) or /proc or * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ - mm->owner = NULL; + WRITE_ONCE(mm->owner, NULL); + ret: return; -assign_new_owner: - BUG_ON(c == p); - get_task_struct(c); - /* - * The task_lock protects c->mm from changing. - * We always want mm->owner->mm == mm - */ - task_lock(c); - /* - * Delay read_unlock() till we have the task_lock() - * to ensure that c does not slip away underneath us - */ - read_unlock(&tasklist_lock); - if (c->mm != mm) { - task_unlock(c); - put_task_struct(c); - goto retry; - } - mm->owner = c; - task_unlock(c); - put_task_struct(c); } -#endif /* CONFIG_MM_OWNER */ +#endif /* CONFIG_MEMCG */ /* * Turn us into a lazy TLB process if we * aren't already.. */ -static void exit_mm(struct task_struct * tsk) +static void exit_mm(void) { - struct mm_struct *mm = tsk->mm; - struct core_state *core_state; + struct mm_struct *mm = current->mm; - mm_release(tsk, mm); + exit_mm_release(current, mm); if (!mm) return; - sync_mm_rss(mm); + mmap_read_lock(mm); + mmgrab_lazy_tlb(mm); + BUG_ON(mm != current->active_mm); + /* more a memory barrier than a real lock */ + task_lock(current); /* - * Serialize with any possible pending coredump. - * We must hold mmap_sem around checking core_state - * and clearing tsk->mm. The core-inducing thread - * will increment ->nr_threads for each thread in the - * group with ->mm != NULL. + * When a thread stops operating on an address space, the loop + * in membarrier_private_expedited() may not observe that + * tsk->mm, and the loop in membarrier_global_expedited() may + * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED + * rq->membarrier_state, so those would not issue an IPI. + * Membarrier requires a memory barrier after accessing + * user-space memory, before clearing tsk->mm or the + * rq->membarrier_state. */ - down_read(&mm->mmap_sem); - core_state = mm->core_state; - if (core_state) { - struct core_thread self; - up_read(&mm->mmap_sem); - - self.task = tsk; - self.next = xchg(&core_state->dumper.next, &self); - /* - * Implies mb(), the result of xchg() must be visible - * to core_state->dumper. - */ - if (atomic_dec_and_test(&core_state->nr_threads)) - complete(&core_state->startup); - - for (;;) { - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (!self.task) /* see coredump_finish() */ - break; - freezable_schedule(); - } - __set_task_state(tsk, TASK_RUNNING); - down_read(&mm->mmap_sem); - } - atomic_inc(&mm->mm_count); - BUG_ON(mm != tsk->active_mm); - /* more a memory barrier than a real lock */ - task_lock(tsk); - tsk->mm = NULL; - up_read(&mm->mmap_sem); + smp_mb__after_spinlock(); + local_irq_disable(); + current->mm = NULL; + membarrier_update_current_mm(NULL); enter_lazy_tlb(mm, current); - task_unlock(tsk); + local_irq_enable(); + task_unlock(current); + mmap_read_unlock(mm); mm_update_next_owner(mm); mmput(mm); + if (test_thread_flag(TIF_MEMDIE)) + exit_oom_victim(); +} + +static struct task_struct *find_alive_thread(struct task_struct *p) +{ + struct task_struct *t; + + for_each_thread(p, t) { + if (!(t->flags & PF_EXITING)) + return t; + } + return NULL; +} + +static struct task_struct *find_child_reaper(struct task_struct *father, + struct list_head *dead) + __releases(&tasklist_lock) + __acquires(&tasklist_lock) +{ + struct pid_namespace *pid_ns = task_active_pid_ns(father); + struct task_struct *reaper = pid_ns->child_reaper; + struct task_struct *p, *n; + + if (likely(reaper != father)) + return reaper; + + reaper = find_alive_thread(father); + if (reaper) { + pid_ns->child_reaper = reaper; + return reaper; + } + + write_unlock_irq(&tasklist_lock); + + list_for_each_entry_safe(p, n, dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); + } + + zap_pid_ns_processes(pid_ns); + write_lock_irq(&tasklist_lock); + + return father; } /* @@ -498,58 +632,39 @@ static void exit_mm(struct task_struct * tsk) * child_subreaper for its children (like a service manager) * 3. give it to the init process (PID 1) in our pid namespace */ -static struct task_struct *find_new_reaper(struct task_struct *father) - __releases(&tasklist_lock) - __acquires(&tasklist_lock) +static struct task_struct *find_new_reaper(struct task_struct *father, + struct task_struct *child_reaper) { - struct pid_namespace *pid_ns = task_active_pid_ns(father); - struct task_struct *thread; + struct task_struct *thread, *reaper; - thread = father; - while_each_thread(father, thread) { - if (thread->flags & PF_EXITING) - continue; - if (unlikely(pid_ns->child_reaper == father)) - pid_ns->child_reaper = thread; + thread = find_alive_thread(father); + if (thread) return thread; - } - - if (unlikely(pid_ns->child_reaper == father)) { - write_unlock_irq(&tasklist_lock); - if (unlikely(pid_ns == &init_pid_ns)) { - panic("Attempted to kill init! exitcode=0x%08x\n", - father->signal->group_exit_code ?: - father->exit_code); - } - - zap_pid_ns_processes(pid_ns); - write_lock_irq(&tasklist_lock); - } else if (father->signal->has_child_subreaper) { - struct task_struct *reaper; + if (father->signal->has_child_subreaper) { + unsigned int ns_level = task_pid(father)->level; /* - * Find the first ancestor marked as child_subreaper. - * Note that the code below checks same_thread_group(reaper, - * pid_ns->child_reaper). This is what we need to DTRT in a - * PID namespace. However we still need the check above, see - * http://marc.info/?l=linux-kernel&m=131385460420380 + * Find the first ->is_child_subreaper ancestor in our pid_ns. + * We can't check reaper != child_reaper to ensure we do not + * cross the namespaces, the exiting parent could be injected + * by setns() + fork(). + * We check pid->level, this is slightly more efficient than + * task_active_pid_ns(reaper) != task_active_pid_ns(father). */ for (reaper = father->real_parent; - reaper != &init_task; + task_pid(reaper)->level == ns_level; reaper = reaper->real_parent) { - if (same_thread_group(reaper, pid_ns->child_reaper)) + if (reaper == &init_task) break; if (!reaper->signal->is_child_subreaper) continue; - thread = reaper; - do { - if (!(thread->flags & PF_EXITING)) - return reaper; - } while_each_thread(reaper, thread); + thread = find_alive_thread(reaper); + if (thread) + return thread; } } - return pid_ns->child_reaper; + return child_reaper; } /* @@ -558,18 +673,10 @@ static struct task_struct *find_new_reaper(struct task_struct *father) static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { - list_move_tail(&p->sibling, &p->real_parent->children); - - if (p->exit_state == EXIT_DEAD) - return; - /* - * If this is a threaded reparent there is no need to - * notify anyone anything has happened. - */ - if (same_thread_group(p->real_parent, father)) + if (unlikely(p->exit_state == EXIT_DEAD)) return; - /* We don't want people slaying init. */ + /* We don't want people slaying init. */ p->exit_signal = SIGCHLD; /* If it has exited notify the new parent about this child's death. */ @@ -577,48 +684,49 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { if (do_notify_parent(p, p->exit_signal)) { p->exit_state = EXIT_DEAD; - list_move_tail(&p->sibling, dead); + list_add(&p->ptrace_entry, dead); } } kill_orphaned_pgrp(p, father); } -static void forget_original_parent(struct task_struct *father) +/* + * Make init inherit all the child processes + */ +static void forget_original_parent(struct task_struct *father, + struct list_head *dead) { - struct task_struct *p, *n, *reaper; - LIST_HEAD(dead_children); + struct task_struct *p, *t, *reaper; - write_lock_irq(&tasklist_lock); - /* - * Note that exit_ptrace() and find_new_reaper() might - * drop tasklist_lock and reacquire it. - */ - exit_ptrace(father); - reaper = find_new_reaper(father); + if (unlikely(!list_empty(&father->ptraced))) + exit_ptrace(father, dead); - list_for_each_entry_safe(p, n, &father->children, sibling) { - struct task_struct *t = p; - do { - t->real_parent = reaper; - if (t->parent == father) { - BUG_ON(t->ptrace); + /* Can drop and reacquire tasklist_lock */ + reaper = find_child_reaper(father, dead); + if (list_empty(&father->children)) + return; + + reaper = find_new_reaper(father, reaper); + list_for_each_entry(p, &father->children, sibling) { + for_each_thread(p, t) { + RCU_INIT_POINTER(t->real_parent, reaper); + BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father)); + if (likely(!t->ptrace)) t->parent = t->real_parent; - } if (t->pdeath_signal) group_send_sig_info(t->pdeath_signal, - SEND_SIG_NOINFO, t); - } while_each_thread(p, t); - reparent_leader(father, p, &dead_children); - } - write_unlock_irq(&tasklist_lock); - - BUG_ON(!list_empty(&father->children)); - - list_for_each_entry_safe(p, n, &dead_children, sibling) { - list_del_init(&p->sibling); - release_task(p); + SEND_SIG_NOINFO, t, + PIDTYPE_TGID); + } + /* + * If this is a threaded reparent there is no need to + * notify anyone anything has happened. + */ + if (!same_thread_group(reaper, father)) + reparent_leader(father, p, dead); } + list_splice_tail_init(&father->children, &reaper->children); } /* @@ -628,21 +736,17 @@ static void forget_original_parent(struct task_struct *father) static void exit_notify(struct task_struct *tsk, int group_dead) { bool autoreap; - - /* - * This does two things: - * - * A. Make init inherit all the child processes - * B. Check to see if any process groups have become orphaned - * as a result of our exiting, and if they have any stopped - * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) - */ - forget_original_parent(tsk); + struct task_struct *p, *n; + LIST_HEAD(dead); write_lock_irq(&tasklist_lock); + forget_original_parent(tsk, &dead); + if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); + tsk->exit_state = EXIT_ZOMBIE; + if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && thread_group_empty(tsk) && @@ -654,21 +758,88 @@ static void exit_notify(struct task_struct *tsk, int group_dead) do_notify_parent(tsk, tsk->exit_signal); } else { autoreap = true; + /* untraced sub-thread */ + do_notify_pidfd(tsk); } - tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; + if (autoreap) { + tsk->exit_state = EXIT_DEAD; + list_add(&tsk->ptrace_entry, &dead); + } /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) - wake_up_process(tsk->signal->group_exit_task); + wake_up_process(tsk->signal->group_exec_task); write_unlock_irq(&tasklist_lock); - /* If the process is dead, release it - nobody will wait for it */ - if (autoreap) - release_task(tsk); + list_for_each_entry_safe(p, n, &dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); + } } #ifdef CONFIG_DEBUG_STACK_USAGE +#ifdef CONFIG_STACK_GROWSUP +unsigned long stack_not_used(struct task_struct *p) +{ + unsigned long *n = end_of_stack(p); + + do { /* Skip over canary */ + n--; + } while (!*n); + + return (unsigned long)end_of_stack(p) - (unsigned long)n; +} +#else /* !CONFIG_STACK_GROWSUP */ +unsigned long stack_not_used(struct task_struct *p) +{ + unsigned long *n = end_of_stack(p); + + do { /* Skip over canary */ + n++; + } while (!*n); + + return (unsigned long)n - (unsigned long)end_of_stack(p); +} +#endif /* CONFIG_STACK_GROWSUP */ + +/* Count the maximum pages reached in kernel stacks */ +static inline void kstack_histogram(unsigned long used_stack) +{ +#ifdef CONFIG_VM_EVENT_COUNTERS + if (used_stack <= 1024) + count_vm_event(KSTACK_1K); +#if THREAD_SIZE > 1024 + else if (used_stack <= 2048) + count_vm_event(KSTACK_2K); +#endif +#if THREAD_SIZE > 2048 + else if (used_stack <= 4096) + count_vm_event(KSTACK_4K); +#endif +#if THREAD_SIZE > 4096 + else if (used_stack <= 8192) + count_vm_event(KSTACK_8K); +#endif +#if THREAD_SIZE > 8192 + else if (used_stack <= 16384) + count_vm_event(KSTACK_16K); +#endif +#if THREAD_SIZE > 16384 + else if (used_stack <= 32768) + count_vm_event(KSTACK_32K); +#endif +#if THREAD_SIZE > 32768 + else if (used_stack <= 65536) + count_vm_event(KSTACK_64K); +#endif +#if THREAD_SIZE > 65536 + else + count_vm_event(KSTACK_REST); +#endif +#endif /* CONFIG_VM_EVENT_COUNTERS */ +} + static void check_stack_usage(void) { static DEFINE_SPINLOCK(low_water_lock); @@ -676,92 +847,88 @@ static void check_stack_usage(void) unsigned long free; free = stack_not_used(current); + kstack_histogram(THREAD_SIZE - free); if (free >= lowest_to_date) return; spin_lock(&low_water_lock); if (free < lowest_to_date) { - printk(KERN_WARNING "%s (%d) used greatest stack depth: " - "%lu bytes left\n", - current->comm, task_pid_nr(current), free); + pr_info("%s (%d) used greatest stack depth: %lu bytes left\n", + current->comm, task_pid_nr(current), free); lowest_to_date = free; } spin_unlock(&low_water_lock); } -#else +#else /* !CONFIG_DEBUG_STACK_USAGE */ static inline void check_stack_usage(void) {} -#endif +#endif /* CONFIG_DEBUG_STACK_USAGE */ -void do_exit(long code) +static void synchronize_group_exit(struct task_struct *tsk, long code) { - struct task_struct *tsk = current; - int group_dead; - - profile_task_exit(tsk); - - WARN_ON(blk_needs_flush_plug(tsk)); - - if (unlikely(in_interrupt())) - panic("Aiee, killing interrupt handler!"); - if (unlikely(!tsk->pid)) - panic("Attempted to kill the idle task!"); + struct sighand_struct *sighand = tsk->sighand; + struct signal_struct *signal = tsk->signal; + struct core_state *core_state; + spin_lock_irq(&sighand->siglock); + signal->quick_threads--; + if ((signal->quick_threads == 0) && + !(signal->flags & SIGNAL_GROUP_EXIT)) { + signal->flags = SIGNAL_GROUP_EXIT; + signal->group_exit_code = code; + signal->group_stop_count = 0; + } /* - * If do_exit is called because this processes oopsed, it's possible - * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before - * continuing. Amongst other possible reasons, this is to prevent - * mm_release()->clear_child_tid() from writing to a user-controlled - * kernel address. + * Serialize with any possible pending coredump. + * We must hold siglock around checking core_state + * and setting PF_POSTCOREDUMP. The core-inducing thread + * will increment ->nr_threads for each thread in the + * group without PF_POSTCOREDUMP set. */ - set_fs(USER_DS); + tsk->flags |= PF_POSTCOREDUMP; + core_state = signal->core_state; + spin_unlock_irq(&sighand->siglock); - ptrace_event(PTRACE_EVENT_EXIT, code); + if (unlikely(core_state)) + coredump_task_exit(tsk, core_state); +} - validate_creds_for_do_exit(tsk); +void __noreturn do_exit(long code) +{ + struct task_struct *tsk = current; + int group_dead; - /* - * We're taking recursive faults here in do_exit. Safest is to just - * leave this task alone and wait for reboot. - */ - if (unlikely(tsk->flags & PF_EXITING)) { - printk(KERN_ALERT - "Fixing recursive fault but reboot is needed!\n"); - /* - * We can do this unlocked here. The futex code uses - * this flag just to verify whether the pi state - * cleanup has been done or not. In the worst case it - * loops once more. We pretend that the cleanup was - * done as there is no way to return. Either the - * OWNER_DIED bit is set by now or we push the blocked - * task into the wait for ever nirwana as well. - */ - tsk->flags |= PF_EXITPIDONE; - set_current_state(TASK_UNINTERRUPTIBLE); - schedule(); - } + WARN_ON(irqs_disabled()); + WARN_ON(tsk->plug); + + kcov_task_exit(tsk); + kmsan_task_exit(tsk); + synchronize_group_exit(tsk, code); + ptrace_event(PTRACE_EVENT_EXIT, code); + user_events_exit(tsk); + + io_uring_files_cancel(); + sched_mm_cid_exit(tsk); exit_signals(tsk); /* sets PF_EXITING */ - /* - * tsk->flags are checked in the futex code to protect against - * an exiting task cleaning up the robust pi futexes. - */ - smp_mb(); - raw_spin_unlock_wait(&tsk->pi_lock); - if (unlikely(in_atomic())) - printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", - current->comm, task_pid_nr(current), - preempt_count()); + seccomp_filter_release(tsk); acct_update_integrals(tsk); - /* sync mm's RSS info before statistics gathering */ - if (tsk->mm) - sync_mm_rss(tsk->mm); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { + /* + * If the last thread of global init has exited, panic + * immediately to get a useable coredump. + */ + if (unlikely(is_global_init(tsk))) + panic("Attempted to kill init! exitcode=0x%08x\n", + tsk->signal->group_exit_code ?: (int)code); + +#ifdef CONFIG_POSIX_TIMERS hrtimer_cancel(&tsk->signal->real_timer); - exit_itimers(tsk->signal); + exit_itimers(tsk); +#endif if (tsk->mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); } @@ -772,51 +939,50 @@ void do_exit(long code) tsk->exit_code = code; taskstats_exit(tsk, group_dead); + trace_sched_process_exit(tsk, group_dead); + + /* + * Since sampling can touch ->mm, make sure to stop everything before we + * tear it down. + * + * Also flushes inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_event_exit_task(tsk); + /* + * PF_EXITING (above) ensures unwind_deferred_request() will no + * longer add new unwinds. While exit_mm() (below) will destroy the + * abaility to do unwinds. So flush any pending unwinds here. + */ + unwind_deferred_task_exit(tsk); - exit_mm(tsk); + exit_mm(); if (group_dead) acct_process(); - trace_sched_process_exit(tsk); exit_sem(tsk); exit_shm(tsk); exit_files(tsk); exit_fs(tsk); - exit_task_namespaces(tsk); - exit_task_work(tsk); - check_stack_usage(); - exit_thread(); - - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - * - * because of cgroup mode, must be called before cgroup_exit() - */ - perf_event_exit_task(tsk); - - cgroup_exit(tsk, 1); - if (group_dead) disassociate_ctty(1); + exit_nsproxy_namespaces(tsk); + exit_task_work(tsk); + exit_thread(tsk); - module_put(task_thread_info(tsk)->exec_domain->module); - - proc_exit_connector(tsk); + sched_autogroup_exit_task(tsk); + cgroup_task_exit(tsk); /* * FIXME: do that only when needed, using sched_exit tracepoint */ flush_ptrace_hw_breakpoint(tsk); + exit_tasks_rcu_start(); exit_notify(tsk, group_dead); -#ifdef CONFIG_NUMA - task_lock(tsk); - mpol_put(tsk->mempolicy); - tsk->mempolicy = NULL; - task_unlock(tsk); -#endif + proc_exit_connector(tsk); + mpol_put_task_policy(tsk); #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); @@ -825,12 +991,6 @@ void do_exit(long code) * Make sure we are holding no locks: */ debug_check_no_locks_held(); - /* - * We can do this unlocked here. The futex code uses this flag - * just to verify whether the pi state cleanup has been done - * or not. In the worst case it loops once more. - */ - tsk->flags |= PF_EXITPIDONE; if (tsk->io_context) exit_io_context(tsk); @@ -841,49 +1001,78 @@ void do_exit(long code) if (tsk->task_frag.page) put_page(tsk->task_frag.page); - validate_creds_for_do_exit(tsk); + exit_task_stack_account(tsk); + check_stack_usage(); preempt_disable(); if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); + exit_tasks_rcu_finish(); + lockdep_free_task(tsk); + do_task_dead(); +} + +void __noreturn make_task_dead(int signr) +{ /* - * The setting of TASK_RUNNING by try_to_wake_up() may be delayed - * when the following two conditions become true. - * - There is race condition of mmap_sem (It is acquired by - * exit_mm()), and - * - SMI occurs before setting TASK_RUNINNG. - * (or hypervisor of virtual machine switches to other guest) - * As a result, we may become TASK_RUNNING after becoming TASK_DEAD + * Take the task off the cpu after something catastrophic has + * happened. * - * To avoid it, we have to wait for releasing tsk->pi_lock which - * is held by try_to_wake_up() + * We can get here from a kernel oops, sometimes with preemption off. + * Start by checking for critical errors. + * Then fix up important state like USER_DS and preemption. + * Then do everything else. */ - smp_mb(); - raw_spin_unlock_wait(&tsk->pi_lock); + struct task_struct *tsk = current; + unsigned int limit; - /* causes final put_task_struct in finish_task_switch(). */ - tsk->state = TASK_DEAD; - tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ - schedule(); - BUG(); - /* Avoid "noreturn function does return". */ - for (;;) - cpu_relax(); /* For when BUG is null */ -} + if (unlikely(in_interrupt())) + panic("Aiee, killing interrupt handler!"); + if (unlikely(!tsk->pid)) + panic("Attempted to kill the idle task!"); -EXPORT_SYMBOL_GPL(do_exit); + if (unlikely(irqs_disabled())) { + pr_info("note: %s[%d] exited with irqs disabled\n", + current->comm, task_pid_nr(current)); + local_irq_enable(); + } + if (unlikely(in_atomic())) { + pr_info("note: %s[%d] exited with preempt_count %d\n", + current->comm, task_pid_nr(current), + preempt_count()); + preempt_count_set(PREEMPT_ENABLED); + } -void complete_and_exit(struct completion *comp, long code) -{ - if (comp) - complete(comp); + /* + * Every time the system oopses, if the oops happens while a reference + * to an object was held, the reference leaks. + * If the oops doesn't also leak memory, repeated oopsing can cause + * reference counters to wrap around (if they're not using refcount_t). + * This means that repeated oopsing can make unexploitable-looking bugs + * exploitable through repeated oopsing. + * To make sure this can't happen, place an upper bound on how often the + * kernel may oops without panic(). + */ + limit = READ_ONCE(oops_limit); + if (atomic_inc_return(&oops_count) >= limit && limit) + panic("Oopsed too often (kernel.oops_limit is %d)", limit); - do_exit(code); -} + /* + * We're taking recursive faults here in make_task_dead. Safest is to just + * leave this task alone and wait for reboot. + */ + if (unlikely(tsk->flags & PF_EXITING)) { + pr_alert("Fixing recursive fault but reboot is needed!\n"); + futex_exit_recursive(tsk); + tsk->exit_state = EXIT_DEAD; + refcount_inc(&tsk->rcu_users); + do_task_dead(); + } -EXPORT_SYMBOL(complete_and_exit); + do_exit(signr); +} SYSCALL_DEFINE1(exit, int, error_code) { @@ -894,21 +1083,24 @@ SYSCALL_DEFINE1(exit, int, error_code) * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). */ -void +void __noreturn do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; - BUG_ON(exit_code & 0x80); /* core dumps don't get here */ - - if (signal_group_exit(sig)) + if (sig->flags & SIGNAL_GROUP_EXIT) exit_code = sig->group_exit_code; - else if (!thread_group_empty(current)) { + else if (sig->group_exec_task) + exit_code = 0; + else { struct sighand_struct *const sighand = current->sighand; + spin_lock_irq(&sighand->siglock); - if (signal_group_exit(sig)) + if (sig->flags & SIGNAL_GROUP_EXIT) /* Another thread got here before we took the lock. */ exit_code = sig->group_exit_code; + else if (sig->group_exec_task) + exit_code = 0; else { sig->group_exit_code = exit_code; sig->flags = SIGNAL_GROUP_EXIT; @@ -933,75 +1125,37 @@ SYSCALL_DEFINE1(exit_group, int, error_code) return 0; } -struct wait_opts { - enum pid_type wo_type; - int wo_flags; - struct pid *wo_pid; - - struct siginfo __user *wo_info; - int __user *wo_stat; - struct rusage __user *wo_rusage; - - wait_queue_t child_wait; - int notask_error; -}; - -static inline -struct pid *task_pid_type(struct task_struct *task, enum pid_type type) -{ - if (type != PIDTYPE_PID) - task = task->group_leader; - return task->pids[type].pid; -} - static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { return wo->wo_type == PIDTYPE_MAX || task_pid_type(p, wo->wo_type) == wo->wo_pid; } -static int eligible_child(struct wait_opts *wo, struct task_struct *p) +static int +eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) { if (!eligible_pid(wo, p)) return 0; - /* Wait for all children (clone and not) if __WALL is set; - * otherwise, wait for clone children *only* if __WCLONE is - * set; otherwise, wait for non-clone children *only*. (Note: - * A "clone" child here is one that reports to its parent - * using a signal other than SIGCHLD.) */ - if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) - && !(wo->wo_flags & __WALL)) - return 0; - return 1; -} + /* + * Wait for all children (clone and not) if __WALL is set or + * if it is traced by us. + */ + if (ptrace || (wo->wo_flags & __WALL)) + return 1; -static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, - pid_t pid, uid_t uid, int why, int status) -{ - struct siginfo __user *infop; - int retval = wo->wo_rusage - ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; + /* + * Otherwise, wait for clone children *only* if __WCLONE is set; + * otherwise, wait for non-clone children *only*. + * + * Note: a "clone" child here is one that reports to its parent + * using a signal other than SIGCHLD, or a non-leader thread which + * we can only see if it is traced by us. + */ + if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) + return 0; - put_task_struct(p); - infop = wo->wo_info; - if (infop) { - if (!retval) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval) - retval = put_user(0, &infop->si_errno); - if (!retval) - retval = put_user((short)why, &infop->si_code); - if (!retval) - retval = put_user(pid, &infop->si_pid); - if (!retval) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = put_user(status, &infop->si_status); - } - if (!retval) - retval = pid; - return retval; + return 1; } /* @@ -1012,51 +1166,46 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, */ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) { - unsigned long state; - int retval, status, traced; + int state, status; pid_t pid = task_pid_vnr(p); uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); - struct siginfo __user *infop; + struct waitid_info *infop; if (!likely(wo->wo_flags & WEXITED)) return 0; if (unlikely(wo->wo_flags & WNOWAIT)) { - int exit_code = p->exit_code; - int why; - + status = (p->signal->flags & SIGNAL_GROUP_EXIT) + ? p->signal->group_exit_code : p->exit_code; get_task_struct(p); read_unlock(&tasklist_lock); - if ((exit_code & 0x7f) == 0) { - why = CLD_EXITED; - status = exit_code >> 8; - } else { - why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; - status = exit_code & 0x7f; - } - return wait_noreap_copyout(wo, p, pid, uid, why, status); + sched_annotate_sleep(); + if (wo->wo_rusage) + getrusage(p, RUSAGE_BOTH, wo->wo_rusage); + put_task_struct(p); + goto out_info; } - /* - * Try to move the task's state to DEAD - * only one thread is allowed to do this: + * Move the task's state to DEAD/TRACE, only one thread can do this. */ - state = xchg(&p->exit_state, EXIT_DEAD); - if (state != EXIT_ZOMBIE) { - BUG_ON(state != EXIT_DEAD); + state = (ptrace_reparented(p) && thread_group_leader(p)) ? + EXIT_TRACE : EXIT_DEAD; + if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; - } + /* + * We own this thread, nobody else can reap it. + */ + read_unlock(&tasklist_lock); + sched_annotate_sleep(); - traced = ptrace_reparented(p); /* - * It can be ptraced but not reparented, check - * thread_group_leader() to filter out sub-threads. + * Check thread_group_leader() to exclude the traced sub-threads. */ - if (likely(!traced) && thread_group_leader(p)) { - struct signal_struct *psig; - struct signal_struct *sig; + if (state == EXIT_DEAD && thread_group_leader(p)) { + struct signal_struct *sig = p->signal; + struct signal_struct *psig = current->signal; unsigned long maxrss; - cputime_t tgutime, tgstime; + u64 tgutime, tgstime; /* * The resource counters for the group leader are in its @@ -1066,21 +1215,18 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * accumulate in the parent's signal_struct c* fields. * * We don't bother to take a lock here to protect these - * p->signal fields, because they are only touched by - * __exit_signal, which runs with tasklist_lock - * write-locked anyway, and so is excluded here. We do - * need to protect the access to parent->signal fields, - * as other threads in the parent group can be right - * here reaping other children at the same time. + * p->signal fields because the whole thread group is dead + * and nobody can change them. + * + * psig->stats_lock also protects us from our sub-threads + * which can reap other children at the same time. * - * We use thread_group_cputime_adjusted() to get times for the thread - * group, which consolidates times for all threads in the - * group including the group leader. + * We use thread_group_cputime_adjusted() to get times for + * the thread group, which consolidates times for all threads + * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); - spin_lock_irq(&p->real_parent->sighand->siglock); - psig = p->real_parent->signal; - sig = p->signal; + write_seqlock_irq(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; @@ -1103,74 +1249,51 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); - spin_unlock_irq(&p->real_parent->sighand->siglock); + write_sequnlock_irq(&psig->stats_lock); } - /* - * Now we are sure this task is interesting, and no other - * thread can reap it because we set its state to EXIT_DEAD. - */ - read_unlock(&tasklist_lock); - - retval = wo->wo_rusage - ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; + if (wo->wo_rusage) + getrusage(p, RUSAGE_BOTH, wo->wo_rusage); status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; - if (!retval && wo->wo_stat) - retval = put_user(status, wo->wo_stat); + wo->wo_stat = status; - infop = wo->wo_info; - if (!retval && infop) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval && infop) - retval = put_user(0, &infop->si_errno); - if (!retval && infop) { - int why; - - if ((status & 0x7f) == 0) { - why = CLD_EXITED; - status >>= 8; - } else { - why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; - status &= 0x7f; - } - retval = put_user((short)why, &infop->si_code); - if (!retval) - retval = put_user(status, &infop->si_status); - } - if (!retval && infop) - retval = put_user(pid, &infop->si_pid); - if (!retval && infop) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = pid; - - if (traced) { + if (state == EXIT_TRACE) { write_lock_irq(&tasklist_lock); /* We dropped tasklist, ptracer could die and untrace */ ptrace_unlink(p); - /* - * If this is not a sub-thread, notify the parent. - * If parent wants a zombie, don't release it now. - */ - if (thread_group_leader(p) && - !do_notify_parent(p, p->exit_signal)) { - p->exit_state = EXIT_ZOMBIE; - p = NULL; - } + + /* If parent wants a zombie, don't release it now */ + state = EXIT_ZOMBIE; + if (do_notify_parent(p, p->exit_signal)) + state = EXIT_DEAD; + p->exit_state = state; write_unlock_irq(&tasklist_lock); } - if (p != NULL) + if (state == EXIT_DEAD) release_task(p); - return retval; +out_info: + infop = wo->wo_info; + if (infop) { + if ((status & 0x7f) == 0) { + infop->cause = CLD_EXITED; + infop->status = status >> 8; + } else { + infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; + infop->status = status & 0x7f; + } + infop->pid = pid; + infop->uid = uid; + } + + return pid; } static int *task_stopped_code(struct task_struct *p, bool ptrace) { if (ptrace) { - if (task_is_stopped_or_traced(p) && - !(p->jobctl & JOBCTL_LISTENING)) + if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING)) return &p->exit_code; } else { if (p->signal->flags & SIGNAL_STOP_STOPPED) @@ -1200,8 +1323,8 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace) static int wait_task_stopped(struct wait_opts *wo, int ptrace, struct task_struct *p) { - struct siginfo __user *infop; - int retval, exit_code, *p_code, why; + struct waitid_info *infop; + int exit_code, *p_code, why; uid_t uid = 0; /* unneeded, required by compiler */ pid_t pid; @@ -1245,34 +1368,22 @@ unlock_sig: pid = task_pid_vnr(p); why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); + sched_annotate_sleep(); + if (wo->wo_rusage) + getrusage(p, RUSAGE_BOTH, wo->wo_rusage); + put_task_struct(p); - if (unlikely(wo->wo_flags & WNOWAIT)) - return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); - - retval = wo->wo_rusage - ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; - if (!retval && wo->wo_stat) - retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat); + if (likely(!(wo->wo_flags & WNOWAIT))) + wo->wo_stat = (exit_code << 8) | 0x7f; infop = wo->wo_info; - if (!retval && infop) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval && infop) - retval = put_user(0, &infop->si_errno); - if (!retval && infop) - retval = put_user((short)why, &infop->si_code); - if (!retval && infop) - retval = put_user(exit_code, &infop->si_status); - if (!retval && infop) - retval = put_user(pid, &infop->si_pid); - if (!retval && infop) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = pid; - put_task_struct(p); - - BUG_ON(!retval); - return retval; + if (infop) { + infop->cause = why; + infop->status = exit_code; + infop->pid = pid; + infop->uid = uid; + } + return pid; } /* @@ -1283,7 +1394,7 @@ unlock_sig: */ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) { - int retval; + struct waitid_info *infop; pid_t pid; uid_t uid; @@ -1307,22 +1418,21 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) pid = task_pid_vnr(p); get_task_struct(p); read_unlock(&tasklist_lock); + sched_annotate_sleep(); + if (wo->wo_rusage) + getrusage(p, RUSAGE_BOTH, wo->wo_rusage); + put_task_struct(p); - if (!wo->wo_info) { - retval = wo->wo_rusage - ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; - put_task_struct(p); - if (!retval && wo->wo_stat) - retval = put_user(0xffff, wo->wo_stat); - if (!retval) - retval = pid; + infop = wo->wo_info; + if (!infop) { + wo->wo_stat = 0xffff; } else { - retval = wait_noreap_copyout(wo, p, pid, uid, - CLD_CONTINUED, SIGCONT); - BUG_ON(retval == 0); + infop->cause = CLD_CONTINUED; + infop->pid = pid; + infop->uid = uid; + infop->status = SIGCONT; } - - return retval; + return pid; } /* @@ -1332,56 +1442,64 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; * then ->notask_error is 0 if @p is an eligible child, - * or another error from security_task_wait(), or still -ECHILD. + * or still -ECHILD. */ static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) { - int ret = eligible_child(wo, p); + /* + * We can race with wait_task_zombie() from another thread. + * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition + * can't confuse the checks below. + */ + int exit_state = READ_ONCE(p->exit_state); + int ret; + + if (unlikely(exit_state == EXIT_DEAD)) + return 0; + + ret = eligible_child(wo, ptrace, p); if (!ret) return ret; - ret = security_task_wait(p); - if (unlikely(ret < 0)) { + if (unlikely(exit_state == EXIT_TRACE)) { /* - * If we have not yet seen any eligible child, - * then let this error code replace -ECHILD. - * A permission error will give the user a clue - * to look for security policy problems, rather - * than for mysterious wait bugs. + * ptrace == 0 means we are the natural parent. In this case + * we should clear notask_error, debugger will notify us. */ - if (wo->notask_error) - wo->notask_error = ret; + if (likely(!ptrace)) + wo->notask_error = 0; return 0; } - /* dead body doesn't have much to contribute */ - if (unlikely(p->exit_state == EXIT_DEAD)) { + if (likely(!ptrace) && unlikely(p->ptrace)) { /* - * But do not ignore this task until the tracer does - * wait_task_zombie()->do_notify_parent(). + * If it is traced by its real parent's group, just pretend + * the caller is ptrace_do_wait() and reap this child if it + * is zombie. + * + * This also hides group stop state from real parent; otherwise + * a single stop can be reported twice as group and ptrace stop. + * If a ptracer wants to distinguish these two events for its + * own children it should create a separate process which takes + * the role of real parent. */ - if (likely(!ptrace) && unlikely(ptrace_reparented(p))) - wo->notask_error = 0; - return 0; + if (!ptrace_reparented(p)) + ptrace = 1; } /* slay zombie? */ - if (p->exit_state == EXIT_ZOMBIE) { - /* - * A zombie ptracee is only visible to its ptracer. - * Notification and reaping will be cascaded to the real - * parent when the ptracer detaches. - */ - if (likely(!ptrace) && unlikely(p->ptrace)) { - /* it will become visible, clear notask_error */ - wo->notask_error = 0; - return 0; - } - + if (exit_state == EXIT_ZOMBIE) { /* we don't reap group leaders with subthreads */ - if (!delay_group_leader(p)) - return wait_task_zombie(wo, p); + if (!delay_group_leader(p)) { + /* + * A zombie ptracee is only visible to its ptracer. + * Notification and reaping will be cascaded to the + * real parent when the ptracer detaches. + */ + if (unlikely(ptrace) || likely(!p->ptrace)) + return wait_task_zombie(wo, p); + } /* * Allow access to stopped/continued state via zombie by @@ -1407,19 +1525,6 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, wo->notask_error = 0; } else { /* - * If @p is ptraced by a task in its real parent's group, - * hide group stop/continued state when looking at @p as - * the real parent; otherwise, a single stop can be - * reported twice as group and ptrace stops. - * - * If a ptracer wants to distinguish the two events for its - * own children, it should create a separate process which - * takes the role of real parent. - */ - if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p)) - return 0; - - /* * @p is alive and it's gonna stop, continue or exit, so * there always is something to wait for. */ @@ -1449,7 +1554,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; then * ->notask_error is 0 if there were any eligible children, - * or another error from security_task_wait(), or still -ECHILD. + * or still -ECHILD. */ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) { @@ -1457,6 +1562,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) list_for_each_entry(p, &tsk->children, sibling) { int ret = wait_consider_task(wo, 0, p); + if (ret) return ret; } @@ -1470,6 +1576,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { int ret = wait_consider_task(wo, 1, p); + if (ret) return ret; } @@ -1477,91 +1584,158 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) return 0; } -static int child_wait_callback(wait_queue_t *wait, unsigned mode, +bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p) +{ + if (!eligible_pid(wo, p)) + return false; + + if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent) + return false; + + return true; +} + +static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait); struct task_struct *p = key; - if (!eligible_pid(wo, p)) - return 0; - - if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent) - return 0; + if (pid_child_should_wake(wo, p)) + return default_wake_function(wait, mode, sync, key); - return default_wake_function(wait, mode, sync, key); + return 0; } void __wake_up_parent(struct task_struct *p, struct task_struct *parent) { __wake_up_sync_key(&parent->signal->wait_chldexit, - TASK_INTERRUPTIBLE, 1, p); + TASK_INTERRUPTIBLE, p); } -static long do_wait(struct wait_opts *wo) +static bool is_effectively_child(struct wait_opts *wo, bool ptrace, + struct task_struct *target) { - struct task_struct *tsk; + struct task_struct *parent = + !ptrace ? target->real_parent : target->parent; + + return current == parent || (!(wo->wo_flags & __WNOTHREAD) && + same_thread_group(current, parent)); +} + +/* + * Optimization for waiting on PIDTYPE_PID. No need to iterate through child + * and tracee lists to find the target task. + */ +static int do_wait_pid(struct wait_opts *wo) +{ + bool ptrace; + struct task_struct *target; int retval; - trace_sched_process_wait(wo->wo_pid); + ptrace = false; + target = pid_task(wo->wo_pid, PIDTYPE_TGID); + if (target && is_effectively_child(wo, ptrace, target)) { + retval = wait_consider_task(wo, ptrace, target); + if (retval) + return retval; + } + + ptrace = true; + target = pid_task(wo->wo_pid, PIDTYPE_PID); + if (target && target->ptrace && + is_effectively_child(wo, ptrace, target)) { + retval = wait_consider_task(wo, ptrace, target); + if (retval) + return retval; + } + + return 0; +} + +long __do_wait(struct wait_opts *wo) +{ + long retval; - init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); - wo->child_wait.private = current; - add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); -repeat: /* - * If there is nothing that can match our critiera just get out. + * If there is nothing that can match our criteria, just get out. * We will clear ->notask_error to zero if we see any child that * might later match our criteria, even if we are not able to reap * it yet. */ wo->notask_error = -ECHILD; if ((wo->wo_type < PIDTYPE_MAX) && - (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type]))) + (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type))) goto notask; - set_current_state(TASK_INTERRUPTIBLE); read_lock(&tasklist_lock); - tsk = current; - do { - retval = do_wait_thread(wo, tsk); - if (retval) - goto end; - retval = ptrace_do_wait(wo, tsk); + if (wo->wo_type == PIDTYPE_PID) { + retval = do_wait_pid(wo); if (retval) - goto end; + return retval; + } else { + struct task_struct *tsk = current; - if (wo->wo_flags & __WNOTHREAD) - break; - } while_each_thread(current, tsk); + do { + retval = do_wait_thread(wo, tsk); + if (retval) + return retval; + + retval = ptrace_do_wait(wo, tsk); + if (retval) + return retval; + + if (wo->wo_flags & __WNOTHREAD) + break; + } while_each_thread(current, tsk); + } read_unlock(&tasklist_lock); notask: retval = wo->notask_error; - if (!retval && !(wo->wo_flags & WNOHANG)) { - retval = -ERESTARTSYS; - if (!signal_pending(current)) { - schedule(); - goto repeat; - } - } -end: + if (!retval && !(wo->wo_flags & WNOHANG)) + return -ERESTARTSYS; + + return retval; +} + +static long do_wait(struct wait_opts *wo) +{ + int retval; + + trace_sched_process_wait(wo->wo_pid); + + init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); + wo->child_wait.private = current; + add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); + + do { + set_current_state(TASK_INTERRUPTIBLE); + retval = __do_wait(wo); + if (retval != -ERESTARTSYS) + break; + if (signal_pending(current)) + break; + schedule(); + } while (1); + __set_current_state(TASK_RUNNING); remove_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); return retval; } -SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, - infop, int, options, struct rusage __user *, ru) +int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid, + struct waitid_info *infop, int options, + struct rusage *ru) { - struct wait_opts wo; + unsigned int f_flags = 0; struct pid *pid = NULL; enum pid_type type; - long ret; - if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) + if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED| + __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) return -EINVAL; @@ -1574,55 +1748,97 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, type = PIDTYPE_PID; if (upid <= 0) return -EINVAL; + + pid = find_get_pid(upid); break; case P_PGID: type = PIDTYPE_PGID; - if (upid <= 0) + if (upid < 0) + return -EINVAL; + + if (upid) + pid = find_get_pid(upid); + else + pid = get_task_pid(current, PIDTYPE_PGID); + break; + case P_PIDFD: + type = PIDTYPE_PID; + if (upid < 0) return -EINVAL; + + pid = pidfd_get_pid(upid, &f_flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + break; default: return -EINVAL; } - if (type < PIDTYPE_MAX) - pid = find_get_pid(upid); + wo->wo_type = type; + wo->wo_pid = pid; + wo->wo_flags = options; + wo->wo_info = infop; + wo->wo_rusage = ru; + if (f_flags & O_NONBLOCK) + wo->wo_flags |= WNOHANG; - wo.wo_type = type; - wo.wo_pid = pid; - wo.wo_flags = options; - wo.wo_info = infop; - wo.wo_stat = NULL; - wo.wo_rusage = ru; - ret = do_wait(&wo); + return 0; +} - if (ret > 0) { - ret = 0; - } else if (infop) { - /* - * For a WNOHANG return, clear out all the fields - * we would set so the user can easily tell the - * difference. - */ - if (!ret) - ret = put_user(0, &infop->si_signo); - if (!ret) - ret = put_user(0, &infop->si_errno); - if (!ret) - ret = put_user(0, &infop->si_code); - if (!ret) - ret = put_user(0, &infop->si_pid); - if (!ret) - ret = put_user(0, &infop->si_uid); - if (!ret) - ret = put_user(0, &infop->si_status); - } +static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, + int options, struct rusage *ru) +{ + struct wait_opts wo; + long ret; - put_pid(pid); + ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru); + if (ret) + return ret; + + ret = do_wait(&wo); + if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG)) + ret = -EAGAIN; + + put_pid(wo.wo_pid); return ret; } -SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, - int, options, struct rusage __user *, ru) +SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, + infop, int, options, struct rusage __user *, ru) +{ + struct rusage r; + struct waitid_info info = {.status = 0}; + long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); + int signo = 0; + + if (err > 0) { + signo = SIGCHLD; + err = 0; + if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) + return -EFAULT; + } + if (!infop) + return err; + + if (!user_write_access_begin(infop, sizeof(*infop))) + return -EFAULT; + + unsafe_put_user(signo, &infop->si_signo, Efault); + unsafe_put_user(0, &infop->si_errno, Efault); + unsafe_put_user(info.cause, &infop->si_code, Efault); + unsafe_put_user(info.pid, &infop->si_pid, Efault); + unsafe_put_user(info.uid, &infop->si_uid, Efault); + unsafe_put_user(info.status, &infop->si_status, Efault); + user_write_access_end(); + return err; +Efault: + user_write_access_end(); + return -EFAULT; +} + +long kernel_wait4(pid_t upid, int __user *stat_addr, int options, + struct rusage *ru) { struct wait_opts wo; struct pid *pid = NULL; @@ -1633,6 +1849,10 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; + /* -INT_MIN is not defined */ + if (upid == INT_MIN) + return -ESRCH; + if (upid == -1) type = PIDTYPE_MAX; else if (upid < 0) { @@ -1650,14 +1870,45 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, wo.wo_pid = pid; wo.wo_flags = options | WEXITED; wo.wo_info = NULL; - wo.wo_stat = stat_addr; + wo.wo_stat = 0; wo.wo_rusage = ru; ret = do_wait(&wo); put_pid(pid); + if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr)) + ret = -EFAULT; return ret; } +int kernel_wait(pid_t pid, int *stat) +{ + struct wait_opts wo = { + .wo_type = PIDTYPE_PID, + .wo_pid = find_get_pid(pid), + .wo_flags = WEXITED, + }; + int ret; + + ret = do_wait(&wo); + if (ret > 0 && wo.wo_stat) + *stat = wo.wo_stat; + put_pid(wo.wo_pid); + return ret; +} + +SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, + int, options, struct rusage __user *, ru) +{ + struct rusage r; + long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL); + + if (err > 0) { + if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) + return -EFAULT; + } + return err; +} + #ifdef __ARCH_WANT_SYS_WAITPID /* @@ -1666,7 +1917,82 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, */ SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) { - return sys_wait4(pid, stat_addr, options, NULL); + return kernel_wait4(pid, stat_addr, options, NULL); +} + +#endif + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(wait4, + compat_pid_t, pid, + compat_uint_t __user *, stat_addr, + int, options, + struct compat_rusage __user *, ru) +{ + struct rusage r; + long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL); + if (err > 0) { + if (ru && put_compat_rusage(&r, ru)) + return -EFAULT; + } + return err; } +COMPAT_SYSCALL_DEFINE5(waitid, + int, which, compat_pid_t, pid, + struct compat_siginfo __user *, infop, int, options, + struct compat_rusage __user *, uru) +{ + struct rusage ru; + struct waitid_info info = {.status = 0}; + long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL); + int signo = 0; + if (err > 0) { + signo = SIGCHLD; + err = 0; + if (uru) { + /* kernel_waitid() overwrites everything in ru */ + if (COMPAT_USE_64BIT_TIME) + err = copy_to_user(uru, &ru, sizeof(ru)); + else + err = put_compat_rusage(&ru, uru); + if (err) + return -EFAULT; + } + } + + if (!infop) + return err; + + if (!user_write_access_begin(infop, sizeof(*infop))) + return -EFAULT; + + unsafe_put_user(signo, &infop->si_signo, Efault); + unsafe_put_user(0, &infop->si_errno, Efault); + unsafe_put_user(info.cause, &infop->si_code, Efault); + unsafe_put_user(info.pid, &infop->si_pid, Efault); + unsafe_put_user(info.uid, &infop->si_uid, Efault); + unsafe_put_user(info.status, &infop->si_status, Efault); + user_write_access_end(); + return err; +Efault: + user_write_access_end(); + return -EFAULT; +} #endif + +/* + * This needs to be __function_aligned as GCC implicitly makes any + * implementation of abort() cold and drops alignment specified by + * -falign-functions=N. + * + * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11 + */ +__weak __function_aligned void abort(void) +{ + BUG(); + + /* if that doesn't kill us, halt */ + panic("Oops failed to kill thread"); +} +EXPORT_SYMBOL(abort); |
