Diffstat (limited to 'kernel/exit.c')
 -rw-r--r--   kernel/exit.c   1005
 1 file changed, 622 insertions, 383 deletions
diff --git a/kernel/exit.c b/kernel/exit.c index 284f2fe9a293..8a87021211ae 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/exit.c * @@ -24,7 +25,6 @@ #include <linux/acct.h> #include <linux/tsacct_kern.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/freezer.h> #include <linux/binfmts.h> #include <linux/nsproxy.h> @@ -47,9 +47,9 @@ #include <linux/pipe_fs_i.h> #include <linux/audit.h> /* for audit_free() */ #include <linux/resource.h> -#include <linux/blkdev.h> #include <linux/task_io_accounting_ops.h> -#include <linux/tracehook.h> +#include <linux/blkdev.h> +#include <linux/task_work.h> #include <linux/fs_struct.h> #include <linux/init_task.h> #include <linux/perf_event.h> @@ -59,41 +59,109 @@ #include <linux/writeback.h> #include <linux/shm.h> #include <linux/kcov.h> +#include <linux/kmsan.h> #include <linux/random.h> #include <linux/rcuwait.h> #include <linux/compat.h> - +#include <linux/io_uring.h> +#include <linux/kprobes.h> +#include <linux/rethook.h> +#include <linux/sysfs.h> +#include <linux/user_events.h> +#include <linux/unwind_deferred.h> #include <linux/uaccess.h> +#include <linux/pidfs.h> + +#include <uapi/linux/wait.h> + #include <asm/unistd.h> -#include <asm/pgtable.h> #include <asm/mmu_context.h> -static void __unhash_process(struct task_struct *p, bool group_dead) +#include "exit.h" + +/* + * The default value should be high enough to not crash a system that randomly + * crashes its kernel from time to time, but low enough to at least not permit + * overflowing 32-bit refcounts or the ldsem writer count. + */ +static unsigned int oops_limit = 10000; + +#ifdef CONFIG_SYSCTL +static const struct ctl_table kern_exit_table[] = { + { + .procname = "oops_limit", + .data = &oops_limit, + .maxlen = sizeof(oops_limit), + .mode = 0644, + .proc_handler = proc_douintvec, + }, +}; + +static __init int kernel_exit_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_exit_table); + return 0; +} +late_initcall(kernel_exit_sysctls_init); +#endif + +static atomic_t oops_count = ATOMIC_INIT(0); + +#ifdef CONFIG_SYSFS +static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%d\n", atomic_read(&oops_count)); +} + +static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count); + +static __init int kernel_exit_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL); + return 0; +} +late_initcall(kernel_exit_sysfs_init); +#endif + +/* + * For things release_task() would like to do *after* tasklist_lock is released. + */ +struct release_task_post { + struct pid *pids[PIDTYPE_MAX]; +}; + +static void __unhash_process(struct release_task_post *post, struct task_struct *p, + bool group_dead) { + struct pid *pid = task_pid(p); + nr_threads--; - detach_pid(p, PIDTYPE_PID); + + detach_pid(post->pids, p, PIDTYPE_PID); + wake_up_all(&pid->wait_pidfd); + if (group_dead) { - detach_pid(p, PIDTYPE_TGID); - detach_pid(p, PIDTYPE_PGID); - detach_pid(p, PIDTYPE_SID); + detach_pid(post->pids, p, PIDTYPE_TGID); + detach_pid(post->pids, p, PIDTYPE_PGID); + detach_pid(post->pids, p, PIDTYPE_SID); list_del_rcu(&p->tasks); list_del_init(&p->sibling); __this_cpu_dec(process_counts); } - list_del_rcu(&p->thread_group); list_del_rcu(&p->thread_node); } /* * This function expects the tasklist_lock write-locked. 
*/ -static void __exit_signal(struct task_struct *tsk) +static void __exit_signal(struct release_task_post *post, struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; - struct tty_struct *uninitialized_var(tty); + struct tty_struct *tty; u64 utime, stime; sighand = rcu_dereference_check(tsk->sighand, @@ -102,17 +170,8 @@ static void __exit_signal(struct task_struct *tsk) #ifdef CONFIG_POSIX_TIMERS posix_cpu_timers_exit(tsk); - if (group_dead) { + if (group_dead) posix_cpu_timers_exit_group(tsk); - } else { - /* - * This can only happen if the caller is de_thread(). - * FIXME: this is the temporary hack, we should teach - * posix-cpu-timers to handle this case correctly. - */ - if (unlikely(has_group_leader_pid(tsk))) - posix_cpu_timers_exit_group(tsk); - } #endif if (group_dead) { @@ -124,15 +183,12 @@ static void __exit_signal(struct task_struct *tsk) * then notify it: */ if (sig->notify_count > 0 && !--sig->notify_count) - wake_up_process(sig->group_exit_task); + wake_up_process(sig->group_exec_task); if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); } - add_device_randomness((const void*) &tsk->se.sum_exec_runtime, - sizeof(unsigned long long)); - /* * Accumulate here the counters for all threads as they die. We could * skip the group leader because it is the last user of signal_struct, @@ -153,51 +209,60 @@ static void __exit_signal(struct task_struct *tsk) task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; - __unhash_process(tsk, group_dead); + __unhash_process(post, tsk, group_dead); write_sequnlock(&sig->stats_lock); - /* - * Do this under ->siglock, we can race with another thread - * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. - */ - flush_sigqueue(&tsk->pending); tsk->sighand = NULL; spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); - clear_tsk_thread_flag(tsk, TIF_SIGPENDING); - if (group_dead) { - flush_sigqueue(&sig->shared_pending); + if (group_dead) tty_kref_put(tty); - } } static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + kprobe_flush_task(tsk); + rethook_flush_task(tsk); perf_event_delayed_put(tsk); trace_sched_process_free(tsk); put_task_struct(tsk); } +void put_task_struct_rcu_user(struct task_struct *task) +{ + if (refcount_dec_and_test(&task->rcu_users)) + call_rcu(&task->rcu, delayed_put_task_struct); +} + +void __weak release_thread(struct task_struct *dead_task) +{ +} void release_task(struct task_struct *p) { + struct release_task_post post; struct task_struct *leader; + struct pid *thread_pid; int zap_leader; repeat: + memset(&post, 0, sizeof(post)); + /* don't need to get the RCU readlock here - the process is dead and - * can't be modifying its own credentials. But shut RCU-lockdep up */ - rcu_read_lock(); - atomic_dec(&__task_cred(p)->user->processes); - rcu_read_unlock(); + * can't be modifying its own credentials. */ + dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); - proc_flush_task(p); + pidfs_exit(p); + cgroup_task_release(p); + + /* Retrieve @thread_pid before __unhash_process() may set it to NULL. 
*/ + thread_pid = task_pid(p); write_lock_irq(&tasklist_lock); ptrace_release_task(p); - __exit_signal(p); + __exit_signal(&post, p); /* * If we are the last non-leader member of the thread @@ -208,6 +273,9 @@ repeat: leader = p->group_leader; if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { + /* for pidfs_exit() and do_notify_parent() */ + if (leader->signal->flags & SIGNAL_GROUP_EXIT) + leader->exit_code = leader->signal->group_exit_code; /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, @@ -219,79 +287,33 @@ repeat: } write_unlock_irq(&tasklist_lock); + /* @thread_pid can't go away until free_pids() below */ + proc_flush_pid(thread_pid); + exit_cred_namespaces(p); + add_device_randomness(&p->se.sum_exec_runtime, + sizeof(p->se.sum_exec_runtime)); + free_pids(post.pids); release_thread(p); - call_rcu(&p->rcu, delayed_put_task_struct); - - p = leader; - if (unlikely(zap_leader)) - goto repeat; -} - -/* - * Note that if this function returns a valid task_struct pointer (!NULL) - * task->usage must remain >0 for the duration of the RCU critical section. - */ -struct task_struct *task_rcu_dereference(struct task_struct **ptask) -{ - struct sighand_struct *sighand; - struct task_struct *task; - - /* - * We need to verify that release_task() was not called and thus - * delayed_put_task_struct() can't run and drop the last reference - * before rcu_read_unlock(). We check task->sighand != NULL, - * but we can read the already freed and reused memory. - */ -retry: - task = rcu_dereference(*ptask); - if (!task) - return NULL; - - probe_kernel_address(&task->sighand, sighand); - /* - * Pairs with atomic_dec_and_test() in put_task_struct(). If this task - * was already freed we can not miss the preceding update of this - * pointer. + * This task was already removed from the process/thread/pid lists + * and lock_task_sighand(p) can't succeed. Nobody else can touch + * ->pending or, if group dead, signal->shared_pending. We can call + * flush_sigqueue() lockless. */ - smp_rmb(); - if (unlikely(task != READ_ONCE(*ptask))) - goto retry; + flush_sigqueue(&p->pending); + if (thread_group_leader(p)) + flush_sigqueue(&p->signal->shared_pending); - /* - * We've re-checked that "task == *ptask", now we have two different - * cases: - * - * 1. This is actually the same task/task_struct. In this case - * sighand != NULL tells us it is still alive. - * - * 2. This is another task which got the same memory for task_struct. - * We can't know this of course, and we can not trust - * sighand != NULL. - * - * In this case we actually return a random value, but this is - * correct. - * - * If we return NULL - we can pretend that we actually noticed that - * *ptask was updated when the previous task has exited. Or pretend - * that probe_slab_address(&sighand) reads NULL. - * - * If we return the new task (because sighand is not NULL for any - * reason) - this is fine too. This (new) task can't go away before - * another gp pass. - * - * And note: We could even eliminate the false positive if re-read - * task->sighand once again to avoid the falsely NULL. But this case - * is very unlikely so we don't care. 
- */ - if (!sighand) - return NULL; + put_task_struct_rcu_user(p); - return task; + p = leader; + if (unlikely(zap_leader)) + goto repeat; } -void rcuwait_wake_up(struct rcuwait *w) +int rcuwait_wake_up(struct rcuwait *w) { + int ret = 0; struct task_struct *task; rcu_read_lock(); @@ -299,7 +321,7 @@ void rcuwait_wake_up(struct rcuwait *w) /* * Order condition vs @task, such that everything prior to the load * of @task is visible. This is the condition as to why the user called - * rcuwait_trywake() in the first place. Pairs with set_current_state() + * rcuwait_wake() in the first place. Pairs with set_current_state() * barrier (A) in rcuwait_wait_event(). * * WAIT WAKE @@ -307,17 +329,16 @@ void rcuwait_wake_up(struct rcuwait *w) * MB (A) MB (B) * [L] cond [L] tsk */ - smp_rmb(); /* (B) */ + smp_mb(); /* (B) */ - /* - * Avoid using task_rcu_dereference() magic as long as we are careful, - * see comment in rcuwait_wait_event() regarding ->exit_state. - */ task = rcu_dereference(w->task); if (task) - wake_up_process(task); + ret = wake_up_process(task); rcu_read_unlock(); + + return ret; } +EXPORT_SYMBOL_GPL(rcuwait_wake_up); /* * Determine if a process group is "orphaned", according to the POSIX @@ -400,15 +421,73 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) } } +static void coredump_task_exit(struct task_struct *tsk, + struct core_state *core_state) +{ + struct core_thread self; + + self.task = tsk; + if (self.task->flags & PF_SIGNALED) + self.next = xchg(&core_state->dumper.next, &self); + else + self.task = NULL; + /* + * Implies mb(), the result of xchg() must be visible + * to core_state->dumper. + */ + if (atomic_dec_and_test(&core_state->nr_threads)) + complete(&core_state->startup); + + for (;;) { + set_current_state(TASK_IDLE|TASK_FREEZABLE); + if (!self.task) /* see coredump_finish() */ + break; + schedule(); + } + __set_current_state(TASK_RUNNING); +} + #ifdef CONFIG_MEMCG +/* drops tasklist_lock if succeeds */ +static bool __try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm) +{ + bool ret = false; + + task_lock(tsk); + if (likely(tsk->mm == mm)) { + /* tsk can't pass exit_mm/exec_mmap and exit */ + read_unlock(&tasklist_lock); + WRITE_ONCE(mm->owner, tsk); + lru_gen_migrate_mm(mm); + ret = true; + } + task_unlock(tsk); + return ret; +} + +static bool try_to_set_owner(struct task_struct *g, struct mm_struct *mm) +{ + struct task_struct *t; + + for_each_thread(g, t) { + struct mm_struct *t_mm = READ_ONCE(t->mm); + if (t_mm == mm) { + if (__try_to_set_owner(t, mm)) + return true; + } else if (t_mm) + break; + } + + return false; +} + /* * A task is exiting. If it owned this mm, find a new owner for the mm. */ void mm_update_next_owner(struct mm_struct *mm) { - struct task_struct *c, *g, *p = current; + struct task_struct *g, *p = current; -retry: /* * If the exiting or execing task is not the owner, it's * someone else's problem. @@ -421,7 +500,7 @@ retry: * freed task structure. 
*/ if (atomic_read(&mm->mm_users) <= 1) { - mm->owner = NULL; + WRITE_ONCE(mm->owner, NULL); return; } @@ -429,31 +508,27 @@ retry: /* * Search in the children */ - list_for_each_entry(c, &p->children, sibling) { - if (c->mm == mm) - goto assign_new_owner; + list_for_each_entry(g, &p->children, sibling) { + if (try_to_set_owner(g, mm)) + goto ret; } - /* * Search in the siblings */ - list_for_each_entry(c, &p->real_parent->children, sibling) { - if (c->mm == mm) - goto assign_new_owner; + list_for_each_entry(g, &p->real_parent->children, sibling) { + if (try_to_set_owner(g, mm)) + goto ret; } - /* * Search through everything else, we should not get here often. */ for_each_process(g) { + if (atomic_read(&mm->mm_users) <= 1) + break; if (g->flags & PF_KTHREAD) continue; - for_each_thread(g, c) { - if (c->mm == mm) - goto assign_new_owner; - if (c->mm) - break; - } + if (try_to_set_owner(g, mm)) + goto ret; } read_unlock(&tasklist_lock); /* @@ -461,30 +536,10 @@ retry: * most likely racing with swapoff (try_to_unuse()) or /proc or * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ - mm->owner = NULL; + WRITE_ONCE(mm->owner, NULL); + ret: return; -assign_new_owner: - BUG_ON(c == p); - get_task_struct(c); - /* - * The task_lock protects c->mm from changing. - * We always want mm->owner->mm == mm - */ - task_lock(c); - /* - * Delay read_unlock() till we have the task_lock() - * to ensure that c does not slip away underneath us - */ - read_unlock(&tasklist_lock); - if (c->mm != mm) { - task_unlock(c); - put_task_struct(c); - goto retry; - } - mm->owner = c; - task_unlock(c); - put_task_struct(c); } #endif /* CONFIG_MEMCG */ @@ -495,52 +550,33 @@ assign_new_owner: static void exit_mm(void) { struct mm_struct *mm = current->mm; - struct core_state *core_state; - mm_release(current, mm); + exit_mm_release(current, mm); if (!mm) return; - sync_mm_rss(mm); - /* - * Serialize with any possible pending coredump. - * We must hold mmap_sem around checking core_state - * and clearing tsk->mm. The core-inducing thread - * will increment ->nr_threads for each thread in the - * group with ->mm != NULL. - */ - down_read(&mm->mmap_sem); - core_state = mm->core_state; - if (core_state) { - struct core_thread self; - - up_read(&mm->mmap_sem); - - self.task = current; - self.next = xchg(&core_state->dumper.next, &self); - /* - * Implies mb(), the result of xchg() must be visible - * to core_state->dumper. - */ - if (atomic_dec_and_test(&core_state->nr_threads)) - complete(&core_state->startup); - - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!self.task) /* see coredump_finish() */ - break; - freezable_schedule(); - } - __set_current_state(TASK_RUNNING); - down_read(&mm->mmap_sem); - } - mmgrab(mm); + mmap_read_lock(mm); + mmgrab_lazy_tlb(mm); BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); + /* + * When a thread stops operating on an address space, the loop + * in membarrier_private_expedited() may not observe that + * tsk->mm, and the loop in membarrier_global_expedited() may + * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED + * rq->membarrier_state, so those would not issue an IPI. + * Membarrier requires a memory barrier after accessing + * user-space memory, before clearing tsk->mm or the + * rq->membarrier_state. 
+ */ + smp_mb__after_spinlock(); + local_irq_disable(); current->mm = NULL; - up_read(&mm->mmap_sem); + membarrier_update_current_mm(NULL); enter_lazy_tlb(mm, current); + local_irq_enable(); task_unlock(current); + mmap_read_unlock(mm); mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) @@ -558,12 +594,14 @@ static struct task_struct *find_alive_thread(struct task_struct *p) return NULL; } -static struct task_struct *find_child_reaper(struct task_struct *father) +static struct task_struct *find_child_reaper(struct task_struct *father, + struct list_head *dead) __releases(&tasklist_lock) __acquires(&tasklist_lock) { struct pid_namespace *pid_ns = task_active_pid_ns(father); struct task_struct *reaper = pid_ns->child_reaper; + struct task_struct *p, *n; if (likely(reaper != father)) return reaper; @@ -575,10 +613,12 @@ static struct task_struct *find_child_reaper(struct task_struct *father) } write_unlock_irq(&tasklist_lock); - if (unlikely(pid_ns == &init_pid_ns)) { - panic("Attempted to kill init! exitcode=0x%08x\n", - father->signal->group_exit_code ?: father->exit_code); + + list_for_each_entry_safe(p, n, dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); } + zap_pid_ns_processes(pid_ns); write_lock_irq(&tasklist_lock); @@ -652,12 +692,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, } /* - * This does two things: - * - * A. Make init inherit all the child processes - * B. Check to see if any process groups have become orphaned - * as a result of our exiting, and if they have any stopped - * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) + * Make init inherit all the child processes */ static void forget_original_parent(struct task_struct *father, struct list_head *dead) @@ -668,15 +703,15 @@ static void forget_original_parent(struct task_struct *father, exit_ptrace(father, dead); /* Can drop and reacquire tasklist_lock */ - reaper = find_child_reaper(father); + reaper = find_child_reaper(father, dead); if (list_empty(&father->children)) return; reaper = find_new_reaper(father, reaper); list_for_each_entry(p, &father->children, sibling) { for_each_thread(p, t) { - t->real_parent = reaper; - BUG_ON((!t->ptrace) != (t->parent == father)); + RCU_INIT_POINTER(t->real_parent, reaper); + BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father)); if (likely(!t->ptrace)) t->parent = t->real_parent; if (t->pdeath_signal) @@ -710,6 +745,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead) if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); + tsk->exit_state = EXIT_ZOMBIE; + if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && thread_group_empty(tsk) && @@ -721,15 +758,18 @@ static void exit_notify(struct task_struct *tsk, int group_dead) do_notify_parent(tsk, tsk->exit_signal); } else { autoreap = true; + /* untraced sub-thread */ + do_notify_pidfd(tsk); } - tsk->exit_state = autoreap ? 
EXIT_DEAD : EXIT_ZOMBIE; - if (tsk->exit_state == EXIT_DEAD) + if (autoreap) { + tsk->exit_state = EXIT_DEAD; list_add(&tsk->ptrace_entry, &dead); + } /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) - wake_up_process(tsk->signal->group_exit_task); + wake_up_process(tsk->signal->group_exec_task); write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, &dead, ptrace_entry) { @@ -739,6 +779,67 @@ static void exit_notify(struct task_struct *tsk, int group_dead) } #ifdef CONFIG_DEBUG_STACK_USAGE +#ifdef CONFIG_STACK_GROWSUP +unsigned long stack_not_used(struct task_struct *p) +{ + unsigned long *n = end_of_stack(p); + + do { /* Skip over canary */ + n--; + } while (!*n); + + return (unsigned long)end_of_stack(p) - (unsigned long)n; +} +#else /* !CONFIG_STACK_GROWSUP */ +unsigned long stack_not_used(struct task_struct *p) +{ + unsigned long *n = end_of_stack(p); + + do { /* Skip over canary */ + n++; + } while (!*n); + + return (unsigned long)n - (unsigned long)end_of_stack(p); +} +#endif /* CONFIG_STACK_GROWSUP */ + +/* Count the maximum pages reached in kernel stacks */ +static inline void kstack_histogram(unsigned long used_stack) +{ +#ifdef CONFIG_VM_EVENT_COUNTERS + if (used_stack <= 1024) + count_vm_event(KSTACK_1K); +#if THREAD_SIZE > 1024 + else if (used_stack <= 2048) + count_vm_event(KSTACK_2K); +#endif +#if THREAD_SIZE > 2048 + else if (used_stack <= 4096) + count_vm_event(KSTACK_4K); +#endif +#if THREAD_SIZE > 4096 + else if (used_stack <= 8192) + count_vm_event(KSTACK_8K); +#endif +#if THREAD_SIZE > 8192 + else if (used_stack <= 16384) + count_vm_event(KSTACK_16K); +#endif +#if THREAD_SIZE > 16384 + else if (used_stack <= 32768) + count_vm_event(KSTACK_32K); +#endif +#if THREAD_SIZE > 32768 + else if (used_stack <= 65536) + count_vm_event(KSTACK_64K); +#endif +#if THREAD_SIZE > 65536 + else + count_vm_event(KSTACK_REST); +#endif +#endif /* CONFIG_VM_EVENT_COUNTERS */ +} + static void check_stack_usage(void) { static DEFINE_SPINLOCK(low_water_lock); @@ -746,6 +847,7 @@ static void check_stack_usage(void) unsigned long free; free = stack_not_used(current); + kstack_histogram(THREAD_SIZE - free); if (free >= lowest_to_date) return; @@ -758,87 +860,74 @@ static void check_stack_usage(void) } spin_unlock(&low_water_lock); } -#else +#else /* !CONFIG_DEBUG_STACK_USAGE */ static inline void check_stack_usage(void) {} -#endif +#endif /* CONFIG_DEBUG_STACK_USAGE */ + +static void synchronize_group_exit(struct task_struct *tsk, long code) +{ + struct sighand_struct *sighand = tsk->sighand; + struct signal_struct *signal = tsk->signal; + struct core_state *core_state; + + spin_lock_irq(&sighand->siglock); + signal->quick_threads--; + if ((signal->quick_threads == 0) && + !(signal->flags & SIGNAL_GROUP_EXIT)) { + signal->flags = SIGNAL_GROUP_EXIT; + signal->group_exit_code = code; + signal->group_stop_count = 0; + } + /* + * Serialize with any possible pending coredump. + * We must hold siglock around checking core_state + * and setting PF_POSTCOREDUMP. The core-inducing thread + * will increment ->nr_threads for each thread in the + * group without PF_POSTCOREDUMP set. 
+ */ + tsk->flags |= PF_POSTCOREDUMP; + core_state = signal->core_state; + spin_unlock_irq(&sighand->siglock); + + if (unlikely(core_state)) + coredump_task_exit(tsk, core_state); +} void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; - profile_task_exit(tsk); - kcov_task_exit(tsk); - - WARN_ON(blk_needs_flush_plug(tsk)); - - if (unlikely(in_interrupt())) - panic("Aiee, killing interrupt handler!"); - if (unlikely(!tsk->pid)) - panic("Attempted to kill the idle task!"); + WARN_ON(irqs_disabled()); + WARN_ON(tsk->plug); - /* - * If do_exit is called because this processes oopsed, it's possible - * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before - * continuing. Amongst other possible reasons, this is to prevent - * mm_release()->clear_child_tid() from writing to a user-controlled - * kernel address. - */ - set_fs(USER_DS); + kcov_task_exit(tsk); + kmsan_task_exit(tsk); + synchronize_group_exit(tsk, code); ptrace_event(PTRACE_EVENT_EXIT, code); + user_events_exit(tsk); - validate_creds_for_do_exit(tsk); - - /* - * We're taking recursive faults here in do_exit. Safest is to just - * leave this task alone and wait for reboot. - */ - if (unlikely(tsk->flags & PF_EXITING)) { - pr_alert("Fixing recursive fault but reboot is needed!\n"); - /* - * We can do this unlocked here. The futex code uses - * this flag just to verify whether the pi state - * cleanup has been done or not. In the worst case it - * loops once more. We pretend that the cleanup was - * done as there is no way to return. Either the - * OWNER_DIED bit is set by now or we push the blocked - * task into the wait for ever nirwana as well. - */ - tsk->flags |= PF_EXITPIDONE; - set_current_state(TASK_UNINTERRUPTIBLE); - schedule(); - } - + io_uring_files_cancel(); + sched_mm_cid_exit(tsk); exit_signals(tsk); /* sets PF_EXITING */ - /* - * Ensure that all new tsk->pi_lock acquisitions must observe - * PF_EXITING. Serializes against futex.c:attach_to_pi_owner(). - */ - smp_mb(); - /* - * Ensure that we must observe the pi_state in exit_mm() -> - * mm_release() -> exit_pi_state_list(). - */ - raw_spin_lock_irq(&tsk->pi_lock); - raw_spin_unlock_irq(&tsk->pi_lock); - if (unlikely(in_atomic())) { - pr_info("note: %s[%d] exited with preempt_count %d\n", - current->comm, task_pid_nr(current), - preempt_count()); - preempt_count_set(PREEMPT_ENABLED); - } + seccomp_filter_release(tsk); - /* sync mm's RSS info before statistics gathering */ - if (tsk->mm) - sync_mm_rss(tsk->mm); acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { + /* + * If the last thread of global init has exited, panic + * immediately to get a useable coredump. + */ + if (unlikely(is_global_init(tsk))) + panic("Attempted to kill init! exitcode=0x%08x\n", + tsk->signal->group_exit_code ?: (int)code); + #ifdef CONFIG_POSIX_TIMERS hrtimer_cancel(&tsk->signal->real_timer); - exit_itimers(tsk->signal); + exit_itimers(tsk); #endif if (tsk->mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); @@ -850,12 +939,27 @@ void __noreturn do_exit(long code) tsk->exit_code = code; taskstats_exit(tsk, group_dead); + trace_sched_process_exit(tsk, group_dead); + + /* + * Since sampling can touch ->mm, make sure to stop everything before we + * tear it down. + * + * Also flushes inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. 
+ */ + perf_event_exit_task(tsk); + /* + * PF_EXITING (above) ensures unwind_deferred_request() will no + * longer add new unwinds. While exit_mm() (below) will destroy the + * abaility to do unwinds. So flush any pending unwinds here. + */ + unwind_deferred_task_exit(tsk); exit_mm(); if (group_dead) acct_process(); - trace_sched_process_exit(tsk); exit_sem(tsk); exit_shm(tsk); @@ -863,21 +967,12 @@ void __noreturn do_exit(long code) exit_fs(tsk); if (group_dead) disassociate_ctty(1); - exit_task_namespaces(tsk); + exit_nsproxy_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); - exit_umh(tsk); - - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - * - * because of cgroup mode, must be called before cgroup_exit() - */ - perf_event_exit_task(tsk); sched_autogroup_exit_task(tsk); - cgroup_exit(tsk); + cgroup_task_exit(tsk); /* * FIXME: do that only when needed, using sched_exit tracepoint @@ -896,12 +991,6 @@ void __noreturn do_exit(long code) * Make sure we are holding no locks: */ debug_check_no_locks_held(); - /* - * We can do this unlocked here. The futex code uses this flag - * just to verify whether the pi state cleanup has been done - * or not. In the worst case it loops once more. - */ - tsk->flags |= PF_EXITPIDONE; if (tsk->io_context) exit_io_context(tsk); @@ -912,7 +1001,7 @@ void __noreturn do_exit(long code) if (tsk->task_frag.page) put_page(tsk->task_frag.page); - validate_creds_for_do_exit(tsk); + exit_task_stack_account(tsk); check_stack_usage(); preempt_disable(); @@ -924,16 +1013,66 @@ void __noreturn do_exit(long code) lockdep_free_task(tsk); do_task_dead(); } -EXPORT_SYMBOL_GPL(do_exit); -void complete_and_exit(struct completion *comp, long code) +void __noreturn make_task_dead(int signr) { - if (comp) - complete(comp); + /* + * Take the task off the cpu after something catastrophic has + * happened. + * + * We can get here from a kernel oops, sometimes with preemption off. + * Start by checking for critical errors. + * Then fix up important state like USER_DS and preemption. + * Then do everything else. + */ + struct task_struct *tsk = current; + unsigned int limit; + + if (unlikely(in_interrupt())) + panic("Aiee, killing interrupt handler!"); + if (unlikely(!tsk->pid)) + panic("Attempted to kill the idle task!"); + + if (unlikely(irqs_disabled())) { + pr_info("note: %s[%d] exited with irqs disabled\n", + current->comm, task_pid_nr(current)); + local_irq_enable(); + } + if (unlikely(in_atomic())) { + pr_info("note: %s[%d] exited with preempt_count %d\n", + current->comm, task_pid_nr(current), + preempt_count()); + preempt_count_set(PREEMPT_ENABLED); + } + + /* + * Every time the system oopses, if the oops happens while a reference + * to an object was held, the reference leaks. + * If the oops doesn't also leak memory, repeated oopsing can cause + * reference counters to wrap around (if they're not using refcount_t). + * This means that repeated oopsing can make unexploitable-looking bugs + * exploitable through repeated oopsing. + * To make sure this can't happen, place an upper bound on how often the + * kernel may oops without panic(). + */ + limit = READ_ONCE(oops_limit); + if (atomic_inc_return(&oops_count) >= limit && limit) + panic("Oopsed too often (kernel.oops_limit is %d)", limit); + + /* + * We're taking recursive faults here in make_task_dead. Safest is to just + * leave this task alone and wait for reboot. 
+ */ + if (unlikely(tsk->flags & PF_EXITING)) { + pr_alert("Fixing recursive fault but reboot is needed!\n"); + futex_exit_recursive(tsk); + tsk->exit_state = EXIT_DEAD; + refcount_inc(&tsk->rcu_users); + do_task_dead(); + } - do_exit(code); + do_exit(signr); } -EXPORT_SYMBOL(complete_and_exit); SYSCALL_DEFINE1(exit, int, error_code) { @@ -944,22 +1083,24 @@ SYSCALL_DEFINE1(exit, int, error_code) * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). */ -void +void __noreturn do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; - BUG_ON(exit_code & 0x80); /* core dumps don't get here */ - - if (signal_group_exit(sig)) + if (sig->flags & SIGNAL_GROUP_EXIT) exit_code = sig->group_exit_code; - else if (!thread_group_empty(current)) { + else if (sig->group_exec_task) + exit_code = 0; + else { struct sighand_struct *const sighand = current->sighand; spin_lock_irq(&sighand->siglock); - if (signal_group_exit(sig)) + if (sig->flags & SIGNAL_GROUP_EXIT) /* Another thread got here before we took the lock. */ exit_code = sig->group_exit_code; + else if (sig->group_exec_task) + exit_code = 0; else { sig->group_exit_code = exit_code; sig->flags = SIGNAL_GROUP_EXIT; @@ -984,26 +1125,6 @@ SYSCALL_DEFINE1(exit_group, int, error_code) return 0; } -struct waitid_info { - pid_t pid; - uid_t uid; - int status; - int cause; -}; - -struct wait_opts { - enum pid_type wo_type; - int wo_flags; - struct pid *wo_pid; - - struct waitid_info *wo_info; - int wo_stat; - struct rusage *wo_rusage; - - wait_queue_entry_t child_wait; - int notask_error; -}; - static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { return wo->wo_type == PIDTYPE_MAX || @@ -1054,7 +1175,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) return 0; if (unlikely(wo->wo_flags & WNOWAIT)) { - status = p->exit_code; + status = (p->signal->flags & SIGNAL_GROUP_EXIT) + ? p->signal->group_exit_code : p->exit_code; get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); @@ -1096,18 +1218,15 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * p->signal fields because the whole thread group is dead * and nobody can change them. * - * psig->stats_lock also protects us from our sub-theads - * which can reap other children at the same time. Until - * we change k_getrusage()-like users to rely on this lock - * we have to take ->siglock as well. + * psig->stats_lock also protects us from our sub-threads + * which can reap other children at the same time. * * We use thread_group_cputime_adjusted() to get times for * the thread group, which consolidates times for all threads * in the group including the group leader. 
*/ thread_group_cputime_adjusted(p, &tgutime, &tgstime); - spin_lock_irq(¤t->sighand->siglock); - write_seqlock(&psig->stats_lock); + write_seqlock_irq(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; @@ -1130,8 +1249,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); - write_sequnlock(&psig->stats_lock); - spin_unlock_irq(¤t->sighand->siglock); + write_sequnlock_irq(&psig->stats_lock); } if (wo->wo_rusage) @@ -1466,6 +1584,17 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) return 0; } +bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p) +{ + if (!eligible_pid(wo, p)) + return false; + + if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent) + return false; + + return true; +} + static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { @@ -1473,32 +1602,62 @@ static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, child_wait); struct task_struct *p = key; - if (!eligible_pid(wo, p)) - return 0; - - if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent) - return 0; + if (pid_child_should_wake(wo, p)) + return default_wake_function(wait, mode, sync, key); - return default_wake_function(wait, mode, sync, key); + return 0; } void __wake_up_parent(struct task_struct *p, struct task_struct *parent) { __wake_up_sync_key(&parent->signal->wait_chldexit, - TASK_INTERRUPTIBLE, 1, p); + TASK_INTERRUPTIBLE, p); } -static long do_wait(struct wait_opts *wo) +static bool is_effectively_child(struct wait_opts *wo, bool ptrace, + struct task_struct *target) +{ + struct task_struct *parent = + !ptrace ? target->real_parent : target->parent; + + return current == parent || (!(wo->wo_flags & __WNOTHREAD) && + same_thread_group(current, parent)); +} + +/* + * Optimization for waiting on PIDTYPE_PID. No need to iterate through child + * and tracee lists to find the target task. + */ +static int do_wait_pid(struct wait_opts *wo) { - struct task_struct *tsk; + bool ptrace; + struct task_struct *target; int retval; - trace_sched_process_wait(wo->wo_pid); + ptrace = false; + target = pid_task(wo->wo_pid, PIDTYPE_TGID); + if (target && is_effectively_child(wo, ptrace, target)) { + retval = wait_consider_task(wo, ptrace, target); + if (retval) + return retval; + } + + ptrace = true; + target = pid_task(wo->wo_pid, PIDTYPE_PID); + if (target && target->ptrace && + is_effectively_child(wo, ptrace, target)) { + retval = wait_consider_task(wo, ptrace, target); + if (retval) + return retval; + } + + return 0; +} + +long __do_wait(struct wait_opts *wo) +{ + long retval; - init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); - wo->child_wait.private = current; - add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); -repeat: /* * If there is nothing that can match our criteria, just get out. 
* We will clear ->notask_error to zero if we see any child that @@ -1507,48 +1666,73 @@ repeat: */ wo->notask_error = -ECHILD; if ((wo->wo_type < PIDTYPE_MAX) && - (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type]))) + (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type))) goto notask; - set_current_state(TASK_INTERRUPTIBLE); read_lock(&tasklist_lock); - tsk = current; - do { - retval = do_wait_thread(wo, tsk); - if (retval) - goto end; - retval = ptrace_do_wait(wo, tsk); + if (wo->wo_type == PIDTYPE_PID) { + retval = do_wait_pid(wo); if (retval) - goto end; + return retval; + } else { + struct task_struct *tsk = current; - if (wo->wo_flags & __WNOTHREAD) - break; - } while_each_thread(current, tsk); + do { + retval = do_wait_thread(wo, tsk); + if (retval) + return retval; + + retval = ptrace_do_wait(wo, tsk); + if (retval) + return retval; + + if (wo->wo_flags & __WNOTHREAD) + break; + } while_each_thread(current, tsk); + } read_unlock(&tasklist_lock); notask: retval = wo->notask_error; - if (!retval && !(wo->wo_flags & WNOHANG)) { - retval = -ERESTARTSYS; - if (!signal_pending(current)) { - schedule(); - goto repeat; - } - } -end: + if (!retval && !(wo->wo_flags & WNOHANG)) + return -ERESTARTSYS; + + return retval; +} + +static long do_wait(struct wait_opts *wo) +{ + int retval; + + trace_sched_process_wait(wo->wo_pid); + + init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); + wo->child_wait.private = current; + add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); + + do { + set_current_state(TASK_INTERRUPTIBLE); + retval = __do_wait(wo); + if (retval != -ERESTARTSYS) + break; + if (signal_pending(current)) + break; + schedule(); + } while (1); + __set_current_state(TASK_RUNNING); remove_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); return retval; } -static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, - int options, struct rusage *ru) +int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid, + struct waitid_info *infop, int options, + struct rusage *ru) { - struct wait_opts wo; + unsigned int f_flags = 0; struct pid *pid = NULL; enum pid_type type; - long ret; if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) @@ -1564,27 +1748,59 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, type = PIDTYPE_PID; if (upid <= 0) return -EINVAL; + + pid = find_get_pid(upid); break; case P_PGID: type = PIDTYPE_PGID; - if (upid <= 0) + if (upid < 0) return -EINVAL; + + if (upid) + pid = find_get_pid(upid); + else + pid = get_task_pid(current, PIDTYPE_PGID); + break; + case P_PIDFD: + type = PIDTYPE_PID; + if (upid < 0) + return -EINVAL; + + pid = pidfd_get_pid(upid, &f_flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + break; default: return -EINVAL; } - if (type < PIDTYPE_MAX) - pid = find_get_pid(upid); + wo->wo_type = type; + wo->wo_pid = pid; + wo->wo_flags = options; + wo->wo_info = infop; + wo->wo_rusage = ru; + if (f_flags & O_NONBLOCK) + wo->wo_flags |= WNOHANG; + + return 0; +} + +static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, + int options, struct rusage *ru) +{ + struct wait_opts wo; + long ret; + + ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru); + if (ret) + return ret; - wo.wo_type = type; - wo.wo_pid = pid; - wo.wo_flags = options; - wo.wo_info = infop; - wo.wo_rusage = ru; ret = do_wait(&wo); + if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG)) + ret = -EAGAIN; - put_pid(pid); + 
put_pid(wo.wo_pid); return ret; } @@ -1605,7 +1821,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, if (!infop) return err; - if (!user_access_begin(infop, sizeof(*infop))) + if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); @@ -1614,10 +1830,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); - user_access_end(); + user_write_access_end(); return err; Efault: - user_access_end(); + user_write_access_end(); return -EFAULT; } @@ -1664,6 +1880,22 @@ long kernel_wait4(pid_t upid, int __user *stat_addr, int options, return ret; } +int kernel_wait(pid_t pid, int *stat) +{ + struct wait_opts wo = { + .wo_type = PIDTYPE_PID, + .wo_pid = find_get_pid(pid), + .wo_flags = WEXITED, + }; + int ret; + + ret = do_wait(&wo); + if (ret > 0 && wo.wo_stat) + *stat = wo.wo_stat; + put_pid(wo.wo_pid); + return ret; +} + SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, int, options, struct rusage __user *, ru) { @@ -1732,7 +1964,7 @@ COMPAT_SYSCALL_DEFINE5(waitid, if (!infop) return err; - if (!user_access_begin(infop, sizeof(*infop))) + if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); @@ -1741,15 +1973,22 @@ COMPAT_SYSCALL_DEFINE5(waitid, unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); - user_access_end(); + user_write_access_end(); return err; Efault: - user_access_end(); + user_write_access_end(); return -EFAULT; } #endif -__weak void abort(void) +/* + * This needs to be __function_aligned as GCC implicitly makes any + * implementation of abort() cold and drops alignment specified by + * -falign-functions=N. + * + * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11 + */ +__weak __function_aligned void abort(void) { BUG(); |
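
Two of the userspace-visible changes in this diff can be exercised directly. First, the waitid() hunks (kernel_waitid_prepare() accepting P_PIDFD via pidfd_get_pid(), and mapping a pidfd opened with O_NONBLOCK to WNOHANG) let a process reap a child through a pid file descriptor. The program below is a minimal sketch and not part of the patch: it obtains the pidfd with fork() plus pidfd_open() for brevity, where clone3() with CLONE_PIDFD is the more common route, and it assumes a kernel and libc/headers new enough to provide P_PIDFD and SYS_pidfd_open. Error handling is abbreviated.

/* Sketch: reaping a child via waitid(P_PIDFD, ...). Assumes SYS_pidfd_open
 * is available in <sys/syscall.h> and that the running kernel supports the
 * P_PIDFD idtype handled by kernel_waitid_prepare() in this patch. */
#define _GNU_SOURCE
#include <sys/wait.h>
#include <sys/syscall.h>
#include <signal.h>
#include <unistd.h>
#include <stdio.h>

#ifndef P_PIDFD
#define P_PIDFD 3	/* idtype accepted by waitid() since Linux 5.4 */
#endif

int main(void)
{
	pid_t pid = fork();
	if (pid == 0) {
		sleep(1);
		_exit(42);
	}

	int pidfd = syscall(SYS_pidfd_open, pid, 0);
	if (pidfd < 0) {
		perror("pidfd_open");
		return 1;
	}

	siginfo_t info = { 0 };
	/* WEXITED reaps the zombie; the patched kernel resolves the pidfd to
	 * a struct pid with pidfd_get_pid() and waits on PIDTYPE_PID. */
	if (waitid((idtype_t)P_PIDFD, pidfd, &info, WEXITED) < 0) {
		perror("waitid");
		return 1;
	}
	printf("child %d exited with status %d\n", info.si_pid, info.si_status);
	return 0;
}

Second, the new oops accounting surfaces as /proc/sys/kernel/oops_limit (from register_sysctl_init("kernel", kern_exit_table)) and /sys/kernel/oops_count (from sysfs_add_file_to_group(kernel_kobj, ...)). A small illustrative reader, again a sketch rather than anything shipped with the patch:

/* Sketch: reading the oops limit and counter exposed by this patch. */
#include <stdio.h>

static long read_long(const char *path)
{
	FILE *f = fopen(path, "r");
	long v = -1;

	if (f) {
		if (fscanf(f, "%ld", &v) != 1)
			v = -1;
		fclose(f);
	}
	return v;
}

int main(void)
{
	printf("kernel.oops_limit = %ld\n", read_long("/proc/sys/kernel/oops_limit"));
	printf("oops_count        = %ld\n", read_long("/sys/kernel/oops_count"));
	return 0;
}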
