Diffstat (limited to 'kernel/exit.c')
-rw-r--r-- | kernel/exit.c | 312 |
1 files changed, 195 insertions, 117 deletions
diff --git a/kernel/exit.c b/kernel/exit.c
index 41a12630cbbc..bb184a67ac73 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -25,7 +25,6 @@
 #include <linux/acct.h>
 #include <linux/tsacct_kern.h>
 #include <linux/file.h>
-#include <linux/fdtable.h>
 #include <linux/freezer.h>
 #include <linux/binfmts.h>
 #include <linux/nsproxy.h>
@@ -70,6 +69,7 @@
 #include <linux/sysfs.h>
 #include <linux/user_events.h>
 #include <linux/uaccess.h>
+#include <linux/pidfs.h>
 
 #include <uapi/linux/wait.h>
 
@@ -86,7 +86,7 @@ static unsigned int oops_limit = 10000;
 
 #ifdef CONFIG_SYSCTL
-static struct ctl_table kern_exit_table[] = {
+static const struct ctl_table kern_exit_table[] = {
 	{
 		.procname	= "oops_limit",
 		.data		= &oops_limit,
@@ -94,7 +94,6 @@ static struct ctl_table kern_exit_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_douintvec,
 	},
-	{ }
 };
 
 static __init int kernel_exit_sysctls_init(void)
@@ -124,14 +123,27 @@ static __init int kernel_exit_sysfs_init(void)
 late_initcall(kernel_exit_sysfs_init);
 #endif
 
-static void __unhash_process(struct task_struct *p, bool group_dead)
+/*
+ * For things release_task() would like to do *after* tasklist_lock is released.
+ */
+struct release_task_post {
+	struct pid *pids[PIDTYPE_MAX];
+};
+
+static void __unhash_process(struct release_task_post *post, struct task_struct *p,
+			     bool group_dead)
 {
+	struct pid *pid = task_pid(p);
+
 	nr_threads--;
-	detach_pid(p, PIDTYPE_PID);
+
+	detach_pid(post->pids, p, PIDTYPE_PID);
+	wake_up_all(&pid->wait_pidfd);
+
 	if (group_dead) {
-		detach_pid(p, PIDTYPE_TGID);
-		detach_pid(p, PIDTYPE_PGID);
-		detach_pid(p, PIDTYPE_SID);
+		detach_pid(post->pids, p, PIDTYPE_TGID);
+		detach_pid(post->pids, p, PIDTYPE_PGID);
+		detach_pid(post->pids, p, PIDTYPE_SID);
 
 		list_del_rcu(&p->tasks);
 		list_del_init(&p->sibling);
@@ -143,7 +155,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 /*
  * This function expects the tasklist_lock write-locked.
  */
-static void __exit_signal(struct task_struct *tsk)
+static void __exit_signal(struct release_task_post *post, struct task_struct *tsk)
 {
 	struct signal_struct *sig = tsk->signal;
 	bool group_dead = thread_group_leader(tsk);
@@ -176,9 +188,6 @@ static void __exit_signal(struct task_struct *tsk)
 			sig->curr_target = next_thread(tsk);
 	}
 
-	add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
-			      sizeof(unsigned long long));
-
 	/*
 	 * Accumulate here the counters for all threads as they die. We could
 	 * skip the group leader because it is the last user of signal_struct,
@@ -199,23 +208,15 @@ static void __exit_signal(struct task_struct *tsk)
 	task_io_accounting_add(&sig->ioac, &tsk->ioac);
 	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 	sig->nr_threads--;
-	__unhash_process(tsk, group_dead);
+	__unhash_process(post, tsk, group_dead);
 	write_sequnlock(&sig->stats_lock);
 
-	/*
-	 * Do this under ->siglock, we can race with another thread
-	 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
-	 */
-	flush_sigqueue(&tsk->pending);
 	tsk->sighand = NULL;
 	spin_unlock(&sighand->siglock);
 
 	__cleanup_sighand(sighand);
-	clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
-	if (group_dead) {
-		flush_sigqueue(&sig->shared_pending);
+	if (group_dead)
 		tty_kref_put(tty);
-	}
 }
 
 static void delayed_put_task_struct(struct rcu_head *rhp)
@@ -241,22 +242,28 @@ void __weak release_thread(struct task_struct *dead_task)
 
 void release_task(struct task_struct *p)
 {
+	struct release_task_post post;
 	struct task_struct *leader;
 	struct pid *thread_pid;
 	int zap_leader;
 repeat:
+	memset(&post, 0, sizeof(post));
+
 	/* don't need to get the RCU readlock here - the process is dead and
 	 * can't be modifying its own credentials. But shut RCU-lockdep up */
 	rcu_read_lock();
 	dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
 	rcu_read_unlock();
 
+	pidfs_exit(p);
 	cgroup_release(p);
 
+	/* Retrieve @thread_pid before __unhash_process() may set it to NULL. */
+	thread_pid = task_pid(p);
+
 	write_lock_irq(&tasklist_lock);
 	ptrace_release_task(p);
-	thread_pid = get_pid(p->thread_pid);
-	__exit_signal(p);
+	__exit_signal(&post, p);
 
 	/*
 	 * If we are the last non-leader member of the thread
@@ -267,6 +274,9 @@ repeat:
 	leader = p->group_leader;
 	if (leader != p && thread_group_empty(leader) &&
 			leader->exit_state == EXIT_ZOMBIE) {
+		/* for pidfs_exit() and do_notify_parent() */
+		if (leader->signal->flags & SIGNAL_GROUP_EXIT)
+			leader->exit_code = leader->signal->group_exit_code;
 		/*
 		 * If we were the last child thread and the leader has
 		 * exited already, and the leader's parent ignores SIGCHLD,
@@ -278,10 +288,22 @@ repeat:
 	}
 
 	write_unlock_irq(&tasklist_lock);
-	seccomp_filter_release(p);
+	/* @thread_pid can't go away until free_pids() below */
 	proc_flush_pid(thread_pid);
-	put_pid(thread_pid);
+	add_device_randomness(&p->se.sum_exec_runtime,
+			      sizeof(p->se.sum_exec_runtime));
+	free_pids(post.pids);
 	release_thread(p);
+	/*
+	 * This task was already removed from the process/thread/pid lists
+	 * and lock_task_sighand(p) can't succeed. Nobody else can touch
+	 * ->pending or, if group dead, signal->shared_pending. We can call
+	 * flush_sigqueue() lockless.
+	 */
+	flush_sigqueue(&p->pending);
+	if (thread_group_leader(p))
+		flush_sigqueue(&p->signal->shared_pending);
+
 	put_task_struct_rcu_user(p);
 
 	p = leader;
@@ -399,58 +421,73 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
 	}
 }
 
-static void coredump_task_exit(struct task_struct *tsk)
+static void coredump_task_exit(struct task_struct *tsk,
+			       struct core_state *core_state)
 {
-	struct core_state *core_state;
+	struct core_thread self;
 
+	self.task = tsk;
+	if (self.task->flags & PF_SIGNALED)
+		self.next = xchg(&core_state->dumper.next, &self);
+	else
+		self.task = NULL;
 	/*
-	 * Serialize with any possible pending coredump.
-	 * We must hold siglock around checking core_state
-	 * and setting PF_POSTCOREDUMP. The core-inducing thread
-	 * will increment ->nr_threads for each thread in the
-	 * group without PF_POSTCOREDUMP set.
+	 * Implies mb(), the result of xchg() must be visible
+	 * to core_state->dumper.
 	 */
-	spin_lock_irq(&tsk->sighand->siglock);
-	tsk->flags |= PF_POSTCOREDUMP;
-	core_state = tsk->signal->core_state;
-	spin_unlock_irq(&tsk->sighand->siglock);
+	if (atomic_dec_and_test(&core_state->nr_threads))
+		complete(&core_state->startup);
 
-	/* The vhost_worker does not particpate in coredumps */
-	if (core_state &&
-	    ((tsk->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER)) {
-		struct core_thread self;
+	for (;;) {
+		set_current_state(TASK_IDLE|TASK_FREEZABLE);
+		if (!self.task) /* see coredump_finish() */
+			break;
+		schedule();
+	}
+	__set_current_state(TASK_RUNNING);
+}
 
-		self.task = current;
-		if (self.task->flags & PF_SIGNALED)
-			self.next = xchg(&core_state->dumper.next, &self);
-		else
-			self.task = NULL;
-		/*
-		 * Implies mb(), the result of xchg() must be visible
-		 * to core_state->dumper.
-		 */
-		if (atomic_dec_and_test(&core_state->nr_threads))
-			complete(&core_state->startup);
+#ifdef CONFIG_MEMCG
+/* drops tasklist_lock if succeeds */
+static bool __try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm)
+{
+	bool ret = false;
 
-		for (;;) {
-			set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE);
-			if (!self.task) /* see coredump_finish() */
-				break;
-			schedule();
-		}
-		__set_current_state(TASK_RUNNING);
+	task_lock(tsk);
+	if (likely(tsk->mm == mm)) {
+		/* tsk can't pass exit_mm/exec_mmap and exit */
+		read_unlock(&tasklist_lock);
+		WRITE_ONCE(mm->owner, tsk);
+		lru_gen_migrate_mm(mm);
+		ret = true;
+	}
+	task_unlock(tsk);
+	return ret;
+}
+
+static bool try_to_set_owner(struct task_struct *g, struct mm_struct *mm)
+{
+	struct task_struct *t;
+
+	for_each_thread(g, t) {
+		struct mm_struct *t_mm = READ_ONCE(t->mm);
+		if (t_mm == mm) {
+			if (__try_to_set_owner(t, mm))
+				return true;
+		} else if (t_mm)
+			break;
 	}
+
+	return false;
 }
 
-#ifdef CONFIG_MEMCG
 /*
  * A task is exiting. If it owned this mm, find a new owner for the mm.
  */
 void mm_update_next_owner(struct mm_struct *mm)
 {
-	struct task_struct *c, *g, *p = current;
+	struct task_struct *g, *p = current;
 
-retry:
 	/*
 	 * If the exiting or execing task is not the owner, it's
 	 * someone else's problem.
@@ -471,31 +508,27 @@ retry:
 	/*
 	 * Search in the children
 	 */
-	list_for_each_entry(c, &p->children, sibling) {
-		if (c->mm == mm)
-			goto assign_new_owner;
+	list_for_each_entry(g, &p->children, sibling) {
+		if (try_to_set_owner(g, mm))
+			goto ret;
 	}
-
 	/*
 	 * Search in the siblings
 	 */
-	list_for_each_entry(c, &p->real_parent->children, sibling) {
-		if (c->mm == mm)
-			goto assign_new_owner;
+	list_for_each_entry(g, &p->real_parent->children, sibling) {
+		if (try_to_set_owner(g, mm))
+			goto ret;
 	}
-
 	/*
 	 * Search through everything else, we should not get here often.
 	 */
 	for_each_process(g) {
+		if (atomic_read(&mm->mm_users) <= 1)
+			break;
 		if (g->flags & PF_KTHREAD)
 			continue;
-		for_each_thread(g, c) {
-			if (c->mm == mm)
-				goto assign_new_owner;
-			if (c->mm)
-				break;
-		}
+		if (try_to_set_owner(g, mm))
+			goto ret;
 	}
 	read_unlock(&tasklist_lock);
 	/*
@@ -504,30 +537,9 @@ retry:
 	 * ptrace or page migration (get_task_mm()). Mark owner as NULL.
 	 */
 	WRITE_ONCE(mm->owner, NULL);
+ ret:
 	return;
 
-assign_new_owner:
-	BUG_ON(c == p);
-	get_task_struct(c);
-	/*
-	 * The task_lock protects c->mm from changing.
-	 * We always want mm->owner->mm == mm
-	 */
-	task_lock(c);
-	/*
-	 * Delay read_unlock() till we have the task_lock()
-	 * to ensure that c does not slip away underneath us
-	 */
-	read_unlock(&tasklist_lock);
-	if (c->mm != mm) {
-		task_unlock(c);
-		put_task_struct(c);
-		goto retry;
-	}
-	WRITE_ONCE(mm->owner, c);
-	lru_gen_migrate_mm(mm);
-	task_unlock(c);
-	put_task_struct(c);
 }
 #endif /* CONFIG_MEMCG */
 
@@ -739,12 +751,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 		kill_orphaned_pgrp(tsk->group_leader, NULL);
 
 	tsk->exit_state = EXIT_ZOMBIE;
-	/*
-	 * sub-thread or delay_group_leader(), wake up the
-	 * PIDFD_THREAD waiters.
-	 */
-	if (!thread_group_empty(tsk))
-		do_notify_pidfd(tsk);
 
 	if (unlikely(tsk->ptrace)) {
 		int sig = thread_group_leader(tsk) &&
@@ -757,6 +763,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 			do_notify_parent(tsk, tsk->exit_signal);
 	} else {
 		autoreap = true;
+		/* untraced sub-thread */
+		do_notify_pidfd(tsk);
 	}
 
 	if (autoreap) {
@@ -776,6 +784,62 @@
 }
 
 #ifdef CONFIG_DEBUG_STACK_USAGE
+unsigned long stack_not_used(struct task_struct *p)
+{
+	unsigned long *n = end_of_stack(p);
+
+	do {	/* Skip over canary */
+# ifdef CONFIG_STACK_GROWSUP
+		n--;
+# else
+		n++;
+# endif
+	} while (!*n);
+
+# ifdef CONFIG_STACK_GROWSUP
+	return (unsigned long)end_of_stack(p) - (unsigned long)n;
+# else
+	return (unsigned long)n - (unsigned long)end_of_stack(p);
+# endif
+}
+
+/* Count the maximum pages reached in kernel stacks */
+static inline void kstack_histogram(unsigned long used_stack)
+{
+#ifdef CONFIG_VM_EVENT_COUNTERS
+	if (used_stack <= 1024)
+		count_vm_event(KSTACK_1K);
+#if THREAD_SIZE > 1024
+	else if (used_stack <= 2048)
+		count_vm_event(KSTACK_2K);
+#endif
+#if THREAD_SIZE > 2048
+	else if (used_stack <= 4096)
+		count_vm_event(KSTACK_4K);
+#endif
+#if THREAD_SIZE > 4096
+	else if (used_stack <= 8192)
+		count_vm_event(KSTACK_8K);
+#endif
+#if THREAD_SIZE > 8192
+	else if (used_stack <= 16384)
+		count_vm_event(KSTACK_16K);
+#endif
+#if THREAD_SIZE > 16384
+	else if (used_stack <= 32768)
+		count_vm_event(KSTACK_32K);
+#endif
+#if THREAD_SIZE > 32768
+	else if (used_stack <= 65536)
+		count_vm_event(KSTACK_64K);
+#endif
+#if THREAD_SIZE > 65536
+	else
+		count_vm_event(KSTACK_REST);
+#endif
+#endif /* CONFIG_VM_EVENT_COUNTERS */
+}
+
 static void check_stack_usage(void)
 {
 	static DEFINE_SPINLOCK(low_water_lock);
@@ -783,6 +847,7 @@ static void check_stack_usage(void)
 	unsigned long free;
 
 	free = stack_not_used(current);
+	kstack_histogram(THREAD_SIZE - free);
 
 	if (free >= lowest_to_date)
 		return;
@@ -803,6 +868,7 @@ static void synchronize_group_exit(struct task_struct *tsk, long code)
 {
 	struct sighand_struct *sighand = tsk->sighand;
 	struct signal_struct *signal = tsk->signal;
+	struct core_state *core_state;
 
 	spin_lock_irq(&sighand->siglock);
 	signal->quick_threads--;
@@ -812,7 +878,19 @@ static void synchronize_group_exit(struct task_struct *tsk, long code)
 		signal->group_exit_code = code;
 		signal->group_stop_count = 0;
 	}
+	/*
+	 * Serialize with any possible pending coredump.
+	 * We must hold siglock around checking core_state
+	 * and setting PF_POSTCOREDUMP. The core-inducing thread
+	 * will increment ->nr_threads for each thread in the
+	 * group without PF_POSTCOREDUMP set.
+	 */
+	tsk->flags |= PF_POSTCOREDUMP;
+	core_state = signal->core_state;
 	spin_unlock_irq(&sighand->siglock);
+
+	if (unlikely(core_state))
+		coredump_task_exit(tsk, core_state);
 }
 
 void __noreturn do_exit(long code)
@@ -821,21 +899,20 @@
 	int group_dead;
 
 	WARN_ON(irqs_disabled());
-
-	synchronize_group_exit(tsk, code);
-
 	WARN_ON(tsk->plug);
 
 	kcov_task_exit(tsk);
 	kmsan_task_exit(tsk);
 
-	coredump_task_exit(tsk);
+	synchronize_group_exit(tsk, code);
 	ptrace_event(PTRACE_EVENT_EXIT, code);
 	user_events_exit(tsk);
 
 	io_uring_files_cancel();
	exit_signals(tsk);	/* sets PF_EXITING */
 
+	seccomp_filter_release(tsk);
+
 	acct_update_integrals(tsk);
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
@@ -861,12 +938,21 @@
 
 	tsk->exit_code = code;
 	taskstats_exit(tsk, group_dead);
+	trace_sched_process_exit(tsk, group_dead);
+
+	/*
+	 * Since sampling can touch ->mm, make sure to stop everything before we
+	 * tear it down.
+	 *
+	 * Also flushes inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 */
+	perf_event_exit_task(tsk);
 
 	exit_mm();
 
 	if (group_dead)
 		acct_process();
-	trace_sched_process_exit(tsk);
 
 	exit_sem(tsk);
 	exit_shm(tsk);
@@ -878,14 +964,6 @@
 	exit_task_work(tsk);
 	exit_thread(tsk);
 
-	/*
-	 * Flush inherited counters to the parent - before the parent
-	 * gets woken up by child-exit notifications.
-	 *
-	 * because of cgroup mode, must be called before cgroup_exit()
-	 */
-	perf_event_exit_task(tsk);
-
 	sched_autogroup_exit_task(tsk);
 	cgroup_exit(tsk);
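
Context note (not part of the diff above): several hunks touch the pidfd wakeup path — wake_up_all(&pid->wait_pidfd) in __unhash_process() and do_notify_pidfd() in exit_notify(). The following is only a hedged userspace sketch of the behaviour those wakeups serve: a poll() on a pidfd becomes readable once the target task has exited. It assumes Linux 5.3+ for pidfd_open(2) and keeps error handling minimal; it is illustrative, not derived from this patch.

/*
 * Illustrative sketch: wait for a child to exit via a pidfd instead of
 * blocking in waitpid() directly. Assumes SYS_pidfd_open is available
 * (Linux 5.3+, recent kernel headers).
 */
#include <poll.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t child = fork();

	if (child == 0) {		/* child: exit after a short delay */
		sleep(1);
		_exit(0);
	}

	/* No glibc wrapper on older systems, so go through syscall(2). */
	int pidfd = syscall(SYS_pidfd_open, child, 0);
	if (pidfd < 0) {
		perror("pidfd_open");
		return 1;
	}

	struct pollfd pfd = { .fd = pidfd, .events = POLLIN };
	poll(&pfd, 1, -1);		/* readable once the child has exited */
	printf("child %d exited, pidfd became readable\n", child);

	waitpid(child, NULL, 0);	/* reap the zombie */
	close(pidfd);
	return 0;
}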