diff options
Diffstat (limited to 'kernel/exit.c')
-rw-r--r-- | kernel/exit.c | 185 |
1 files changed, 114 insertions, 71 deletions
diff --git a/kernel/exit.c b/kernel/exit.c index dfb963d2f862..3485e5fc499e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -25,7 +25,6 @@ #include <linux/acct.h> #include <linux/tsacct_kern.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/freezer.h> #include <linux/binfmts.h> #include <linux/nsproxy.h> @@ -86,7 +85,7 @@ static unsigned int oops_limit = 10000; #ifdef CONFIG_SYSCTL -static struct ctl_table kern_exit_table[] = { +static const struct ctl_table kern_exit_table[] = { { .procname = "oops_limit", .data = &oops_limit, @@ -94,7 +93,6 @@ static struct ctl_table kern_exit_table[] = { .mode = 0644, .proc_handler = proc_douintvec, }, - { } }; static __init int kernel_exit_sysctls_init(void) @@ -278,7 +276,6 @@ repeat: } write_unlock_irq(&tasklist_lock); - seccomp_filter_release(p); proc_flush_pid(thread_pid); put_pid(thread_pid); release_thread(p); @@ -414,10 +411,7 @@ static void coredump_task_exit(struct task_struct *tsk) tsk->flags |= PF_POSTCOREDUMP; core_state = tsk->signal->core_state; spin_unlock_irq(&tsk->sighand->siglock); - - /* The vhost_worker does not particpate in coredumps */ - if (core_state && - ((tsk->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER)) { + if (core_state) { struct core_thread self; self.task = current; @@ -433,7 +427,7 @@ static void coredump_task_exit(struct task_struct *tsk) complete(&core_state->startup); for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE); + set_current_state(TASK_IDLE|TASK_FREEZABLE); if (!self.task) /* see coredump_finish() */ break; schedule(); @@ -443,14 +437,46 @@ static void coredump_task_exit(struct task_struct *tsk) } #ifdef CONFIG_MEMCG +/* drops tasklist_lock if succeeds */ +static bool __try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm) +{ + bool ret = false; + + task_lock(tsk); + if (likely(tsk->mm == mm)) { + /* tsk can't pass exit_mm/exec_mmap and exit */ + read_unlock(&tasklist_lock); + WRITE_ONCE(mm->owner, tsk); + lru_gen_migrate_mm(mm); + ret = true; + } + task_unlock(tsk); + return ret; +} + +static bool try_to_set_owner(struct task_struct *g, struct mm_struct *mm) +{ + struct task_struct *t; + + for_each_thread(g, t) { + struct mm_struct *t_mm = READ_ONCE(t->mm); + if (t_mm == mm) { + if (__try_to_set_owner(t, mm)) + return true; + } else if (t_mm) + break; + } + + return false; +} + /* * A task is exiting. If it owned this mm, find a new owner for the mm. */ void mm_update_next_owner(struct mm_struct *mm) { - struct task_struct *c, *g, *p = current; + struct task_struct *g, *p = current; -retry: /* * If the exiting or execing task is not the owner, it's * someone else's problem. @@ -471,31 +497,27 @@ retry: /* * Search in the children */ - list_for_each_entry(c, &p->children, sibling) { - if (c->mm == mm) - goto assign_new_owner; + list_for_each_entry(g, &p->children, sibling) { + if (try_to_set_owner(g, mm)) + goto ret; } - /* * Search in the siblings */ - list_for_each_entry(c, &p->real_parent->children, sibling) { - if (c->mm == mm) - goto assign_new_owner; + list_for_each_entry(g, &p->real_parent->children, sibling) { + if (try_to_set_owner(g, mm)) + goto ret; } - /* * Search through everything else, we should not get here often. */ for_each_process(g) { + if (atomic_read(&mm->mm_users) <= 1) + break; if (g->flags & PF_KTHREAD) continue; - for_each_thread(g, c) { - if (c->mm == mm) - goto assign_new_owner; - if (c->mm) - break; - } + if (try_to_set_owner(g, mm)) + goto ret; } read_unlock(&tasklist_lock); /* @@ -504,30 +526,9 @@ retry: * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ WRITE_ONCE(mm->owner, NULL); + ret: return; -assign_new_owner: - BUG_ON(c == p); - get_task_struct(c); - /* - * The task_lock protects c->mm from changing. - * We always want mm->owner->mm == mm - */ - task_lock(c); - /* - * Delay read_unlock() till we have the task_lock() - * to ensure that c does not slip away underneath us - */ - read_unlock(&tasklist_lock); - if (c->mm != mm) { - task_unlock(c); - put_task_struct(c); - goto retry; - } - WRITE_ONCE(mm->owner, c); - lru_gen_migrate_mm(mm); - task_unlock(c); - put_task_struct(c); } #endif /* CONFIG_MEMCG */ @@ -739,6 +740,13 @@ static void exit_notify(struct task_struct *tsk, int group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); tsk->exit_state = EXIT_ZOMBIE; + /* + * sub-thread or delay_group_leader(), wake up the + * PIDFD_THREAD waiters. + */ + if (!thread_group_empty(tsk)) + do_notify_pidfd(tsk); + if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && thread_group_empty(tsk) && @@ -769,6 +777,62 @@ static void exit_notify(struct task_struct *tsk, int group_dead) } #ifdef CONFIG_DEBUG_STACK_USAGE +unsigned long stack_not_used(struct task_struct *p) +{ + unsigned long *n = end_of_stack(p); + + do { /* Skip over canary */ +# ifdef CONFIG_STACK_GROWSUP + n--; +# else + n++; +# endif + } while (!*n); + +# ifdef CONFIG_STACK_GROWSUP + return (unsigned long)end_of_stack(p) - (unsigned long)n; +# else + return (unsigned long)n - (unsigned long)end_of_stack(p); +# endif +} + +/* Count the maximum pages reached in kernel stacks */ +static inline void kstack_histogram(unsigned long used_stack) +{ +#ifdef CONFIG_VM_EVENT_COUNTERS + if (used_stack <= 1024) + count_vm_event(KSTACK_1K); +#if THREAD_SIZE > 1024 + else if (used_stack <= 2048) + count_vm_event(KSTACK_2K); +#endif +#if THREAD_SIZE > 2048 + else if (used_stack <= 4096) + count_vm_event(KSTACK_4K); +#endif +#if THREAD_SIZE > 4096 + else if (used_stack <= 8192) + count_vm_event(KSTACK_8K); +#endif +#if THREAD_SIZE > 8192 + else if (used_stack <= 16384) + count_vm_event(KSTACK_16K); +#endif +#if THREAD_SIZE > 16384 + else if (used_stack <= 32768) + count_vm_event(KSTACK_32K); +#endif +#if THREAD_SIZE > 32768 + else if (used_stack <= 65536) + count_vm_event(KSTACK_64K); +#endif +#if THREAD_SIZE > 65536 + else + count_vm_event(KSTACK_REST); +#endif +#endif /* CONFIG_VM_EVENT_COUNTERS */ +} + static void check_stack_usage(void) { static DEFINE_SPINLOCK(low_water_lock); @@ -776,6 +840,7 @@ static void check_stack_usage(void) unsigned long free; free = stack_not_used(current); + kstack_histogram(THREAD_SIZE - free); if (free >= lowest_to_date) return; @@ -829,6 +894,8 @@ void __noreturn do_exit(long code) io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ + seccomp_filter_release(tsk); + acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { @@ -1889,30 +1956,6 @@ Efault: } #endif -/** - * thread_group_exited - check that a thread group has exited - * @pid: tgid of thread group to be checked. - * - * Test if the thread group represented by tgid has exited (all - * threads are zombies, dead or completely gone). - * - * Return: true if the thread group has exited. false otherwise. - */ -bool thread_group_exited(struct pid *pid) -{ - struct task_struct *task; - bool exited; - - rcu_read_lock(); - task = pid_task(pid, PIDTYPE_PID); - exited = !task || - (READ_ONCE(task->exit_state) && thread_group_empty(task)); - rcu_read_unlock(); - - return exited; -} -EXPORT_SYMBOL(thread_group_exited); - /* * This needs to be __function_aligned as GCC implicitly makes any * implementation of abort() cold and drops alignment specified by |