From 15d42eb26bdee47c0278fbdab4198577bc6a97b5 Mon Sep 17 00:00:00 2001 From: Christian Kellner Date: Mon, 14 Oct 2019 18:20:32 +0200 Subject: pidfd: add NSpid entries to fdinfo Currently, the fdinfo file contains the Pid field which shows the pid a given pidfd refers to in the pid namespace of the procfs instance. If pid namespaces are configured, also show an NSpid field for easy retrieval of the pid in all descendant pid namespaces. If the pid namespace of the process is not a descendant of the pid namespace of the procfs instance 0 will be shown as its first NSpid entry and no other entries will be shown. Add a block comment to pidfd_show_fdinfo with a detailed explanation of Pid and NSpid fields. Co-developed-by: Christian Brauner Signed-off-by: Christian Brauner Signed-off-by: Christian Kellner Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20191014162034.2185-1-ckellner@redhat.com Signed-off-by: Christian Brauner --- kernel/fork.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index bcdf53125210..782986962d47 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1695,12 +1695,63 @@ static int pidfd_release(struct inode *inode, struct file *file) } #ifdef CONFIG_PROC_FS +/** + * pidfd_show_fdinfo - print information about a pidfd + * @m: proc fdinfo file + * @f: file referencing a pidfd + * + * Pid: + * This function will print the pid that a given pidfd refers to in the + * pid namespace of the procfs instance. + * If the pid namespace of the process is not a descendant of the pid + * namespace of the procfs instance 0 will be shown as its pid. This is + * similar to calling getppid() on a process whose parent is outside of + * its pid namespace. + * + * NSpid: + * If pid namespaces are supported then this function will also print + * the pid of a given pidfd refers to for all descendant pid namespaces + * starting from the current pid namespace of the instance, i.e. the + * Pid field and the first entry in the NSpid field will be identical. + * If the pid namespace of the process is not a descendant of the pid + * namespace of the procfs instance 0 will be shown as its first NSpid + * entry and no others will be shown. + * Note that this differs from the Pid and NSpid fields in + * /proc//status where Pid and NSpid are always shown relative to + * the pid namespace of the procfs instance. The difference becomes + * obvious when sending around a pidfd between pid namespaces from a + * different branch of the tree, i.e. where no ancestoral relation is + * present between the pid namespaces: + * - create two new pid namespaces ns1 and ns2 in the initial pid + * namespace (also take care to create new mount namespaces in the + * new pid namespace and mount procfs) + * - create a process with a pidfd in ns1 + * - send pidfd from ns1 to ns2 + * - read /proc/self/fdinfo/ and observe that both Pid and NSpid + * have exactly one entry, which is 0 + */ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) { struct pid_namespace *ns = proc_pid_ns(file_inode(m->file)); struct pid *pid = f->private_data; + pid_t nr = pid_nr_ns(pid, ns); + + seq_put_decimal_ull(m, "Pid:\t", nr); - seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns)); +#ifdef CONFIG_PID_NS + seq_put_decimal_ull(m, "\nNSpid:\t", nr); + if (nr) { + int i; + + /* If nr is non-zero it means that 'pid' is valid and that + * ns, i.e. the pid namespace associated with the procfs + * instance, is in the pid namespace hierarchy of pid. + * Start at one below the already printed level. + */ + for (i = ns->level + 1; i <= pid->level; i++) + seq_put_decimal_ull(m, "\t", pid->numbers[i].nr); + } +#endif seq_putc(m, '\n'); } #endif -- cgit From 3d6d8da48d0b214d65ea0227d47228abc75d7c88 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 17 Oct 2019 12:18:28 +0200 Subject: pidfd: check pid has attached task in fdinfo Currently, when a task is dead we still print the pid it used to use in the fdinfo files of its pidfds. This doesn't make much sense since the pid may have already been reused. So verify that the task is still alive by introducing the pid_has_task() helper which will be used by other callers in follow-up patches. If the task is not alive anymore, we will print -1. This allows us to differentiate between a task not being present in a given pid namespace - in which case we already print 0 - and a task having been reaped. Note that this uses PIDTYPE_PID for the check. Technically, we could've checked PIDTYPE_TGID since pidfds currently only refer to thread-group leaders but if they won't anymore in the future then this check becomes problematic without it being immediately obvious to non-experts imho. If a thread is created via clone(CLONE_THREAD) than struct pid has a single non-empty list pid->tasks[PIDTYPE_PID] and this pid can't be used as a PIDTYPE_TGID meaning pid->tasks[PIDTYPE_TGID] will return NULL even though the thread-group leader might still be very much alive. So checking PIDTYPE_PID is fine and is easier to maintain should we ever allow pidfds to refer to threads. Cc: Jann Horn Cc: Christian Kellner Cc: linux-api@vger.kernel.org Signed-off-by: Christian Brauner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20191017101832.5985-1-christian.brauner@ubuntu.com --- kernel/fork.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 782986962d47..ffa314838b43 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1732,15 +1732,20 @@ static int pidfd_release(struct inode *inode, struct file *file) */ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) { - struct pid_namespace *ns = proc_pid_ns(file_inode(m->file)); struct pid *pid = f->private_data; - pid_t nr = pid_nr_ns(pid, ns); + struct pid_namespace *ns; + pid_t nr = -1; - seq_put_decimal_ull(m, "Pid:\t", nr); + if (likely(pid_has_task(pid, PIDTYPE_PID))) { + ns = proc_pid_ns(file_inode(m->file)); + nr = pid_nr_ns(pid, ns); + } + + seq_put_decimal_ll(m, "Pid:\t", nr); #ifdef CONFIG_PID_NS - seq_put_decimal_ull(m, "\nNSpid:\t", nr); - if (nr) { + seq_put_decimal_ll(m, "\nNSpid:\t", nr); + if (nr > 0) { int i; /* If nr is non-zero it means that 'pid' is valid and that @@ -1749,7 +1754,7 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) * Start at one below the already printed level. */ for (i = ns->level + 1; i <= pid->level; i++) - seq_put_decimal_ull(m, "\t", pid->numbers[i].nr); + seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); } #endif seq_putc(m, '\n'); -- cgit From b612e5df4587c934bd056bf05f4a1deca4de4f75 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 14 Oct 2019 12:45:37 +0200 Subject: clone3: add CLONE_CLEAR_SIGHAND Reset all signal handlers of the child not set to SIG_IGN to SIG_DFL. Mutually exclusive with CLONE_SIGHAND to not disturb other thread's signal handler. In the spirit of closer cooperation between glibc developers and kernel developers (cf. [2]) this patchset came out of a discussion on the glibc mailing list for improving posix_spawn() (cf. [1], [3], [4]). Kernel support for this feature has been explicitly requested by glibc and I see no reason not to help them with this. The child helper process on Linux posix_spawn must ensure that no signal handlers are enabled, so the signal disposition must be either SIG_DFL or SIG_IGN. However, it requires a sigprocmask to obtain the current signal mask and at least _NSIG sigaction calls to reset the signal handlers for each posix_spawn call or complex state tracking that might lead to data corruption in glibc. Adding this flags lets glibc avoid these problems. [1]: https://www.sourceware.org/ml/libc-alpha/2019-10/msg00149.html [3]: https://www.sourceware.org/ml/libc-alpha/2019-10/msg00158.html [4]: https://www.sourceware.org/ml/libc-alpha/2019-10/msg00160.html [2]: https://lwn.net/Articles/799331/ '[...] by asking for better cooperation with the C-library projects in general. They should be copied on patches containing ABI changes, for example. I noted that there are often times where C-library developers wish the kernel community had done things differently; how could those be avoided in the future? Members of the audience suggested that more glibc developers should perhaps join the linux-api list. The other suggestion was to "copy Florian on everything".' Cc: Florian Weimer Cc: libc-alpha@sourceware.org Cc: linux-api@vger.kernel.org Signed-off-by: Christian Brauner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20191014104538.3096-1-christian.brauner@ubuntu.com --- kernel/fork.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index ffa314838b43..954e875e72b1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1517,6 +1517,11 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) spin_lock_irq(¤t->sighand->siglock); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); spin_unlock_irq(¤t->sighand->siglock); + + /* Reset all signal handler not set to SIG_IGN to SIG_DFL. */ + if (clone_flags & CLONE_CLEAR_SIGHAND) + flush_signal_handlers(tsk, 0); + return 0; } @@ -2619,11 +2624,8 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, static bool clone3_args_valid(const struct kernel_clone_args *kargs) { - /* - * All lower bits of the flag word are taken. - * Verify that no other unknown flags are passed along. - */ - if (kargs->flags & ~CLONE_LEGACY_FLAGS) + /* Verify that no unknown flags are passed along. */ + if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND)) return false; /* @@ -2633,6 +2635,10 @@ static bool clone3_args_valid(const struct kernel_clone_args *kargs) if (kargs->flags & (CLONE_DETACHED | CSIGNAL)) return false; + if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) == + (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) + return false; + if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) && kargs->exit_signal) return false; -- cgit From fa729c4df558936b4a1a7b3e2234011f44ede28b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 31 Oct 2019 12:36:08 +0100 Subject: clone3: validate stack arguments Validate the stack arguments and setup the stack depening on whether or not it is growing down or up. Legacy clone() required userspace to know in which direction the stack is growing and pass down the stack pointer appropriately. To make things more confusing microblaze uses a variant of the clone() syscall selected by CONFIG_CLONE_BACKWARDS3 that takes an additional stack_size argument. IA64 has a separate clone2() syscall which also takes an additional stack_size argument. Finally, parisc has a stack that is growing upwards. Userspace therefore has a lot nasty code like the following: #define __STACK_SIZE (8 * 1024 * 1024) pid_t sys_clone(int (*fn)(void *), void *arg, int flags, int *pidfd) { pid_t ret; void *stack; stack = malloc(__STACK_SIZE); if (!stack) return -ENOMEM; #ifdef __ia64__ ret = __clone2(fn, stack, __STACK_SIZE, flags | SIGCHLD, arg, pidfd); #elif defined(__parisc__) /* stack grows up */ ret = clone(fn, stack, flags | SIGCHLD, arg, pidfd); #else ret = clone(fn, stack + __STACK_SIZE, flags | SIGCHLD, arg, pidfd); #endif return ret; } or even crazier variants such as [3]. With clone3() we have the ability to validate the stack. We can check that when stack_size is passed, the stack pointer is valid and the other way around. We can also check that the memory area userspace gave us is fine to use via access_ok(). Furthermore, we probably should not require userspace to know in which direction the stack is growing. It is easy for us to do this in the kernel and I couldn't find the original reasoning behind exposing this detail to userspace. /* Intentional user visible API change */ clone3() was released with 5.3. Currently, it is not documented and very unclear to userspace how the stack and stack_size argument have to be passed. After talking to glibc folks we concluded that trying to change clone3() to setup the stack instead of requiring userspace to do this is the right course of action. Note, that this is an explicit change in user visible behavior we introduce with this patch. If it breaks someone's use-case we will revert! (And then e.g. place the new behavior under an appropriate flag.) Breaking someone's use-case is very unlikely though. First, neither glibc nor musl currently expose a wrapper for clone3(). Second, there is no real motivation for anyone to use clone3() directly since it does not provide features that legacy clone doesn't. New features for clone3() will first happen in v5.5 which is why v5.4 is still a good time to try and make that change now and backport it to v5.3. Searches on [4] did not reveal any packages calling clone3(). [1]: https://lore.kernel.org/r/CAG48ez3q=BeNcuVTKBN79kJui4vC6nw0Bfq6xc-i0neheT17TA@mail.gmail.com [2]: https://lore.kernel.org/r/20191028172143.4vnnjpdljfnexaq5@wittgenstein [3]: https://github.com/systemd/systemd/blob/5238e9575906297608ff802a27e2ff9effa3b338/src/basic/raw-clone.h#L31 [4]: https://codesearch.debian.net Fixes: 7f192e3cd316 ("fork: add clone3") Cc: Kees Cook Cc: Jann Horn Cc: David Howells Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Linus Torvalds Cc: Florian Weimer Cc: Peter Zijlstra Cc: linux-api@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: # 5.3 Cc: GNU C Library Signed-off-by: Christian Brauner Acked-by: Arnd Bergmann Acked-by: Aleksa Sarai Link: https://lore.kernel.org/r/20191031113608.20713-1-christian.brauner@ubuntu.com --- kernel/fork.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index bcdf53125210..55af6931c6ec 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2561,7 +2561,35 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, return 0; } -static bool clone3_args_valid(const struct kernel_clone_args *kargs) +/** + * clone3_stack_valid - check and prepare stack + * @kargs: kernel clone args + * + * Verify that the stack arguments userspace gave us are sane. + * In addition, set the stack direction for userspace since it's easy for us to + * determine. + */ +static inline bool clone3_stack_valid(struct kernel_clone_args *kargs) +{ + if (kargs->stack == 0) { + if (kargs->stack_size > 0) + return false; + } else { + if (kargs->stack_size == 0) + return false; + + if (!access_ok((void __user *)kargs->stack, kargs->stack_size)) + return false; + +#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64) + kargs->stack += kargs->stack_size; +#endif + } + + return true; +} + +static bool clone3_args_valid(struct kernel_clone_args *kargs) { /* * All lower bits of the flag word are taken. @@ -2581,6 +2609,9 @@ static bool clone3_args_valid(const struct kernel_clone_args *kargs) kargs->exit_signal) return false; + if (!clone3_stack_valid(kargs)) + return false; + return true; } -- cgit From cf25e24db61cc9df42c47485a2ec2bff4e9a3692 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Nov 2019 11:07:58 +0100 Subject: time: Rename tsk->real_start_time to ->start_boottime Since it stores CLOCK_BOOTTIME, not, as the name suggests, CLOCK_REALTIME, let's rename ->real_start_time to ->start_bootime. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index bcdf53125210..1392ee8f4848 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2130,7 +2130,7 @@ static __latent_entropy struct task_struct *copy_process( */ p->start_time = ktime_get_ns(); - p->real_start_time = ktime_get_boottime_ns(); + p->start_boottime = ktime_get_boottime_ns(); /* * Make it visible to the rest of the system, but dont wake it up yet. -- cgit From 49cb2fc42ce4b7a656ee605e30c302efaa39c1a7 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 15 Nov 2019 13:36:20 +0100 Subject: fork: extend clone3() to support setting a PID The main motivation to add set_tid to clone3() is CRIU. To restore a process with the same PID/TID CRIU currently uses /proc/sys/kernel/ns_last_pid. It writes the desired (PID - 1) to ns_last_pid and then (quickly) does a clone(). This works most of the time, but it is racy. It is also slow as it requires multiple syscalls. Extending clone3() to support *set_tid makes it possible restore a process using CRIU without accessing /proc/sys/kernel/ns_last_pid and race free (as long as the desired PID/TID is available). This clone3() extension places the same restrictions (CAP_SYS_ADMIN) on clone3() with *set_tid as they are currently in place for ns_last_pid. The original version of this change was using a single value for set_tid. At the 2019 LPC, after presenting set_tid, it was, however, decided to change set_tid to an array to enable setting the PID of a process in multiple PID namespaces at the same time. If a process is created in a PID namespace it is possible to influence the PID inside and outside of the PID namespace. Details also in the corresponding selftest. To create a process with the following PIDs: PID NS level Requested PID 0 (host) 31496 1 42 2 1 For that example the two newly introduced parameters to struct clone_args (set_tid and set_tid_size) would need to be: set_tid[0] = 1; set_tid[1] = 42; set_tid[2] = 31496; set_tid_size = 3; If only the PIDs of the two innermost nested PID namespaces should be defined it would look like this: set_tid[0] = 1; set_tid[1] = 42; set_tid_size = 2; The PID of the newly created process would then be the next available free PID in the PID namespace level 0 (host) and 42 in the PID namespace at level 1 and the PID of the process in the innermost PID namespace would be 1. The set_tid array is used to specify the PID of a process starting from the innermost nested PID namespaces up to set_tid_size PID namespaces. set_tid_size cannot be larger then the current PID namespace level. Signed-off-by: Adrian Reber Reviewed-by: Christian Brauner Reviewed-by: Oleg Nesterov Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com> Acked-by: Andrei Vagin Link: https://lore.kernel.org/r/20191115123621.142252-1-areber@redhat.com Signed-off-by: Christian Brauner --- kernel/fork.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 954e875e72b1..417570263f1f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2087,7 +2087,8 @@ static __latent_entropy struct task_struct *copy_process( stackleak_task_init(p); if (pid != &init_struct_pid) { - pid = alloc_pid(p->nsproxy->pid_ns_for_children); + pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid, + args->set_tid_size); if (IS_ERR(pid)) { retval = PTR_ERR(pid); goto bad_fork_cleanup_thread; @@ -2590,6 +2591,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, { int err; struct clone_args args; + pid_t *kset_tid = kargs->set_tid; if (unlikely(usize > PAGE_SIZE)) return -E2BIG; @@ -2600,6 +2602,15 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, if (err) return err; + if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL)) + return -EINVAL; + + if (unlikely(!args.set_tid && args.set_tid_size > 0)) + return -EINVAL; + + if (unlikely(args.set_tid && args.set_tid_size == 0)) + return -EINVAL; + /* * Verify that higher 32bits of exit_signal are unset and that * it is a valid signal @@ -2617,8 +2628,16 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, .stack = args.stack, .stack_size = args.stack_size, .tls = args.tls, + .set_tid_size = args.set_tid_size, }; + if (args.set_tid && + copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid), + (kargs->set_tid_size * sizeof(pid_t)))) + return -EFAULT; + + kargs->set_tid = kset_tid; + return 0; } @@ -2662,6 +2681,9 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) int err; struct kernel_clone_args kargs; + pid_t set_tid[MAX_PID_NS_LEVEL]; + + kargs.set_tid = set_tid; err = copy_clone_args_from_user(&kargs, uargs, size); if (err) -- cgit From ba31c1a48538992316cc71ce94fa9cd3e7b427c0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Nov 2019 22:55:36 +0100 Subject: futex: Move futex exit handling into futex code The futex exit handling is #ifdeffed into mm_release() which is not pretty to begin with. But upcoming changes to address futex exit races need to add more functionality to this exit code. Split it out into a function, move it into futex code and make the various futex exit functions static. Preparatory only and no functional change. Folded build fix from Borislav. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191106224556.049705556@linutronix.de --- kernel/fork.c | 25 +++---------------------- 1 file changed, 3 insertions(+), 22 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index bcdf53125210..bd7c218691d4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1286,20 +1286,7 @@ static int wait_for_vfork_done(struct task_struct *child, void mm_release(struct task_struct *tsk, struct mm_struct *mm) { /* Get rid of any futexes when releasing the mm */ -#ifdef CONFIG_FUTEX - if (unlikely(tsk->robust_list)) { - exit_robust_list(tsk); - tsk->robust_list = NULL; - } -#ifdef CONFIG_COMPAT - if (unlikely(tsk->compat_robust_list)) { - compat_exit_robust_list(tsk); - tsk->compat_robust_list = NULL; - } -#endif - if (unlikely(!list_empty(&tsk->pi_state_list))) - exit_pi_state_list(tsk); -#endif + futex_mm_release(tsk); uprobe_free_utask(tsk); @@ -2062,14 +2049,8 @@ static __latent_entropy struct task_struct *copy_process( #ifdef CONFIG_BLOCK p->plug = NULL; #endif -#ifdef CONFIG_FUTEX - p->robust_list = NULL; -#ifdef CONFIG_COMPAT - p->compat_robust_list = NULL; -#endif - INIT_LIST_HEAD(&p->pi_state_list); - p->pi_state_cache = NULL; -#endif + futex_init_task(p); + /* * sigaltstack should be cleared when sharing the same VM */ -- cgit From 4610ba7ad877fafc0a25a30c6c82015304120426 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Nov 2019 22:55:38 +0100 Subject: exit/exec: Seperate mm_release() mm_release() contains the futex exit handling. mm_release() is called from do_exit()->exit_mm() and from exec()->exec_mm(). In the exit_mm() case PF_EXITING and the futex state is updated. In the exec_mm() case these states are not touched. As the futex exit code needs further protections against exit races, this needs to be split into two functions. Preparatory only, no functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191106224556.240518241@linutronix.de --- kernel/fork.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index bd7c218691d4..096f9d840bb8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1283,7 +1283,7 @@ static int wait_for_vfork_done(struct task_struct *child, * restoring the old one. . . * Eric Biederman 10 January 1998 */ -void mm_release(struct task_struct *tsk, struct mm_struct *mm) +static void mm_release(struct task_struct *tsk, struct mm_struct *mm) { /* Get rid of any futexes when releasing the mm */ futex_mm_release(tsk); @@ -1320,6 +1320,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) complete_vfork_done(tsk); } +void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm) +{ + mm_release(tsk, mm); +} + +void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm) +{ + mm_release(tsk, mm); +} + /** * dup_mm() - duplicates an existing mm structure * @tsk: the task_struct with which the new mm will be associated. -- cgit From 150d71584b12809144b8145b817e83b81158ae5f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Nov 2019 22:55:39 +0100 Subject: futex: Split futex_mm_release() for exit/exec To allow separate handling of the futex exit state in the futex exit code for exit and exec, split futex_mm_release() into two functions and invoke them from the corresponding exit/exec_mm_release() callsites. Preparatory only, no functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191106224556.332094221@linutronix.de --- kernel/fork.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 096f9d840bb8..f1eb4d1f1a3b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1285,9 +1285,6 @@ static int wait_for_vfork_done(struct task_struct *child, */ static void mm_release(struct task_struct *tsk, struct mm_struct *mm) { - /* Get rid of any futexes when releasing the mm */ - futex_mm_release(tsk); - uprobe_free_utask(tsk); /* Get rid of any cached register state */ @@ -1322,11 +1319,13 @@ static void mm_release(struct task_struct *tsk, struct mm_struct *mm) void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm) { + futex_exit_release(tsk); mm_release(tsk, mm); } void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm) { + futex_exec_release(tsk); mm_release(tsk, mm); } -- cgit From 9e77716a75bc6cf54965e5ec069ba7c02b32251c Mon Sep 17 00:00:00 2001 From: Luc Van Oostenryck Date: Wed, 20 Nov 2019 01:33:20 +0100 Subject: fork: fix pidfd_poll()'s return type pidfd_poll() is defined as returning 'unsigned int' but the .poll method is declared as returning '__poll_t', a bitwise type. Fix this by using the proper return type and using the EPOLL constants instead of the POLL ones, as required for __poll_t. Fixes: b53b0b9d9a61 ("pidfd: add polling support") Cc: Joel Fernandes (Google) Cc: stable@vger.kernel.org # 5.3 Signed-off-by: Luc Van Oostenryck Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20191120003320.31138-1-luc.vanoostenryck@gmail.com Signed-off-by: Christian Brauner --- kernel/fork.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 55af6931c6ec..13b38794efb5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1708,11 +1708,11 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) /* * Poll support for process exit notification. */ -static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts) +static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) { struct task_struct *task; struct pid *pid = file->private_data; - int poll_flags = 0; + __poll_t poll_flags = 0; poll_wait(file, &pid->wait_pidfd, pts); @@ -1724,7 +1724,7 @@ static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts) * group, then poll(2) should block, similar to the wait(2) family. */ if (!task || (task->exit_state && thread_group_empty(task))) - poll_flags = POLLIN | POLLRDNORM; + poll_flags = EPOLLIN | EPOLLRDNORM; rcu_read_unlock(); return poll_flags; -- cgit From 107e899874e95dcddc779142942bf285eba38bc5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 Nov 2019 16:22:21 -0400 Subject: mm/hmm: define the pre-processor related parts of hmm.h even if disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only the function calls are stubbed out with static inlines that always fail. This is the standard way to write a header for an optional component and makes it easier for drivers that only optionally need HMM_MIRROR. Link: https://lore.kernel.org/r/20191112202231.3856-5-jgg@ziepe.ca Reviewed-by: Jérôme Glisse Tested-by: Ralph Campbell Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe --- kernel/fork.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index bcdf53125210..ca39cfc404e3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include -- cgit From eafb149ed73a8bb8359c0ce027b98acd4e95b070 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Sat, 30 Nov 2019 17:54:57 -0800 Subject: fork: support VMAP_STACK with KASAN_VMALLOC Supporting VMAP_STACK with KASAN_VMALLOC is straightforward: - clear the shadow region of vmapped stacks when swapping them in - tweak Kconfig to allow VMAP_STACK to be turned on with KASAN Link: http://lkml.kernel.org/r/20191031093909.9228-4-dja@axtens.net Signed-off-by: Daniel Axtens Reviewed-by: Dmitry Vyukov Reviewed-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Christophe Leroy Cc: Mark Rutland Cc: Vasily Gorbik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 0f0bac8318dd..21c6c1e29b98 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -93,6 +93,7 @@ #include #include #include +#include #include #include @@ -223,6 +224,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) if (!s) continue; + /* Clear the KASAN shadow of the stack. */ + kasan_unpoison_shadow(s->addr, THREAD_SIZE); + /* Clear stale pointers from reused stack. */ memset(s->addr, 0, THREAD_SIZE); -- cgit