diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/exit.c | 31 | ||||
-rw-r--r-- | kernel/fork.c | 147 | ||||
-rw-r--r-- | kernel/nsproxy.c | 2 | ||||
-rw-r--r-- | kernel/pid.c | 57 | ||||
-rw-r--r-- | kernel/signal.c | 110 |
5 files changed, 143 insertions, 204 deletions
diff --git a/kernel/exit.c b/kernel/exit.c index dfb963d2f862..41a12630cbbc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -739,6 +739,13 @@ static void exit_notify(struct task_struct *tsk, int group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); tsk->exit_state = EXIT_ZOMBIE; + /* + * sub-thread or delay_group_leader(), wake up the + * PIDFD_THREAD waiters. + */ + if (!thread_group_empty(tsk)) + do_notify_pidfd(tsk); + if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && thread_group_empty(tsk) && @@ -1889,30 +1896,6 @@ Efault: } #endif -/** - * thread_group_exited - check that a thread group has exited - * @pid: tgid of thread group to be checked. - * - * Test if the thread group represented by tgid has exited (all - * threads are zombies, dead or completely gone). - * - * Return: true if the thread group has exited. false otherwise. - */ -bool thread_group_exited(struct pid *pid) -{ - struct task_struct *task; - bool exited; - - rcu_read_lock(); - task = pid_task(pid, PIDTYPE_PID); - exited = !task || - (READ_ONCE(task->exit_state) && thread_group_empty(task)); - rcu_read_unlock(); - - return exited; -} -EXPORT_SYMBOL(thread_group_exited); - /* * This needs to be __function_aligned as GCC implicitly makes any * implementation of abort() cold and drops alignment specified by diff --git a/kernel/fork.c b/kernel/fork.c index 0d944e92a43f..1af8dfd149ee 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -101,6 +101,8 @@ #include <linux/user_events.h> #include <linux/iommu.h> #include <linux/rseq.h> +#include <uapi/linux/pidfd.h> +#include <linux/pidfs.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -1985,119 +1987,6 @@ static inline void rcu_copy_process(struct task_struct *p) #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } -struct pid *pidfd_pid(const struct file *file) -{ - if (file->f_op == &pidfd_fops) - return file->private_data; - - return ERR_PTR(-EBADF); -} - -static int pidfd_release(struct inode *inode, struct file *file) -{ - struct pid *pid = file->private_data; - - file->private_data = NULL; - put_pid(pid); - return 0; -} - -#ifdef CONFIG_PROC_FS -/** - * pidfd_show_fdinfo - print information about a pidfd - * @m: proc fdinfo file - * @f: file referencing a pidfd - * - * Pid: - * This function will print the pid that a given pidfd refers to in the - * pid namespace of the procfs instance. - * If the pid namespace of the process is not a descendant of the pid - * namespace of the procfs instance 0 will be shown as its pid. This is - * similar to calling getppid() on a process whose parent is outside of - * its pid namespace. - * - * NSpid: - * If pid namespaces are supported then this function will also print - * the pid of a given pidfd refers to for all descendant pid namespaces - * starting from the current pid namespace of the instance, i.e. the - * Pid field and the first entry in the NSpid field will be identical. - * If the pid namespace of the process is not a descendant of the pid - * namespace of the procfs instance 0 will be shown as its first NSpid - * entry and no others will be shown. - * Note that this differs from the Pid and NSpid fields in - * /proc/<pid>/status where Pid and NSpid are always shown relative to - * the pid namespace of the procfs instance. The difference becomes - * obvious when sending around a pidfd between pid namespaces from a - * different branch of the tree, i.e. where no ancestral relation is - * present between the pid namespaces: - * - create two new pid namespaces ns1 and ns2 in the initial pid - * namespace (also take care to create new mount namespaces in the - * new pid namespace and mount procfs) - * - create a process with a pidfd in ns1 - * - send pidfd from ns1 to ns2 - * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid - * have exactly one entry, which is 0 - */ -static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) -{ - struct pid *pid = f->private_data; - struct pid_namespace *ns; - pid_t nr = -1; - - if (likely(pid_has_task(pid, PIDTYPE_PID))) { - ns = proc_pid_ns(file_inode(m->file)->i_sb); - nr = pid_nr_ns(pid, ns); - } - - seq_put_decimal_ll(m, "Pid:\t", nr); - -#ifdef CONFIG_PID_NS - seq_put_decimal_ll(m, "\nNSpid:\t", nr); - if (nr > 0) { - int i; - - /* If nr is non-zero it means that 'pid' is valid and that - * ns, i.e. the pid namespace associated with the procfs - * instance, is in the pid namespace hierarchy of pid. - * Start at one below the already printed level. - */ - for (i = ns->level + 1; i <= pid->level; i++) - seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); - } -#endif - seq_putc(m, '\n'); -} -#endif - -/* - * Poll support for process exit notification. - */ -static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) -{ - struct pid *pid = file->private_data; - __poll_t poll_flags = 0; - - poll_wait(file, &pid->wait_pidfd, pts); - - /* - * Inform pollers only when the whole thread group exits. - * If the thread group leader exits before all other threads in the - * group, then poll(2) should block, similar to the wait(2) family. - */ - if (thread_group_exited(pid)) - poll_flags = EPOLLIN | EPOLLRDNORM; - - return poll_flags; -} - -const struct file_operations pidfd_fops = { - .release = pidfd_release, - .poll = pidfd_poll, -#ifdef CONFIG_PROC_FS - .show_fdinfo = pidfd_show_fdinfo, -#endif -}; - /** * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd * @pid: the struct pid for which to create a pidfd @@ -2131,20 +2020,20 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re int pidfd; struct file *pidfd_file; - if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC)) - return -EINVAL; - - pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + pidfd = get_unused_fd_flags(O_CLOEXEC); if (pidfd < 0) return pidfd; - pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, - flags | O_RDWR | O_CLOEXEC); + pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR); if (IS_ERR(pidfd_file)) { put_unused_fd(pidfd); return PTR_ERR(pidfd_file); } - get_pid(pid); /* held by pidfd_file now */ + /* + * anon_inode_getfile() ignores everything outside of the + * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually. + */ + pidfd_file->f_flags |= (flags & PIDFD_THREAD); *ret = pidfd_file; return pidfd; } @@ -2158,7 +2047,8 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re * Allocate a new file that stashes @pid and reserve a new pidfd number in the * caller's file descriptor table. The pidfd is reserved but not installed yet. * - * The helper verifies that @pid is used as a thread group leader. + * The helper verifies that @pid is still in use, without PIDFD_THREAD the + * task identified by @pid must be a thread-group leader. * * If this function returns successfully the caller is responsible to either * call fd_install() passing the returned pidfd and pidfd file as arguments in @@ -2177,7 +2067,9 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re */ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) { - if (!pid || !pid_has_task(pid, PIDTYPE_TGID)) + bool thread = flags & PIDFD_THREAD; + + if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID)) return -EINVAL; return __pidfd_prepare(pid, flags, ret); @@ -2299,9 +2191,8 @@ __latent_entropy struct task_struct *copy_process( /* * - CLONE_DETACHED is blocked so that we can potentially * reuse it later for CLONE_PIDFD. - * - CLONE_THREAD is blocked until someone really needs it. */ - if (clone_flags & (CLONE_DETACHED | CLONE_THREAD)) + if (clone_flags & CLONE_DETACHED) return ERR_PTR(-EINVAL); } @@ -2524,8 +2415,10 @@ __latent_entropy struct task_struct *copy_process( * if the fd table isn't shared). */ if (clone_flags & CLONE_PIDFD) { + int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0; + /* Note that no task has been attached to @pid yet. */ - retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile); + retval = __pidfd_prepare(pid, flags, &pidfile); if (retval < 0) goto bad_fork_free_pid; pidfd = retval; @@ -2876,8 +2769,8 @@ pid_t kernel_clone(struct kernel_clone_args *args) * here has the advantage that we don't need to have a separate helper * to check for legacy clone(). */ - if ((args->flags & CLONE_PIDFD) && - (args->flags & CLONE_PARENT_SETTID) && + if ((clone_flags & CLONE_PIDFD) && + (clone_flags & CLONE_PARENT_SETTID) && (args->pidfd == args->parent_tid)) return -EINVAL; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 15781acaac1c..6ec3deec68c2 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -573,7 +573,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) if (proc_ns_file(f.file)) err = validate_ns(&nsset, ns); else - err = validate_nsset(&nsset, f.file->private_data); + err = validate_nsset(&nsset, pidfd_pid(f.file)); if (!err) { commit_nsset(&nsset); perf_event_namespaces(current); diff --git a/kernel/pid.c b/kernel/pid.c index b52b10865454..99a0c5eb24b8 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -42,6 +42,7 @@ #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/idr.h> +#include <linux/pidfs.h> #include <net/sock.h> #include <uapi/linux/pidfd.h> @@ -65,6 +66,13 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; +#ifdef CONFIG_FS_PID +/* + * Pseudo filesystems start inode numbering after one. We use Reserved + * PIDs as a natural offset. + */ +static u64 pidfs_ino = RESERVED_PIDS; +#endif /* * PID-map pages start out as NULL, they get allocated upon @@ -272,6 +280,10 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; +#ifdef CONFIG_FS_PID + pid->stashed = NULL; + pid->ino = ++pidfs_ino; +#endif for ( ; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. */ idr_replace(&upid->ns->idr, pid, upid->nr); @@ -349,6 +361,11 @@ static void __change_pid(struct task_struct *task, enum pid_type type, hlist_del_rcu(&task->pid_links[type]); *pid_ptr = new; + if (type == PIDTYPE_PID) { + WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID)); + wake_up_all(&pid->wait_pidfd); + } + for (tmp = PIDTYPE_MAX; --tmp >= 0; ) if (pid_has_task(pid, tmp)) return; @@ -391,8 +408,7 @@ void exchange_tids(struct task_struct *left, struct task_struct *right) void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) { - if (type == PIDTYPE_PID) - new->thread_pid = old->thread_pid; + WARN_ON_ONCE(type == PIDTYPE_PID); hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]); } @@ -552,11 +568,6 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) * Return the task associated with @pidfd. The function takes a reference on * the returned task. The caller is responsible for releasing that reference. * - * Currently, the process identified by @pidfd is always a thread-group leader. - * This restriction currently exists for all aspects of pidfds including pidfd - * creation (CLONE_PIDFD cannot be used with CLONE_THREAD) and pidfd polling - * (only supports thread group leaders). - * * Return: On success, the task_struct associated with the pidfd. * On error, a negative errno number will be returned. */ @@ -595,7 +606,7 @@ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags) * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. */ -int pidfd_create(struct pid *pid, unsigned int flags) +static int pidfd_create(struct pid *pid, unsigned int flags) { int pidfd; struct file *pidfd_file; @@ -615,11 +626,8 @@ int pidfd_create(struct pid *pid, unsigned int flags) * @flags: flags to pass * * This creates a new pid file descriptor with the O_CLOEXEC flag set for - * the process identified by @pid. Currently, the process identified by - * @pid must be a thread-group leader. This restriction currently exists - * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot - * be used with CLONE_THREAD) and pidfd polling (only supports thread group - * leaders). + * the task identified by @pid. Without PIDFD_THREAD flag the target task + * must be a thread-group leader. * * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. @@ -629,7 +637,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) int fd; struct pid *p; - if (flags & ~PIDFD_NONBLOCK) + if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD)) return -EINVAL; if (pid <= 0) @@ -682,7 +690,26 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd) up_read(&task->signal->exec_update_lock); - return file ?: ERR_PTR(-EBADF); + if (!file) { + /* + * It is possible that the target thread is exiting; it can be + * either: + * 1. before exit_signals(), which gives a real fd + * 2. before exit_files() takes the task_lock() gives a real fd + * 3. after exit_files() releases task_lock(), ->files is NULL; + * this has PF_EXITING, since it was set in exit_signals(), + * __pidfd_fget() returns EBADF. + * In case 3 we get EBADF, but that really means ESRCH, since + * the task is currently exiting and has freed its files + * struct, so we fix it up. + */ + if (task->flags & PF_EXITING) + file = ERR_PTR(-ESRCH); + else + file = ERR_PTR(-EBADF); + } + + return file; } static int pidfd_getfd(struct pid *pid, int fd) diff --git a/kernel/signal.c b/kernel/signal.c index c9c57d053ce4..bdca529f0f7b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -47,6 +47,7 @@ #include <linux/cgroup.h> #include <linux/audit.h> #include <linux/sysctl.h> +#include <uapi/linux/pidfd.h> #define CREATE_TRACE_POINTS #include <trace/events/signal.h> @@ -1436,7 +1437,8 @@ void lockdep_assert_task_sighand_held(struct task_struct *task) #endif /* - * send signal info to all the members of a group + * send signal info to all the members of a thread group or to the + * individual thread if type == PIDTYPE_PID. */ int group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type) @@ -1478,7 +1480,8 @@ int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp) return ret; } -int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid) +static int kill_pid_info_type(int sig, struct kernel_siginfo *info, + struct pid *pid, enum pid_type type) { int error = -ESRCH; struct task_struct *p; @@ -1487,11 +1490,10 @@ int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid) rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (p) - error = group_send_sig_info(sig, info, p, PIDTYPE_TGID); + error = group_send_sig_info(sig, info, p, type); rcu_read_unlock(); if (likely(!p || error != -ESRCH)) return error; - /* * The task was unhashed in between, try again. If it * is dead, pid_task() will return NULL, if we race with @@ -1500,6 +1502,11 @@ int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid) } } +int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid) +{ + return kill_pid_info_type(sig, info, pid, PIDTYPE_TGID); +} + static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid) { int error; @@ -1898,16 +1905,19 @@ int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno, return send_sig_info(info.si_signo, &info, t); } -int kill_pgrp(struct pid *pid, int sig, int priv) +static int kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp) { int ret; - read_lock(&tasklist_lock); - ret = __kill_pgrp_info(sig, __si_special(priv), pid); + ret = __kill_pgrp_info(sig, info, pgrp); read_unlock(&tasklist_lock); - return ret; } + +int kill_pgrp(struct pid *pid, int sig, int priv) +{ + return kill_pgrp_info(sig, __si_special(priv), pid); +} EXPORT_SYMBOL(kill_pgrp); int kill_pid(struct pid *pid, int sig, int priv) @@ -2019,13 +2029,14 @@ ret: return ret; } -static void do_notify_pidfd(struct task_struct *task) +void do_notify_pidfd(struct task_struct *task) { - struct pid *pid; + struct pid *pid = task_pid(task); WARN_ON(task->exit_state == 0); - pid = task_pid(task); - wake_up_all(&pid->wait_pidfd); + + __wake_up(&pid->wait_pidfd, TASK_NORMAL, 0, + poll_to_key(EPOLLIN | EPOLLRDNORM)); } /* @@ -2050,9 +2061,12 @@ bool do_notify_parent(struct task_struct *tsk, int sig) WARN_ON_ONCE(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); - - /* Wake up all pidfd waiters */ - do_notify_pidfd(tsk); + /* + * tsk is a group leader and has no threads, wake up the + * non-PIDFD_THREAD waiters. + */ + if (thread_group_empty(tsk)) + do_notify_pidfd(tsk); if (sig != SIGCHLD) { /* @@ -3789,12 +3803,13 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese, #endif #endif -static inline void prepare_kill_siginfo(int sig, struct kernel_siginfo *info) +static void prepare_kill_siginfo(int sig, struct kernel_siginfo *info, + enum pid_type type) { clear_siginfo(info); info->si_signo = sig; info->si_errno = 0; - info->si_code = SI_USER; + info->si_code = (type == PIDTYPE_PID) ? SI_TKILL : SI_USER; info->si_pid = task_tgid_vnr(current); info->si_uid = from_kuid_munged(current_user_ns(), current_uid()); } @@ -3808,7 +3823,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct kernel_siginfo info; - prepare_kill_siginfo(sig, &info); + prepare_kill_siginfo(sig, &info, PIDTYPE_TGID); return kill_something_info(sig, &info, pid); } @@ -3861,6 +3876,10 @@ static struct pid *pidfd_to_pid(const struct file *file) return tgid_pidfd_to_pid(file); } +#define PIDFD_SEND_SIGNAL_FLAGS \ + (PIDFD_SIGNAL_THREAD | PIDFD_SIGNAL_THREAD_GROUP | \ + PIDFD_SIGNAL_PROCESS_GROUP) + /** * sys_pidfd_send_signal - Signal a process through a pidfd * @pidfd: file descriptor of the process @@ -3868,14 +3887,10 @@ static struct pid *pidfd_to_pid(const struct file *file) * @info: signal info * @flags: future flags * - * The syscall currently only signals via PIDTYPE_PID which covers - * kill(<positive-pid>, <signal>. It does not signal threads or process - * groups. - * In order to extend the syscall to threads and process groups the @flags - * argument should be used. In essence, the @flags argument will determine - * what is signaled and not the file descriptor itself. Put in other words, - * grouping is a property of the flags argument not a property of the file - * descriptor. + * Send the signal to the thread group or to the individual thread depending + * on PIDFD_THREAD. + * In the future extension to @flags may be used to override the default scope + * of @pidfd. * * Return: 0 on success, negative errno on failure */ @@ -3886,9 +3901,14 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, struct fd f; struct pid *pid; kernel_siginfo_t kinfo; + enum pid_type type; /* Enforce flags be set to 0 until we add an extension. */ - if (flags) + if (flags & ~PIDFD_SEND_SIGNAL_FLAGS) + return -EINVAL; + + /* Ensure that only a single signal scope determining flag is set. */ + if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1) return -EINVAL; f = fdget(pidfd); @@ -3906,6 +3926,25 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, if (!access_pidfd_pidns(pid)) goto err; + switch (flags) { + case 0: + /* Infer scope from the type of pidfd. */ + if (f.file->f_flags & PIDFD_THREAD) + type = PIDTYPE_PID; + else + type = PIDTYPE_TGID; + break; + case PIDFD_SIGNAL_THREAD: + type = PIDTYPE_PID; + break; + case PIDFD_SIGNAL_THREAD_GROUP: + type = PIDTYPE_TGID; + break; + case PIDFD_SIGNAL_PROCESS_GROUP: + type = PIDTYPE_PGID; + break; + } + if (info) { ret = copy_siginfo_from_user_any(&kinfo, info); if (unlikely(ret)) @@ -3917,15 +3956,17 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, /* Only allow sending arbitrary signals to yourself. */ ret = -EPERM; - if ((task_pid(current) != pid) && + if ((task_pid(current) != pid || type > PIDTYPE_TGID) && (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) goto err; } else { - prepare_kill_siginfo(sig, &kinfo); + prepare_kill_siginfo(sig, &kinfo, type); } - ret = kill_pid_info(sig, &kinfo, pid); - + if (type == PIDTYPE_PGID) + ret = kill_pgrp_info(sig, &kinfo, pid); + else + ret = kill_pid_info_type(sig, &kinfo, pid, type); err: fdput(f); return ret; @@ -3965,12 +4006,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig) { struct kernel_siginfo info; - clear_siginfo(&info); - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_TKILL; - info.si_pid = task_tgid_vnr(current); - info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); + prepare_kill_siginfo(sig, &info, PIDTYPE_PID); return do_send_specific(tgid, pid, sig, &info); } |