From cdefbf2324ceda662e2667aa2f44e8b9de3d780f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 25 Jan 2024 17:17:34 +0100 Subject: pidfd: cleanup the usage of __pidfd_prepare's flags - make pidfd_create() static. - Don't pass O_RDWR | O_CLOEXEC to __pidfd_prepare() in copy_process(), __pidfd_prepare() adds these flags unconditionally. - Kill the flags check in __pidfd_prepare(). sys_pidfd_open() checks the flags itself, all other users of pidfd_prepare() pass flags = 0. If we need a sanity check for those other in kernel users then WARN_ON_ONCE(flags & ~PIDFD_NONBLOCK) makes more sense. - Don't pass O_RDWR to get_unused_fd_flags(), it ignores everything except O_CLOEXEC. - Don't pass O_CLOEXEC to anon_inode_getfile(), it ignores everything except O_ACCMODE | O_NONBLOCK. Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240125161734.GA778@redhat.com Signed-off-by: Christian Brauner --- kernel/pid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/pid.c') diff --git a/kernel/pid.c b/kernel/pid.c index b52b10865454..c7a3e359f8f5 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -595,7 +595,7 @@ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags) * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. */ -int pidfd_create(struct pid *pid, unsigned int flags) +static int pidfd_create(struct pid *pid, unsigned int flags) { int pidfd; struct file *pidfd_file; -- cgit From 64bef697d33b75fc06c5789b3f8108680271529f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 31 Jan 2024 14:26:02 +0100 Subject: pidfd: implement PIDFD_THREAD flag for pidfd_open() With this flag: - pidfd_open() doesn't require that the target task must be a thread-group leader - pidfd_poll() succeeds when the task exits and becomes a zombie (iow, passes exit_notify()), even if it is a leader and thread-group is not empty. This means that the behaviour of pidfd_poll(PIDFD_THREAD, pid-of-group-leader) is not well defined if it races with exec() from its sub-thread; pidfd_poll() can succeed or not depending on whether pidfd_task_exited() is called before or after exchange_tids(). Perhaps we can improve this behaviour later, pidfd_poll() can probably take sig->group_exec_task into account. But this doesn't really differ from the case when the leader exits before other threads (so pidfd_poll() succeeds) and then another thread execs and pidfd_poll() will block again. thread_group_exited() is no longer used, perhaps it can die. Co-developed-by: Tycho Andersen Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240131132602.GA23641@redhat.com Tested-by: Tycho Andersen Reviewed-by: Tycho Andersen Signed-off-by: Christian Brauner --- kernel/pid.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'kernel/pid.c') diff --git a/kernel/pid.c b/kernel/pid.c index c7a3e359f8f5..e11144466828 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -552,11 +552,6 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) * Return the task associated with @pidfd. The function takes a reference on * the returned task. The caller is responsible for releasing that reference. * - * Currently, the process identified by @pidfd is always a thread-group leader. - * This restriction currently exists for all aspects of pidfds including pidfd - * creation (CLONE_PIDFD cannot be used with CLONE_THREAD) and pidfd polling - * (only supports thread group leaders). - * * Return: On success, the task_struct associated with the pidfd. * On error, a negative errno number will be returned. */ @@ -615,11 +610,8 @@ static int pidfd_create(struct pid *pid, unsigned int flags) * @flags: flags to pass * * This creates a new pid file descriptor with the O_CLOEXEC flag set for - * the process identified by @pid. Currently, the process identified by - * @pid must be a thread-group leader. This restriction currently exists - * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot - * be used with CLONE_THREAD) and pidfd polling (only supports thread group - * leaders). + * the task identified by @pid. Without PIDFD_THREAD flag the target task + * must be a thread-group leader. * * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. @@ -629,7 +621,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) int fd; struct pid *p; - if (flags & ~PIDFD_NONBLOCK) + if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD)) return -EINVAL; if (pid <= 0) -- cgit From 43f0df54c96fa5abcab9df8649c1e52119bf0238 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 2 Feb 2024 14:12:26 +0100 Subject: pidfd_poll: report POLLHUP when pid_task() == NULL Add another wake_up_all(wait_pidfd) into __change_pid() and change pidfd_poll() to include EPOLLHUP if task == NULL. This allows to wait until the target process/thread is reaped. TODO: change do_notify_pidfd() to use the keyed wakeups. Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240202131226.GA26018@redhat.com Signed-off-by: Christian Brauner --- kernel/pid.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel/pid.c') diff --git a/kernel/pid.c b/kernel/pid.c index e11144466828..62461c7c82b8 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -349,6 +349,11 @@ static void __change_pid(struct task_struct *task, enum pid_type type, hlist_del_rcu(&task->pid_links[type]); *pid_ptr = new; + if (type == PIDTYPE_PID) { + WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID)); + wake_up_all(&pid->wait_pidfd); + } + for (tmp = PIDTYPE_MAX; --tmp >= 0; ) if (pid_has_task(pid, tmp)) return; -- cgit From a1c6d5439fbddd06aad3ddbb7f12df0b98354070 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 2 Feb 2024 14:12:55 +0100 Subject: pid: kill the obsolete PIDTYPE_PID code in transfer_pid() transfer_pid() must be never called with pid == PIDTYPE_PID, new_leader->thread_pid should be changed by exchange_tids(). Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240202131255.GA26025@redhat.com Signed-off-by: Christian Brauner --- kernel/pid.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/pid.c') diff --git a/kernel/pid.c b/kernel/pid.c index 62461c7c82b8..de0bf2f8d18b 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -396,8 +396,7 @@ void exchange_tids(struct task_struct *left, struct task_struct *right) void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) { - if (type == PIDTYPE_PID) - new->thread_pid = old->thread_pid; + WARN_ON_ONCE(type == PIDTYPE_PID); hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]); } -- cgit From 0c9bd6bc4bb2ecfe8ce12e9a77ccd762dabe18b4 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Wed, 7 Feb 2024 10:19:29 +0100 Subject: pidfd: getfd should always report ESRCH if a task is exiting We can get EBADF from pidfd_getfd() if a task is currently exiting, which might be confusing. Let's check PF_EXITING, and just report ESRCH if so. I chose PF_EXITING, because it is set in exit_signals(), which is called before exit_files(). Since ->exit_status is mostly set after exit_files() in exit_notify(), using that still leaves a window open for the race. Reviewed-by: Oleg Nesterov Signed-off-by: Tycho Andersen Link: https://lore.kernel.org/r/20240206192357.81942-1-tycho@tycho.pizza Signed-off-by: Christian Brauner --- kernel/pid.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel/pid.c') diff --git a/kernel/pid.c b/kernel/pid.c index de0bf2f8d18b..c1d940fbd314 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -678,7 +678,26 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd) up_read(&task->signal->exec_update_lock); - return file ?: ERR_PTR(-EBADF); + if (!file) { + /* + * It is possible that the target thread is exiting; it can be + * either: + * 1. before exit_signals(), which gives a real fd + * 2. before exit_files() takes the task_lock() gives a real fd + * 3. after exit_files() releases task_lock(), ->files is NULL; + * this has PF_EXITING, since it was set in exit_signals(), + * __pidfd_fget() returns EBADF. + * In case 3 we get EBADF, but that really means ESRCH, since + * the task is currently exiting and has freed its files + * struct, so we fix it up. + */ + if (task->flags & PF_EXITING) + file = ERR_PTR(-ESRCH); + else + file = ERR_PTR(-EBADF); + } + + return file; } static int pidfd_getfd(struct pid *pid, int fd) -- cgit From cb12fd8e0dabb9a1c8aef55a6a41e2c255fcdf4b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 12 Feb 2024 16:32:38 +0100 Subject: pidfd: add pidfs This moves pidfds from the anonymous inode infrastructure to a tiny pseudo filesystem. This has been on my todo for quite a while as it will unblock further work that we weren't able to do simply because of the very justified limitations of anonymous inodes. Moving pidfds to a tiny pseudo filesystem allows: * statx() on pidfds becomes useful for the first time. * pidfds can be compared simply via statx() and then comparing inode numbers. * pidfds have unique inode numbers for the system lifetime. * struct pid is now stashed in inode->i_private instead of file->private_data. This means it is now possible to introduce concepts that operate on a process once all file descriptors have been closed. A concrete example is kill-on-last-close. * file->private_data is freed up for per-file options for pidfds. * Each struct pid will refer to a different inode but the same struct pid will refer to the same inode if it's opened multiple times. In contrast to now where each struct pid refers to the same inode. Even if we were to move to anon_inode_create_getfile() which creates new inodes we'd still be associating the same struct pid with multiple different inodes. The tiny pseudo filesystem is not visible anywhere in userspace exactly like e.g., pipefs and sockfs. There's no lookup, there's no complex inode operations, nothing. Dentries and inodes are always deleted when the last pidfd is closed. We allocate a new inode for each struct pid and we reuse that inode for all pidfds. We use iget_locked() to find that inode again based on the inode number which isn't recycled. We allocate a new dentry for each pidfd that uses the same inode. That is similar to anonymous inodes which reuse the same inode for thousands of dentries. For pidfds we're talking way less than that. There usually won't be a lot of concurrent openers of the same struct pid. They can probably often be counted on two hands. I know that systemd does use separate pidfd for the same struct pid for various complex process tracking issues. So I think with that things actually become way simpler. Especially because we don't have to care about lookup. Dentries and inodes continue to be always deleted. The code is entirely optional and fairly small. If it's not selected we fallback to anonymous inodes. Heavily inspired by nsfs which uses a similar stashing mechanism just for namespaces. Link: https://lore.kernel.org/r/20240213-vfs-pidfd_fs-v1-2-f863f58cfce1@kernel.org Signed-off-by: Christian Brauner --- kernel/pid.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel/pid.c') diff --git a/kernel/pid.c b/kernel/pid.c index c1d940fbd314..581cc34341fd 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -65,6 +66,13 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; +#ifdef CONFIG_FS_PID +/* + * Pseudo filesystems start inode numbering after one. We use Reserved + * PIDs as a natural offset. + */ +static u64 pidfs_ino = RESERVED_PIDS; +#endif /* * PID-map pages start out as NULL, they get allocated upon @@ -272,6 +280,9 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; +#ifdef CONFIG_FS_PID + pid->ino = ++pidfs_ino; +#endif for ( ; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. */ idr_replace(&upid->ns->idr, pid, upid->nr); -- cgit From b28ddcc32d8fa3e20745b3a47dff863fe0376d79 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 19 Feb 2024 16:30:57 +0100 Subject: pidfs: convert to path_from_stashed() helper Moving pidfds from the anonymous inode infrastructure to a separate tiny in-kernel filesystem similar to sockfs, pipefs, and anon_inodefs causes selinux denials and thus various userspace components that make heavy use of pidfds to fail as pidfds used anon_inode_getfile() which aren't subject to any LSM hooks. But dentry_open() is and that would cause regressions. The failures that are seen are selinux denials. But the core failure is dbus-broker. That cascades into other services failing that depend on dbus-broker. For example, when dbus-broker fails to start polkit and all the others won't be able to work because they depend on dbus-broker. The reason for dbus-broker failing is because it doesn't handle failures for SO_PEERPIDFD correctly. Last kernel release we introduced SO_PEERPIDFD (and SCM_PIDFD). SO_PEERPIDFD allows dbus-broker and polkit and others to receive a pidfd for the peer of an AF_UNIX socket. This is the first time in the history of Linux that we can safely authenticate clients in a race-free manner. dbus-broker immediately made use of this but messed up the error checking. It only allowed EINVAL as a valid failure for SO_PEERPIDFD. That's obviously problematic not just because of LSM denials but because of seccomp denials that would prevent SO_PEERPIDFD from working; or any other new error code from there. So this is catching a flawed implementation in dbus-broker as well. It has to fallback to the old pid-based authentication when SO_PEERPIDFD doesn't work no matter the reasons otherwise it'll always risk such failures. So overall that LSM denial should not have caused dbus-broker to fail. It can never assume that a feature released one kernel ago like SO_PEERPIDFD can be assumed to be available. So, the next fix separate from the selinux policy update is to try and fix dbus-broker at [3]. That should make it into Fedora as well. In addition the selinux reference policy should also be updated. See [4] for that. If Selinux is in enforcing mode in userspace and it encounters anything that it doesn't know about it will deny it by default. And the policy is entirely in userspace including declaring new types for stuff like nsfs or pidfs to allow it. For now we continue to raise S_PRIVATE on the inode if it's a pidfs inode which means things behave exactly like before. Link: https://bugzilla.redhat.com/show_bug.cgi?id=2265630 Link: https://github.com/fedora-selinux/selinux-policy/pull/2050 Link: https://github.com/bus1/dbus-broker/pull/343 [3] Link: https://github.com/SELinuxProject/refpolicy/pull/762 [4] Reported-by: Nathan Chancellor Link: https://lore.kernel.org/r/20240222190334.GA412503@dev-arch.thelio-3990X Link: https://lore.kernel.org/r/20240218-neufahrzeuge-brauhaus-fb0eb6459771@brauner Signed-off-by: Christian Brauner --- kernel/pid.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/pid.c') diff --git a/kernel/pid.c b/kernel/pid.c index 581cc34341fd..99a0c5eb24b8 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -281,6 +281,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; #ifdef CONFIG_FS_PID + pid->stashed = NULL; pid->ino = ++pidfs_ino; #endif for ( ; upid >= pid->numbers; --upid) { -- cgit