summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--fs/pidfs.c212
-rw-r--r--include/linux/pid.h3
-rw-r--r--include/linux/pidfs.h1
-rw-r--r--kernel/pid.c2
4 files changed, 151 insertions, 67 deletions
diff --git a/fs/pidfs.c b/fs/pidfs.c
index ff2560b34ed1..6a907457b1fe 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -25,7 +25,10 @@
#include "internal.h"
#include "mount.h"
+#define PIDFS_PID_DEAD ERR_PTR(-ESRCH)
+
static struct kmem_cache *pidfs_cachep __ro_after_init;
+static struct kmem_cache *pidfs_attr_cachep __ro_after_init;
/*
* Stashes information that userspace needs to access even after the
@@ -37,6 +40,11 @@ struct pidfs_exit_info {
__u32 coredump_mask;
};
+struct pidfs_attr {
+ struct pidfs_exit_info __pei;
+ struct pidfs_exit_info *exit_info;
+};
+
struct pidfs_inode {
struct pidfs_exit_info __pei;
struct pidfs_exit_info *exit_info;
@@ -125,6 +133,7 @@ void pidfs_add_pid(struct pid *pid)
pid->ino = pidfs_ino_nr;
pid->stashed = NULL;
+ pid->attr = NULL;
pidfs_ino_nr++;
write_seqcount_begin(&pidmap_lock_seq);
@@ -139,6 +148,18 @@ void pidfs_remove_pid(struct pid *pid)
write_seqcount_end(&pidmap_lock_seq);
}
+void pidfs_free_pid(struct pid *pid)
+{
+ /*
+ * Any dentry must've been wiped from the pid by now.
+ * Otherwise there's a reference count bug.
+ */
+ VFS_WARN_ON_ONCE(pid->stashed);
+
+ if (!IS_ERR(pid->attr))
+ kfree(pid->attr);
+}
+
#ifdef CONFIG_PROC_FS
/**
* pidfd_show_fdinfo - print information about a pidfd
@@ -261,13 +282,13 @@ static __u32 pidfs_coredump_mask(unsigned long mm_flags)
static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
{
struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
- struct inode *inode = file_inode(file);
struct pid *pid = pidfd_pid(file);
size_t usize = _IOC_SIZE(cmd);
struct pidfd_info kinfo = {};
struct pidfs_exit_info *exit_info;
struct user_namespace *user_ns;
struct task_struct *task;
+ struct pidfs_attr *attr;
const struct cred *c;
__u64 mask;
@@ -286,8 +307,9 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
if (!pid_in_current_pidns(pid))
return -ESRCH;
+ attr = READ_ONCE(pid->attr);
if (mask & PIDFD_INFO_EXIT) {
- exit_info = READ_ONCE(pidfs_i(inode)->exit_info);
+ exit_info = READ_ONCE(attr->exit_info);
if (exit_info) {
kinfo.mask |= PIDFD_INFO_EXIT;
#ifdef CONFIG_CGROUPS
@@ -300,7 +322,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
if (mask & PIDFD_INFO_COREDUMP) {
kinfo.mask |= PIDFD_INFO_COREDUMP;
- kinfo.coredump_mask = READ_ONCE(pidfs_i(inode)->__pei.coredump_mask);
+ kinfo.coredump_mask = READ_ONCE(attr->__pei.coredump_mask);
}
task = get_pid_task(pid, PIDTYPE_PID);
@@ -552,41 +574,61 @@ struct pid *pidfd_pid(const struct file *file)
* task has been reaped which cannot happen until we're out of
* release_task().
*
- * If this struct pid is referred to by a pidfd then
- * stashed_dentry_get() will return the dentry and inode for that struct
- * pid. Since we've taken a reference on it there's now an additional
- * reference from the exit path on it. Which is fine. We're going to put
- * it again in a second and we know that the pid is kept alive anyway.
+ * If this struct pid has at least once been referred to by a pidfd then
+ * pid->attr will be allocated. If not we mark the struct pid as dead so
+ * anyone who is trying to register it with pidfs will fail to do so.
+ * Otherwise we would hand out pidfs for reaped tasks without having
+ * exit information available.
*
- * Worst case is that we've filled in the info and immediately free the
- * dentry and inode afterwards since the pidfd has been closed. Since
+ * Worst case is that we've filled in the info and the pid gets freed
+ * right away in free_pid() when no one holds a pidfd anymore. Since
* pidfs_exit() currently is placed after exit_task_work() we know that
- * it cannot be us aka the exiting task holding a pidfd to ourselves.
+ * it cannot be us aka the exiting task holding a pidfd to itself.
*/
void pidfs_exit(struct task_struct *tsk)
{
- struct dentry *dentry;
+ struct pid *pid = task_pid(tsk);
+ struct pidfs_attr *attr;
+ struct pidfs_exit_info *exit_info;
+#ifdef CONFIG_CGROUPS
+ struct cgroup *cgrp;
+#endif
might_sleep();
- dentry = stashed_dentry_get(&task_pid(tsk)->stashed);
- if (dentry) {
- struct inode *inode = d_inode(dentry);
- struct pidfs_exit_info *exit_info = &pidfs_i(inode)->__pei;
-#ifdef CONFIG_CGROUPS
- struct cgroup *cgrp;
+ guard(spinlock_irq)(&pid->wait_pidfd.lock);
+ attr = pid->attr;
+ if (!attr) {
+ /*
+ * No one ever held a pidfd for this struct pid.
+ * Mark it as dead so no one can add a pidfs
+ * entry anymore. We're about to be reaped and
+ * so no exit information would be available.
+ */
+ pid->attr = PIDFS_PID_DEAD;
+ return;
+ }
- rcu_read_lock();
- cgrp = task_dfl_cgroup(tsk);
- exit_info->cgroupid = cgroup_id(cgrp);
- rcu_read_unlock();
+ /*
+ * If @pid->attr is set someone might still legitimately hold a
+ * pidfd to @pid or someone might concurrently still be getting
+ * a reference to an already stashed dentry from @pid->stashed.
+ * So defer cleaning @pid->attr until the last reference to @pid
+ * is put
+ */
+
+ exit_info = &attr->__pei;
+
+#ifdef CONFIG_CGROUPS
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(tsk);
+ exit_info->cgroupid = cgroup_id(cgrp);
+ rcu_read_unlock();
#endif
- exit_info->exit_code = tsk->exit_code;
+ exit_info->exit_code = tsk->exit_code;
- /* Ensure that PIDFD_GET_INFO sees either all or nothing. */
- smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei);
- dput(dentry);
- }
+ /* Ensure that PIDFD_GET_INFO sees either all or nothing. */
+ smp_store_release(&attr->exit_info, &attr->__pei);
}
#ifdef CONFIG_COREDUMP
@@ -594,16 +636,15 @@ void pidfs_coredump(const struct coredump_params *cprm)
{
struct pid *pid = cprm->pid;
struct pidfs_exit_info *exit_info;
- struct dentry *dentry;
- struct inode *inode;
+ struct pidfs_attr *attr;
__u32 coredump_mask = 0;
- dentry = pid->stashed;
- if (WARN_ON_ONCE(!dentry))
- return;
+ attr = READ_ONCE(pid->attr);
- inode = d_inode(dentry);
- exit_info = &pidfs_i(inode)->__pei;
+ VFS_WARN_ON_ONCE(!attr);
+ VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD);
+
+ exit_info = &attr->__pei;
/* Note how we were coredumped. */
coredump_mask = pidfs_coredump_mask(cprm->mm_flags);
/* Note that we actually did coredump. */
@@ -663,7 +704,7 @@ static struct inode *pidfs_alloc_inode(struct super_block *sb)
static void pidfs_free_inode(struct inode *inode)
{
- kmem_cache_free(pidfs_cachep, pidfs_i(inode));
+ kfree(pidfs_i(inode));
}
static const struct super_operations pidfs_sops = {
@@ -831,8 +872,13 @@ static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path,
* recorded and published can be handled correctly.
*/
if (unlikely(!pid_has_task(pid, type))) {
- struct inode *inode = d_inode(path->dentry);
- return !!READ_ONCE(pidfs_i(inode)->exit_info);
+ struct pidfs_attr *attr;
+
+ attr = READ_ONCE(pid->attr);
+ if (!attr)
+ return false;
+ if (!READ_ONCE(attr->exit_info))
+ return false;
}
return true;
@@ -878,9 +924,67 @@ static void pidfs_put_data(void *data)
put_pid(pid);
}
+/**
+ * pidfs_register_pid - register a struct pid in pidfs
+ * @pid: pid to pin
+ *
+ * Register a struct pid in pidfs. Needs to be paired with
+ * pidfs_put_pid() to not risk leaking the pidfs dentry and inode.
+ *
+ * Return: On success zero, on error a negative error code is returned.
+ */
+int pidfs_register_pid(struct pid *pid)
+{
+ struct pidfs_attr *new_attr __free(kfree) = NULL;
+ struct pidfs_attr *attr;
+
+ might_sleep();
+
+ if (!pid)
+ return 0;
+
+ attr = READ_ONCE(pid->attr);
+ if (unlikely(attr == PIDFS_PID_DEAD))
+ return PTR_ERR(PIDFS_PID_DEAD);
+ if (attr)
+ return 0;
+
+ new_attr = kmem_cache_zalloc(pidfs_attr_cachep, GFP_KERNEL);
+ if (!new_attr)
+ return -ENOMEM;
+
+ /* Synchronize with pidfs_exit(). */
+ guard(spinlock_irq)(&pid->wait_pidfd.lock);
+
+ attr = pid->attr;
+ if (unlikely(attr == PIDFS_PID_DEAD))
+ return PTR_ERR(PIDFS_PID_DEAD);
+ if (unlikely(attr))
+ return 0;
+
+ pid->attr = no_free_ptr(new_attr);
+ return 0;
+}
+
+static struct dentry *pidfs_stash_dentry(struct dentry **stashed,
+ struct dentry *dentry)
+{
+ int ret;
+ struct pid *pid = d_inode(dentry)->i_private;
+
+ VFS_WARN_ON_ONCE(stashed != &pid->stashed);
+
+ ret = pidfs_register_pid(pid);
+ if (ret)
+ return ERR_PTR(ret);
+
+ return stash_dentry(stashed, dentry);
+}
+
static const struct stashed_operations pidfs_stashed_ops = {
- .init_inode = pidfs_init_inode,
- .put_data = pidfs_put_data,
+ .stash_dentry = pidfs_stash_dentry,
+ .init_inode = pidfs_init_inode,
+ .put_data = pidfs_put_data,
};
static int pidfs_init_fs_context(struct fs_context *fc)
@@ -937,33 +1041,6 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
}
/**
- * pidfs_register_pid - register a struct pid in pidfs
- * @pid: pid to pin
- *
- * Register a struct pid in pidfs. Needs to be paired with
- * pidfs_put_pid() to not risk leaking the pidfs dentry and inode.
- *
- * Return: On success zero, on error a negative error code is returned.
- */
-int pidfs_register_pid(struct pid *pid)
-{
- struct path path __free(path_put) = {};
- int ret;
-
- might_sleep();
-
- if (!pid)
- return 0;
-
- ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
- if (unlikely(ret))
- return ret;
- /* Keep the dentry and only put the reference to the mount. */
- path.dentry = NULL;
- return 0;
-}
-
-/**
* pidfs_get_pid - pin a struct pid through pidfs
* @pid: pid to pin
*
@@ -1008,6 +1085,9 @@ void __init pidfs_init(void)
(SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
SLAB_ACCOUNT | SLAB_PANIC),
pidfs_inode_init_once);
+ pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0,
+ (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
+ SLAB_ACCOUNT | SLAB_PANIC), NULL);
pidfs_mnt = kern_mount(&pidfs_type);
if (IS_ERR(pidfs_mnt))
panic("Failed to mount pidfs pseudo filesystem");
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 00646a692dd4..003a1027d219 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -47,6 +47,8 @@
#define RESERVED_PIDS 300
+struct pidfs_attr;
+
struct upid {
int nr;
struct pid_namespace *ns;
@@ -60,6 +62,7 @@ struct pid {
u64 ino;
struct rb_node pidfs_node;
struct dentry *stashed;
+ struct pidfs_attr *attr;
};
/* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX];
diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
index 77e7db194914..8f6ed59bb3fb 100644
--- a/include/linux/pidfs.h
+++ b/include/linux/pidfs.h
@@ -16,5 +16,6 @@ extern const struct dentry_operations pidfs_dentry_operations;
int pidfs_register_pid(struct pid *pid);
void pidfs_get_pid(struct pid *pid);
void pidfs_put_pid(struct pid *pid);
+void pidfs_free_pid(struct pid *pid);
#endif /* _LINUX_PID_FS_H */
diff --git a/kernel/pid.c b/kernel/pid.c
index 8317bcbc7cf7..07db7d8d066c 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -100,7 +100,7 @@ void put_pid(struct pid *pid)
ns = pid->numbers[pid->level].ns;
if (refcount_dec_and_test(&pid->count)) {
- WARN_ON_ONCE(pid->stashed);
+ pidfs_free_pid(pid);
kmem_cache_free(ns->pid_cachep, pid);
put_pid_ns(ns);
}