diff options
-rw-r--r-- | fs/pidfs.c | 212 | ||||
-rw-r--r-- | include/linux/pid.h | 3 | ||||
-rw-r--r-- | include/linux/pidfs.h | 1 | ||||
-rw-r--r-- | kernel/pid.c | 2 |
4 files changed, 151 insertions, 67 deletions
diff --git a/fs/pidfs.c b/fs/pidfs.c index ff2560b34ed1..6a907457b1fe 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -25,7 +25,10 @@ #include "internal.h" #include "mount.h" +#define PIDFS_PID_DEAD ERR_PTR(-ESRCH) + static struct kmem_cache *pidfs_cachep __ro_after_init; +static struct kmem_cache *pidfs_attr_cachep __ro_after_init; /* * Stashes information that userspace needs to access even after the @@ -37,6 +40,11 @@ struct pidfs_exit_info { __u32 coredump_mask; }; +struct pidfs_attr { + struct pidfs_exit_info __pei; + struct pidfs_exit_info *exit_info; +}; + struct pidfs_inode { struct pidfs_exit_info __pei; struct pidfs_exit_info *exit_info; @@ -125,6 +133,7 @@ void pidfs_add_pid(struct pid *pid) pid->ino = pidfs_ino_nr; pid->stashed = NULL; + pid->attr = NULL; pidfs_ino_nr++; write_seqcount_begin(&pidmap_lock_seq); @@ -139,6 +148,18 @@ void pidfs_remove_pid(struct pid *pid) write_seqcount_end(&pidmap_lock_seq); } +void pidfs_free_pid(struct pid *pid) +{ + /* + * Any dentry must've been wiped from the pid by now. + * Otherwise there's a reference count bug. + */ + VFS_WARN_ON_ONCE(pid->stashed); + + if (!IS_ERR(pid->attr)) + kfree(pid->attr); +} + #ifdef CONFIG_PROC_FS /** * pidfd_show_fdinfo - print information about a pidfd @@ -261,13 +282,13 @@ static __u32 pidfs_coredump_mask(unsigned long mm_flags) static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) { struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg; - struct inode *inode = file_inode(file); struct pid *pid = pidfd_pid(file); size_t usize = _IOC_SIZE(cmd); struct pidfd_info kinfo = {}; struct pidfs_exit_info *exit_info; struct user_namespace *user_ns; struct task_struct *task; + struct pidfs_attr *attr; const struct cred *c; __u64 mask; @@ -286,8 +307,9 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) if (!pid_in_current_pidns(pid)) return -ESRCH; + attr = READ_ONCE(pid->attr); if (mask & PIDFD_INFO_EXIT) { - exit_info = READ_ONCE(pidfs_i(inode)->exit_info); + exit_info = READ_ONCE(attr->exit_info); if (exit_info) { kinfo.mask |= PIDFD_INFO_EXIT; #ifdef CONFIG_CGROUPS @@ -300,7 +322,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) if (mask & PIDFD_INFO_COREDUMP) { kinfo.mask |= PIDFD_INFO_COREDUMP; - kinfo.coredump_mask = READ_ONCE(pidfs_i(inode)->__pei.coredump_mask); + kinfo.coredump_mask = READ_ONCE(attr->__pei.coredump_mask); } task = get_pid_task(pid, PIDTYPE_PID); @@ -552,41 +574,61 @@ struct pid *pidfd_pid(const struct file *file) * task has been reaped which cannot happen until we're out of * release_task(). * - * If this struct pid is referred to by a pidfd then - * stashed_dentry_get() will return the dentry and inode for that struct - * pid. Since we've taken a reference on it there's now an additional - * reference from the exit path on it. Which is fine. We're going to put - * it again in a second and we know that the pid is kept alive anyway. + * If this struct pid has at least once been referred to by a pidfd then + * pid->attr will be allocated. If not we mark the struct pid as dead so + * anyone who is trying to register it with pidfs will fail to do so. + * Otherwise we would hand out pidfs for reaped tasks without having + * exit information available. * - * Worst case is that we've filled in the info and immediately free the - * dentry and inode afterwards since the pidfd has been closed. Since + * Worst case is that we've filled in the info and the pid gets freed + * right away in free_pid() when no one holds a pidfd anymore. Since * pidfs_exit() currently is placed after exit_task_work() we know that - * it cannot be us aka the exiting task holding a pidfd to ourselves. + * it cannot be us aka the exiting task holding a pidfd to itself. */ void pidfs_exit(struct task_struct *tsk) { - struct dentry *dentry; + struct pid *pid = task_pid(tsk); + struct pidfs_attr *attr; + struct pidfs_exit_info *exit_info; +#ifdef CONFIG_CGROUPS + struct cgroup *cgrp; +#endif might_sleep(); - dentry = stashed_dentry_get(&task_pid(tsk)->stashed); - if (dentry) { - struct inode *inode = d_inode(dentry); - struct pidfs_exit_info *exit_info = &pidfs_i(inode)->__pei; -#ifdef CONFIG_CGROUPS - struct cgroup *cgrp; + guard(spinlock_irq)(&pid->wait_pidfd.lock); + attr = pid->attr; + if (!attr) { + /* + * No one ever held a pidfd for this struct pid. + * Mark it as dead so no one can add a pidfs + * entry anymore. We're about to be reaped and + * so no exit information would be available. + */ + pid->attr = PIDFS_PID_DEAD; + return; + } - rcu_read_lock(); - cgrp = task_dfl_cgroup(tsk); - exit_info->cgroupid = cgroup_id(cgrp); - rcu_read_unlock(); + /* + * If @pid->attr is set someone might still legitimately hold a + * pidfd to @pid or someone might concurrently still be getting + * a reference to an already stashed dentry from @pid->stashed. + * So defer cleaning @pid->attr until the last reference to @pid + * is put + */ + + exit_info = &attr->__pei; + +#ifdef CONFIG_CGROUPS + rcu_read_lock(); + cgrp = task_dfl_cgroup(tsk); + exit_info->cgroupid = cgroup_id(cgrp); + rcu_read_unlock(); #endif - exit_info->exit_code = tsk->exit_code; + exit_info->exit_code = tsk->exit_code; - /* Ensure that PIDFD_GET_INFO sees either all or nothing. */ - smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei); - dput(dentry); - } + /* Ensure that PIDFD_GET_INFO sees either all or nothing. */ + smp_store_release(&attr->exit_info, &attr->__pei); } #ifdef CONFIG_COREDUMP @@ -594,16 +636,15 @@ void pidfs_coredump(const struct coredump_params *cprm) { struct pid *pid = cprm->pid; struct pidfs_exit_info *exit_info; - struct dentry *dentry; - struct inode *inode; + struct pidfs_attr *attr; __u32 coredump_mask = 0; - dentry = pid->stashed; - if (WARN_ON_ONCE(!dentry)) - return; + attr = READ_ONCE(pid->attr); - inode = d_inode(dentry); - exit_info = &pidfs_i(inode)->__pei; + VFS_WARN_ON_ONCE(!attr); + VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD); + + exit_info = &attr->__pei; /* Note how we were coredumped. */ coredump_mask = pidfs_coredump_mask(cprm->mm_flags); /* Note that we actually did coredump. */ @@ -663,7 +704,7 @@ static struct inode *pidfs_alloc_inode(struct super_block *sb) static void pidfs_free_inode(struct inode *inode) { - kmem_cache_free(pidfs_cachep, pidfs_i(inode)); + kfree(pidfs_i(inode)); } static const struct super_operations pidfs_sops = { @@ -831,8 +872,13 @@ static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path, * recorded and published can be handled correctly. */ if (unlikely(!pid_has_task(pid, type))) { - struct inode *inode = d_inode(path->dentry); - return !!READ_ONCE(pidfs_i(inode)->exit_info); + struct pidfs_attr *attr; + + attr = READ_ONCE(pid->attr); + if (!attr) + return false; + if (!READ_ONCE(attr->exit_info)) + return false; } return true; @@ -878,9 +924,67 @@ static void pidfs_put_data(void *data) put_pid(pid); } +/** + * pidfs_register_pid - register a struct pid in pidfs + * @pid: pid to pin + * + * Register a struct pid in pidfs. Needs to be paired with + * pidfs_put_pid() to not risk leaking the pidfs dentry and inode. + * + * Return: On success zero, on error a negative error code is returned. + */ +int pidfs_register_pid(struct pid *pid) +{ + struct pidfs_attr *new_attr __free(kfree) = NULL; + struct pidfs_attr *attr; + + might_sleep(); + + if (!pid) + return 0; + + attr = READ_ONCE(pid->attr); + if (unlikely(attr == PIDFS_PID_DEAD)) + return PTR_ERR(PIDFS_PID_DEAD); + if (attr) + return 0; + + new_attr = kmem_cache_zalloc(pidfs_attr_cachep, GFP_KERNEL); + if (!new_attr) + return -ENOMEM; + + /* Synchronize with pidfs_exit(). */ + guard(spinlock_irq)(&pid->wait_pidfd.lock); + + attr = pid->attr; + if (unlikely(attr == PIDFS_PID_DEAD)) + return PTR_ERR(PIDFS_PID_DEAD); + if (unlikely(attr)) + return 0; + + pid->attr = no_free_ptr(new_attr); + return 0; +} + +static struct dentry *pidfs_stash_dentry(struct dentry **stashed, + struct dentry *dentry) +{ + int ret; + struct pid *pid = d_inode(dentry)->i_private; + + VFS_WARN_ON_ONCE(stashed != &pid->stashed); + + ret = pidfs_register_pid(pid); + if (ret) + return ERR_PTR(ret); + + return stash_dentry(stashed, dentry); +} + static const struct stashed_operations pidfs_stashed_ops = { - .init_inode = pidfs_init_inode, - .put_data = pidfs_put_data, + .stash_dentry = pidfs_stash_dentry, + .init_inode = pidfs_init_inode, + .put_data = pidfs_put_data, }; static int pidfs_init_fs_context(struct fs_context *fc) @@ -937,33 +1041,6 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) } /** - * pidfs_register_pid - register a struct pid in pidfs - * @pid: pid to pin - * - * Register a struct pid in pidfs. Needs to be paired with - * pidfs_put_pid() to not risk leaking the pidfs dentry and inode. - * - * Return: On success zero, on error a negative error code is returned. - */ -int pidfs_register_pid(struct pid *pid) -{ - struct path path __free(path_put) = {}; - int ret; - - might_sleep(); - - if (!pid) - return 0; - - ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); - if (unlikely(ret)) - return ret; - /* Keep the dentry and only put the reference to the mount. */ - path.dentry = NULL; - return 0; -} - -/** * pidfs_get_pid - pin a struct pid through pidfs * @pid: pid to pin * @@ -1008,6 +1085,9 @@ void __init pidfs_init(void) (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT | SLAB_PANIC), pidfs_inode_init_once); + pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0, + (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | + SLAB_ACCOUNT | SLAB_PANIC), NULL); pidfs_mnt = kern_mount(&pidfs_type); if (IS_ERR(pidfs_mnt)) panic("Failed to mount pidfs pseudo filesystem"); diff --git a/include/linux/pid.h b/include/linux/pid.h index 00646a692dd4..003a1027d219 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -47,6 +47,8 @@ #define RESERVED_PIDS 300 +struct pidfs_attr; + struct upid { int nr; struct pid_namespace *ns; @@ -60,6 +62,7 @@ struct pid { u64 ino; struct rb_node pidfs_node; struct dentry *stashed; + struct pidfs_attr *attr; }; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 77e7db194914..8f6ed59bb3fb 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -16,5 +16,6 @@ extern const struct dentry_operations pidfs_dentry_operations; int pidfs_register_pid(struct pid *pid); void pidfs_get_pid(struct pid *pid); void pidfs_put_pid(struct pid *pid); +void pidfs_free_pid(struct pid *pid); #endif /* _LINUX_PID_FS_H */ diff --git a/kernel/pid.c b/kernel/pid.c index 8317bcbc7cf7..07db7d8d066c 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -100,7 +100,7 @@ void put_pid(struct pid *pid) ns = pid->numbers[pid->level].ns; if (refcount_dec_and_test(&pid->count)) { - WARN_ON_ONCE(pid->stashed); + pidfs_free_pid(pid); kmem_cache_free(ns->pid_cachep, pid); put_pid_ns(ns); } |