diff options
-rw-r--r-- | fs/pidfs.c | 127 | ||||
-rw-r--r-- | include/linux/pidfs.h | 2 | ||||
-rw-r--r-- | kernel/pid.c | 14 |
3 files changed, 95 insertions, 48 deletions
diff --git a/fs/pidfs.c b/fs/pidfs.c index 618abb1fa1b8..8d62d900d20d 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -23,6 +23,79 @@ #include "internal.h" #include "mount.h" +static DEFINE_IDR(pidfs_ino_idr); + +static u32 pidfs_ino_upper_32_bits = 0; + +#if BITS_PER_LONG == 32 +/* + * On 32 bit systems the lower 32 bits are the inode number and + * the higher 32 bits are the generation number. The starting + * value for the inode number and the generation number is one. + */ +static u32 pidfs_ino_lower_32_bits = 1; + +static inline unsigned long pidfs_ino(u64 ino) +{ + return lower_32_bits(ino); +} + +/* On 32 bit the generation number are the upper 32 bits. */ +static inline u32 pidfs_gen(u64 ino) +{ + return upper_32_bits(ino); +} + +#else + +static u32 pidfs_ino_lower_32_bits = 0; + +/* On 64 bit simply return ino. */ +static inline unsigned long pidfs_ino(u64 ino) +{ + return ino; +} + +/* On 64 bit the generation number is 1. */ +static inline u32 pidfs_gen(u64 ino) +{ + return 1; +} +#endif + +/* + * Construct an inode number for struct pid in a way that we can use the + * lower 32bit to lookup struct pid independent of any pid numbers that + * could be leaked into userspace (e.g., via file handle encoding). + */ +int pidfs_add_pid(struct pid *pid) +{ + u32 upper; + int lower; + + /* + * Inode numbering for pidfs start at 2. This avoids collisions + * with the root inode which is 1 for pseudo filesystems. + */ + lower = idr_alloc_cyclic(&pidfs_ino_idr, pid, 2, 0, GFP_ATOMIC); + if (lower >= 0 && lower < pidfs_ino_lower_32_bits) + pidfs_ino_upper_32_bits++; + upper = pidfs_ino_upper_32_bits; + pidfs_ino_lower_32_bits = lower; + if (lower < 0) + return lower; + + pid->ino = ((u64)upper << 32) | lower; + pid->stashed = NULL; + return 0; +} + +/* The idr number to remove is the lower 32 bits of the inode. */ +void pidfs_remove_pid(struct pid *pid) +{ + idr_remove(&pidfs_ino_idr, lower_32_bits(pid->ino)); +} + #ifdef CONFIG_PROC_FS /** * pidfd_show_fdinfo - print information about a pidfd @@ -198,6 +271,14 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct ns_common *ns_common = NULL; struct pid_namespace *pid_ns; + if (cmd == FS_IOC_GETVERSION) { + if (!arg) + return -EINVAL; + + __u32 __user *argp = (__u32 __user *)arg; + return put_user(file_inode(file)->i_generation, argp); + } + task = get_pid_task(pid, PIDTYPE_PID); if (!task) return -ESRCH; @@ -318,40 +399,6 @@ struct pid *pidfd_pid(const struct file *file) static struct vfsmount *pidfs_mnt __ro_after_init; -#if BITS_PER_LONG == 32 -/* - * Provide a fallback mechanism for 32-bit systems so processes remain - * reliably comparable by inode number even on those systems. - */ -static DEFINE_IDA(pidfd_inum_ida); - -static int pidfs_inum(struct pid *pid, unsigned long *ino) -{ - int ret; - - ret = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1, - UINT_MAX, GFP_ATOMIC); - if (ret < 0) - return -ENOSPC; - - *ino = ret; - return 0; -} - -static inline void pidfs_free_inum(unsigned long ino) -{ - if (ino > 0) - ida_free(&pidfd_inum_ida, ino); -} -#else -static inline int pidfs_inum(struct pid *pid, unsigned long *ino) -{ - *ino = pid->ino; - return 0; -} -#define pidfs_free_inum(ino) ((void)(ino)) -#endif - /* * The vfs falls back to simple_setattr() if i_op->setattr() isn't * implemented. Let's reject it completely until we have a clean @@ -403,7 +450,6 @@ static void pidfs_evict_inode(struct inode *inode) clear_inode(inode); put_pid(pid); - pidfs_free_inum(inode->i_ino); } static const struct super_operations pidfs_sops = { @@ -429,17 +475,16 @@ static const struct dentry_operations pidfs_dentry_operations = { static int pidfs_init_inode(struct inode *inode, void *data) { + const struct pid *pid = data; + inode->i_private = data; inode->i_flags |= S_PRIVATE; inode->i_mode |= S_IRWXU; inode->i_op = &pidfs_inode_operations; inode->i_fop = &pidfs_file_operations; - /* - * Inode numbering for pidfs start at RESERVED_PIDS + 1. This - * avoids collisions with the root inode which is 1 for pseudo - * filesystems. - */ - return pidfs_inum(data, &inode->i_ino); + inode->i_ino = pidfs_ino(pid->ino); + inode->i_generation = pidfs_gen(pid->ino); + return 0; } static void pidfs_put_data(void *data) diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 75bdf9807802..2958652bb108 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -4,5 +4,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); void __init pidfs_init(void); +int pidfs_add_pid(struct pid *pid); +void pidfs_remove_pid(struct pid *pid); #endif /* _LINUX_PID_FS_H */ diff --git a/kernel/pid.c b/kernel/pid.c index 115448e89c3e..58567d6904b2 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -64,11 +64,6 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; -/* - * Pseudo filesystems start inode numbering after one. We use Reserved - * PIDs as a natural offset. - */ -static u64 pidfs_ino = RESERVED_PIDS; /* * PID-map pages start out as NULL, they get allocated upon @@ -158,6 +153,7 @@ void free_pid(struct pid *pid) idr_remove(&ns->idr, upid->nr); } + pidfs_remove_pid(pid); spin_unlock_irqrestore(&pidmap_lock, flags); call_rcu(&pid->rcu, delayed_put_pid); @@ -273,22 +269,26 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, INIT_HLIST_HEAD(&pid->inodes); upid = pid->numbers + ns->level; + idr_preload(GFP_KERNEL); spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; - pid->stashed = NULL; - pid->ino = ++pidfs_ino; + retval = pidfs_add_pid(pid); + if (retval) + goto out_unlock; for ( ; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. */ idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->pid_allocated++; } spin_unlock_irq(&pidmap_lock); + idr_preload_end(); return pid; out_unlock: spin_unlock_irq(&pidmap_lock); + idr_preload_end(); put_pid_ns(ns); out_free: |