diff options
author | Christian Brauner <brauner@kernel.org> | 2024-11-29 20:49:42 +0100 |
---|---|---|
committer | Christian Brauner <brauner@kernel.org> | 2024-12-14 12:40:40 +0100 |
commit | d8b47d051eab9729417ea35d3c27fbe8ebd9f5c6 (patch) | |
tree | 708d79442a8807c49c8e0cf06fae38ca17b778aa | |
parent | 40384c840ea1944d7c5a392e8975ed088ecf0b37 (diff) | |
parent | 230536ff6b06b199995687aa7fbf164970ebda85 (diff) |
Merge patch series "pidfs: file handle preliminaries"
Christian Brauner <brauner@kernel.org> says:
This reworks the inode number allocation for pidfs in order to support
file handles properly.
Recently we received a patchset that aims to enable file handle encoding
and decoding via name_to_handle_at(2) and open_by_handle_at(2).
A crucical step in the patch series is how to go from inode number to
struct pid without leaking information into unprivileged contexts. The
issue is that in order to find a struct pid the pid number in the
initial pid namespace must be encoded into the file handle via
name_to_handle_at(2). This can be used by containers using a separate
pid namespace to learn what the pid number of a given process in the
initial pid namespace is. While this is a weak information leak it could
be used in various exploits and in general is an ugly wart in the
design.
To solve this problem a new way is needed to lookup a struct pid based
on the inode number allocated for that struct pid. The other part is to
remove the custom inode number allocation on 32bit systems that is also
an ugly wart that should go away.
So, a new scheme is used that I was discusssing with Tejun some time
back. A cyclic ida is used for the lower 32 bits and a the high 32 bits
are used for the generation number. This gives a 64 bit inode number
that is unique on both 32 bit and 64 bit. The lower 32 bit number is
recycled slowly and can be used to lookup struct pids.
* patches from https://lore.kernel.org/r/20241129-work-pidfs-v2-0-61043d66fbce@kernel.org:
pidfs: support FS_IOC_GETVERSION
pidfs: remove 32bit inode number handling
pidfs: rework inode number allocation
Link: https://lore.kernel.org/r/20241129-work-pidfs-v2-0-61043d66fbce@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
-rw-r--r-- | fs/pidfs.c | 127 | ||||
-rw-r--r-- | include/linux/pidfs.h | 2 | ||||
-rw-r--r-- | kernel/pid.c | 14 |
3 files changed, 95 insertions, 48 deletions
diff --git a/fs/pidfs.c b/fs/pidfs.c index 618abb1fa1b8..8d62d900d20d 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -23,6 +23,79 @@ #include "internal.h" #include "mount.h" +static DEFINE_IDR(pidfs_ino_idr); + +static u32 pidfs_ino_upper_32_bits = 0; + +#if BITS_PER_LONG == 32 +/* + * On 32 bit systems the lower 32 bits are the inode number and + * the higher 32 bits are the generation number. The starting + * value for the inode number and the generation number is one. + */ +static u32 pidfs_ino_lower_32_bits = 1; + +static inline unsigned long pidfs_ino(u64 ino) +{ + return lower_32_bits(ino); +} + +/* On 32 bit the generation number are the upper 32 bits. */ +static inline u32 pidfs_gen(u64 ino) +{ + return upper_32_bits(ino); +} + +#else + +static u32 pidfs_ino_lower_32_bits = 0; + +/* On 64 bit simply return ino. */ +static inline unsigned long pidfs_ino(u64 ino) +{ + return ino; +} + +/* On 64 bit the generation number is 1. */ +static inline u32 pidfs_gen(u64 ino) +{ + return 1; +} +#endif + +/* + * Construct an inode number for struct pid in a way that we can use the + * lower 32bit to lookup struct pid independent of any pid numbers that + * could be leaked into userspace (e.g., via file handle encoding). + */ +int pidfs_add_pid(struct pid *pid) +{ + u32 upper; + int lower; + + /* + * Inode numbering for pidfs start at 2. This avoids collisions + * with the root inode which is 1 for pseudo filesystems. + */ + lower = idr_alloc_cyclic(&pidfs_ino_idr, pid, 2, 0, GFP_ATOMIC); + if (lower >= 0 && lower < pidfs_ino_lower_32_bits) + pidfs_ino_upper_32_bits++; + upper = pidfs_ino_upper_32_bits; + pidfs_ino_lower_32_bits = lower; + if (lower < 0) + return lower; + + pid->ino = ((u64)upper << 32) | lower; + pid->stashed = NULL; + return 0; +} + +/* The idr number to remove is the lower 32 bits of the inode. */ +void pidfs_remove_pid(struct pid *pid) +{ + idr_remove(&pidfs_ino_idr, lower_32_bits(pid->ino)); +} + #ifdef CONFIG_PROC_FS /** * pidfd_show_fdinfo - print information about a pidfd @@ -198,6 +271,14 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct ns_common *ns_common = NULL; struct pid_namespace *pid_ns; + if (cmd == FS_IOC_GETVERSION) { + if (!arg) + return -EINVAL; + + __u32 __user *argp = (__u32 __user *)arg; + return put_user(file_inode(file)->i_generation, argp); + } + task = get_pid_task(pid, PIDTYPE_PID); if (!task) return -ESRCH; @@ -318,40 +399,6 @@ struct pid *pidfd_pid(const struct file *file) static struct vfsmount *pidfs_mnt __ro_after_init; -#if BITS_PER_LONG == 32 -/* - * Provide a fallback mechanism for 32-bit systems so processes remain - * reliably comparable by inode number even on those systems. - */ -static DEFINE_IDA(pidfd_inum_ida); - -static int pidfs_inum(struct pid *pid, unsigned long *ino) -{ - int ret; - - ret = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1, - UINT_MAX, GFP_ATOMIC); - if (ret < 0) - return -ENOSPC; - - *ino = ret; - return 0; -} - -static inline void pidfs_free_inum(unsigned long ino) -{ - if (ino > 0) - ida_free(&pidfd_inum_ida, ino); -} -#else -static inline int pidfs_inum(struct pid *pid, unsigned long *ino) -{ - *ino = pid->ino; - return 0; -} -#define pidfs_free_inum(ino) ((void)(ino)) -#endif - /* * The vfs falls back to simple_setattr() if i_op->setattr() isn't * implemented. Let's reject it completely until we have a clean @@ -403,7 +450,6 @@ static void pidfs_evict_inode(struct inode *inode) clear_inode(inode); put_pid(pid); - pidfs_free_inum(inode->i_ino); } static const struct super_operations pidfs_sops = { @@ -429,17 +475,16 @@ static const struct dentry_operations pidfs_dentry_operations = { static int pidfs_init_inode(struct inode *inode, void *data) { + const struct pid *pid = data; + inode->i_private = data; inode->i_flags |= S_PRIVATE; inode->i_mode |= S_IRWXU; inode->i_op = &pidfs_inode_operations; inode->i_fop = &pidfs_file_operations; - /* - * Inode numbering for pidfs start at RESERVED_PIDS + 1. This - * avoids collisions with the root inode which is 1 for pseudo - * filesystems. - */ - return pidfs_inum(data, &inode->i_ino); + inode->i_ino = pidfs_ino(pid->ino); + inode->i_generation = pidfs_gen(pid->ino); + return 0; } static void pidfs_put_data(void *data) diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 75bdf9807802..2958652bb108 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -4,5 +4,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); void __init pidfs_init(void); +int pidfs_add_pid(struct pid *pid); +void pidfs_remove_pid(struct pid *pid); #endif /* _LINUX_PID_FS_H */ diff --git a/kernel/pid.c b/kernel/pid.c index 115448e89c3e..58567d6904b2 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -64,11 +64,6 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; -/* - * Pseudo filesystems start inode numbering after one. We use Reserved - * PIDs as a natural offset. - */ -static u64 pidfs_ino = RESERVED_PIDS; /* * PID-map pages start out as NULL, they get allocated upon @@ -158,6 +153,7 @@ void free_pid(struct pid *pid) idr_remove(&ns->idr, upid->nr); } + pidfs_remove_pid(pid); spin_unlock_irqrestore(&pidmap_lock, flags); call_rcu(&pid->rcu, delayed_put_pid); @@ -273,22 +269,26 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, INIT_HLIST_HEAD(&pid->inodes); upid = pid->numbers + ns->level; + idr_preload(GFP_KERNEL); spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; - pid->stashed = NULL; - pid->ino = ++pidfs_ino; + retval = pidfs_add_pid(pid); + if (retval) + goto out_unlock; for ( ; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. */ idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->pid_allocated++; } spin_unlock_irq(&pidmap_lock); + idr_preload_end(); return pid; out_unlock: spin_unlock_irq(&pidmap_lock); + idr_preload_end(); put_pid_ns(ns); out_free: |