diff options
| author | Christian Brauner <brauner@kernel.org> | 2025-09-02 11:37:34 +0200 | 
|---|---|---|
| committer | Christian Brauner <brauner@kernel.org> | 2025-09-02 11:40:35 +0200 | 
| commit | 46582a15c1742ff0dd9d2faffd62672a79603a42 (patch) | |
| tree | d4f36a7757be29fcc2537cb1b4b8d7c862aaeee0 /fs/proc/root.c | |
| parent | 998541db0ed257ab0682e4a392d8ced5f2d5ff6b (diff) | |
| parent | 5554d820f71c72fbe64e12c3d171908c5ef7257d (diff) | |
Merge patch series "procfs: make reference pidns more user-visible"
Aleksa Sarai <cyphar@cyphar.com> says:
Ever since the introduction of pid namespaces, procfs has had very
implicit behaviour surrounding them (the pidns used by a procfs mount is
auto-selected based on the mounting process's active pidns, and the
pidns itself is basically hidden once the mount has been constructed).
/* pidns mount option for procfs */
This implicit behaviour has historically meant that userspace was
required to do some special dances in order to configure the pidns of a
procfs mount as desired. Examples include:
 * In order to bypass the mnt_too_revealing() check, Kubernetes creates
   a procfs mount from an empty pidns so that user namespaced containers
   can be nested (without this, the nested containers would fail to
   mount procfs). But this requires forking off a helper process because
   this cannot be done in a single mount(2) call.
 * Container runtimes in general need to fork into a container before
   configuring its mounts, which can lead to security issues in the case
   of shared-pidns containers (a privileged process in the pidns can
   interact with your container runtime process). While
   SUID_DUMP_DISABLE and user namespaces make this less of an issue, the
   strict need for this due to a minor uAPI wart is kind of unfortunate.
Things would be much easier if there was a way for userspace to just
specify the pidns they want. Patch 1 implements a new "pidns" argument
which can be set using fsconfig(2):
    fsconfig(procfd, FSCONFIG_SET_FD, "pidns", NULL, nsfd);
    fsconfig(procfd, FSCONFIG_SET_STRING, "pidns", "/proc/self/ns/pid", 0);
or classic mount(2) / mount(8):
    // mount -t proc -o pidns=/proc/self/ns/pid proc /tmp/proc
    mount("proc", "/tmp/proc", "proc", MS_..., "pidns=/proc/self/ns/pid");
The initial security model I have in this RFC is to be as conservative
as possible and just mirror the security model for setns(2) -- which
means that you can only set pidns=... to a pid namespace that your
current pid namespace is a direct ancestor of and over which you have
CAP_SYS_ADMIN privileges. This fulfils the requirements of container
runtimes, but I suspect that this may be too strict for some use cases.
The pidns argument is not displayed in mountinfo -- it's not clear to me
what value it would make sense to show (maybe we could just use ns_dname
to provide an identifier for the namespace, but this number would be
fairly useless to userspace). I'm open to suggestions. Note that
PROCFS_GET_PID_NAMESPACE (see below) does at least let userspace get
information about this outside of mountinfo.
Note that you cannot change the pidns of an already-created procfs
instance. The primary reason is that allowing this to be changed would
require RCU-protecting proc_pid_ns(sb) and thus auditing all of
fs/proc/* and some of the users in fs/* to make sure they wouldn't UAF
the pid namespace. Since creating procfs instances is very cheap, it
seems unnecessary to overcomplicate this upfront. Trying to reconfigure
procfs this way errors out with -EBUSY.
* patches from https://lore.kernel.org/20250805-procfs-pidns-api-v4-0-705f984940e7@cyphar.com:
  selftests/proc: add tests for new pidns APIs
  procfs: add "pidns" mount option
  pidns: move is-ancestor logic to helper
Link: https://lore.kernel.org/20250805-procfs-pidns-api-v4-0-705f984940e7@cyphar.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
Diffstat (limited to 'fs/proc/root.c')
| -rw-r--r-- | fs/proc/root.c | 98 | 
1 files changed, 92 insertions, 6 deletions
| diff --git a/fs/proc/root.c b/fs/proc/root.c index ed86ac710384..fd1f1c8a939a 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -38,12 +38,14 @@ enum proc_param {  	Opt_gid,  	Opt_hidepid,  	Opt_subset, +	Opt_pidns,  };  static const struct fs_parameter_spec proc_fs_parameters[] = { -	fsparam_u32("gid",	Opt_gid), +	fsparam_u32("gid",		Opt_gid),  	fsparam_string("hidepid",	Opt_hidepid),  	fsparam_string("subset",	Opt_subset), +	fsparam_file_or_string("pidns",	Opt_pidns),  	{}  }; @@ -109,11 +111,66 @@ static int proc_parse_subset_param(struct fs_context *fc, char *value)  	return 0;  } +#ifdef CONFIG_PID_NS +static int proc_parse_pidns_param(struct fs_context *fc, +				  struct fs_parameter *param, +				  struct fs_parse_result *result) +{ +	struct proc_fs_context *ctx = fc->fs_private; +	struct pid_namespace *target, *active = task_active_pid_ns(current); +	struct ns_common *ns; +	struct file *ns_filp __free(fput) = NULL; + +	switch (param->type) { +	case fs_value_is_file: +		/* came through fsconfig, steal the file reference */ +		ns_filp = no_free_ptr(param->file); +		break; +	case fs_value_is_string: +		ns_filp = filp_open(param->string, O_RDONLY, 0); +		break; +	default: +		WARN_ON_ONCE(true); +		break; +	} +	if (!ns_filp) +		ns_filp = ERR_PTR(-EBADF); +	if (IS_ERR(ns_filp)) { +		errorfc(fc, "could not get file from pidns argument"); +		return PTR_ERR(ns_filp); +	} + +	if (!proc_ns_file(ns_filp)) +		return invalfc(fc, "pidns argument is not an nsfs file"); +	ns = get_proc_ns(file_inode(ns_filp)); +	if (ns->ops->type != CLONE_NEWPID) +		return invalfc(fc, "pidns argument is not a pidns file"); +	target = container_of(ns, struct pid_namespace, ns); + +	/* +	 * pidns= is shorthand for joining the pidns to get a fsopen fd, so the +	 * permission model should be the same as pidns_install(). 
+	 */ +	if (!ns_capable(target->user_ns, CAP_SYS_ADMIN)) { +		errorfc(fc, "insufficient permissions to set pidns"); +		return -EPERM; +	} +	if (!pidns_is_ancestor(target, active)) +		return invalfc(fc, "cannot set pidns to non-descendant pidns"); + +	put_pid_ns(ctx->pid_ns); +	ctx->pid_ns = get_pid_ns(target); +	put_user_ns(fc->user_ns); +	fc->user_ns = get_user_ns(ctx->pid_ns->user_ns); +	return 0; +} +#endif /* CONFIG_PID_NS */ +  static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)  {  	struct proc_fs_context *ctx = fc->fs_private;  	struct fs_parse_result result; -	int opt; +	int opt, err;  	opt = fs_parse(fc, proc_fs_parameters, param, &result);  	if (opt < 0) @@ -125,14 +182,38 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)  		break;  	case Opt_hidepid: -		if (proc_parse_hidepid_param(fc, param)) -			return -EINVAL; +		err = proc_parse_hidepid_param(fc, param); +		if (err) +			return err;  		break;  	case Opt_subset: -		if (proc_parse_subset_param(fc, param->string) < 0) -			return -EINVAL; +		err = proc_parse_subset_param(fc, param->string); +		if (err) +			return err; +		break; + +	case Opt_pidns: +#ifdef CONFIG_PID_NS +		/* +		 * We would have to RCU-protect every proc_pid_ns() or +		 * proc_sb_info() access if we allowed this to be reconfigured +		 * for an existing procfs instance. Luckily, procfs instances +		 * are cheap to create, and mount-beneath would let you +		 * atomically replace an instance even with overmounts. 
+		 */ +		if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { +			errorfc(fc, "cannot reconfigure pidns for existing procfs"); +			return -EBUSY; +		} +		err = proc_parse_pidns_param(fc, param, &result); +		if (err) +			return err;  		break; +#else +		errorfc(fc, "pidns mount flag not supported on this system"); +		return -EOPNOTSUPP; +#endif  	default:  		return -EINVAL; @@ -154,6 +235,11 @@ static void proc_apply_options(struct proc_fs_info *fs_info,  		fs_info->hide_pid = ctx->hidepid;  	if (ctx->mask & (1 << Opt_subset))  		fs_info->pidonly = ctx->pidonly; +	if (ctx->mask & (1 << Opt_pidns) && +	    !WARN_ON_ONCE(fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)) { +		put_pid_ns(fs_info->pid_ns); +		fs_info->pid_ns = get_pid_ns(ctx->pid_ns); +	}  }  static int proc_fill_super(struct super_block *s, struct fs_context *fc) | 
