summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-12-01 09:47:41 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2025-12-01 09:47:41 -0800
commit415d34b92c1f921a9ff3c38f56319cbc5536f642 (patch)
tree461dc8621de93dcc175f8bef9233a41d1a47e23f /include
parentebaeabfa5ab711a9b69b686d58329e258fdae75f (diff)
parenta71e4f103aed69e7a11ea913312726bb194c76ee (diff)
Merge tag 'namespace-6.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull namespace updates from Christian Brauner: "This contains substantial namespace infrastructure changes including a new system call, active reference counting, and extensive header cleanups. The branch depends on the shared kbuild branch for -fms-extensions support. Features: - listns() system call Add a new listns() system call that allows userspace to iterate through namespaces in the system. This provides a programmatic interface to discover and inspect namespaces, addressing longstanding limitations: Currently, there is no direct way for userspace to enumerate namespaces. Applications must resort to scanning /proc/*/ns/ across all processes, which is: - Inefficient - requires iterating over all processes - Incomplete - misses namespaces not attached to any running process but kept alive by file descriptors, bind mounts, or parent references - Permission-heavy - requires access to /proc for many processes - No ordering or ownership information - No filtering per namespace type The listns() system call solves these problems: ssize_t listns(const struct ns_id_req *req, u64 *ns_ids, size_t nr_ns_ids, unsigned int flags); struct ns_id_req { __u32 size; __u32 spare; __u64 ns_id; struct /* listns */ { __u32 ns_type; __u32 spare2; __u64 user_ns_id; }; }; Features include: - Pagination support for large namespace sets - Filtering by namespace type (MNT_NS, NET_NS, USER_NS, etc.) - Filtering by owning user namespace - Permission checks respecting namespace isolation - Active Reference Counting Introduce an active reference count that tracks namespace visibility to userspace. A namespace is visible in the following cases: - The namespace is in use by a task - The namespace is persisted through a VFS object (namespace file descriptor or bind-mount) - The namespace is a hierarchical type and is the parent of child namespaces The active reference count does not regulate lifetime (that's still done by the normal reference count) - it only regulates visibility to namespace file handles and listns(). This prevents resurrection of namespaces that are pinned only for internal kernel reasons (e.g., user namespaces held by file->f_cred, lazy TLB references on idle CPUs, etc.) which should not be accessible via (1)-(3). - Unified Namespace Tree Introduce a unified tree structure for all namespaces with: - Fixed IDs assigned to initial namespaces - Lookup based solely on inode number - Maintained list of owned namespaces per user namespace - Simplified rbtree comparison helpers Cleanups - Header Reorganization: - Move namespace types into separate header (ns_common_types.h) - Decouple nstree from ns_common header - Move nstree types into separate header - Switch to new ns_tree_{node,root} structures with helper functions - Use guards for ns_tree_lock - Initial Namespace Reference Count Optimization - Make all reference counts on initial namespaces a nop to avoid pointless cacheline ping-pong for namespaces that can never go away - Drop custom reference count initialization for initial namespaces - Add NS_COMMON_INIT() macro and use it for all namespaces - pid: rely on common reference count behavior - Miscellaneous Cleanups - Rename exit_task_namespaces() to exit_nsproxy_namespaces() - Rename is_initial_namespace() and make argument const - Use boolean to indicate anonymous mount namespace - Simplify owner list iteration in nstree - nsfs: raise SB_I_NODEV, SB_I_NOEXEC, and DCACHE_DONTCACHE explicitly - nsfs: use inode_just_drop() - pidfs: raise DCACHE_DONTCACHE explicitly - pidfs: simplify PIDFD_GET__NAMESPACE ioctls - libfs: allow to specify s_d_flags - cgroup: add cgroup namespace to tree after owner is set - nsproxy: fix free_nsproxy() and simplify create_new_namespaces() Fixes: - setns(pidfd, ...) race condition Fix a subtle race when using pidfds with setns(). When the target task exits after prepare_nsset() but before commit_nsset(), the namespace's active reference count might have been dropped. If setns() then installs the namespaces, it would bump the active reference count from zero without taking the required reference on the owner namespace, leading to underflow when later decremented. The fix resurrects the ownership chain if necessary - if the caller succeeded in grabbing passive references, the setns() should succeed even if the target task exits or gets reaped. - Return EFAULT on put_user() error instead of success - Make sure references are dropped outside of RCU lock (some namespaces like mount namespace sleep when putting the last reference) - Don't skip active reference count initialization for network namespace - Add asserts for active refcount underflow - Add asserts for initial namespace reference counts (both passive and active) - ipc: enable is_ns_init_id() assertions - Fix kernel-doc comments for internal nstree functions - Selftests - 15 active reference count tests - 9 listns() functionality tests - 7 listns() permission tests - 12 inactive namespace resurrection tests - 3 threaded active reference count tests - commit_creds() active reference tests - Pagination and stress tests - EFAULT handling test - nsid tests fixes" * tag 'namespace-6.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (103 commits) pidfs: simplify PIDFD_GET_<type>_NAMESPACE ioctls nstree: fix kernel-doc comments for internal functions nsproxy: fix free_nsproxy() and simplify create_new_namespaces() selftests/namespaces: fix nsid tests ns: drop custom reference count initialization for initial namespaces pid: rely on common reference count behavior ns: add asserts for initial namespace active reference counts ns: add asserts for initial namespace reference counts ns: make all reference counts on initial namespace a nop ipc: enable is_ns_init_id() assertions fs: use boolean to indicate anonymous mount namespace ns: rename is_initial_namespace() ns: make is_initial_namespace() argument const nstree: use guards for ns_tree_lock nstree: simplify owner list iteration nstree: switch to new structures nstree: add helper to operate on struct ns_tree_{node,root} nstree: move nstree types into separate header nstree: decouple from ns_common header ns: move namespace types into separate header ...
Diffstat (limited to 'include')
-rw-r--r--include/linux/ns/ns_common_types.h196
-rw-r--r--include/linux/ns/nstree_types.h55
-rw-r--r--include/linux/ns_common.h233
-rw-r--r--include/linux/nsfs.h3
-rw-r--r--include/linux/nsproxy.h9
-rw-r--r--include/linux/nstree.h52
-rw-r--r--include/linux/pid_namespace.h3
-rw-r--r--include/linux/pseudo_fs.h1
-rw-r--r--include/linux/syscalls.h4
-rw-r--r--include/linux/user_namespace.h4
-rw-r--r--include/uapi/asm-generic/unistd.h4
-rw-r--r--include/uapi/linux/nsfs.h58
12 files changed, 479 insertions, 143 deletions
diff --git a/include/linux/ns/ns_common_types.h b/include/linux/ns/ns_common_types.h
new file mode 100644
index 000000000000..b332b019b29c
--- /dev/null
+++ b/include/linux/ns/ns_common_types.h
@@ -0,0 +1,196 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_NS_COMMON_TYPES_H
+#define _LINUX_NS_COMMON_TYPES_H
+
+#include <linux/atomic.h>
+#include <linux/ns/nstree_types.h>
+#include <linux/rbtree.h>
+#include <linux/refcount.h>
+#include <linux/types.h>
+
+struct cgroup_namespace;
+struct dentry;
+struct ipc_namespace;
+struct mnt_namespace;
+struct net;
+struct pid_namespace;
+struct proc_ns_operations;
+struct time_namespace;
+struct user_namespace;
+struct uts_namespace;
+
+extern struct cgroup_namespace init_cgroup_ns;
+extern struct ipc_namespace init_ipc_ns;
+extern struct mnt_namespace init_mnt_ns;
+extern struct net init_net;
+extern struct pid_namespace init_pid_ns;
+extern struct time_namespace init_time_ns;
+extern struct user_namespace init_user_ns;
+extern struct uts_namespace init_uts_ns;
+
+extern const struct proc_ns_operations cgroupns_operations;
+extern const struct proc_ns_operations ipcns_operations;
+extern const struct proc_ns_operations mntns_operations;
+extern const struct proc_ns_operations netns_operations;
+extern const struct proc_ns_operations pidns_operations;
+extern const struct proc_ns_operations pidns_for_children_operations;
+extern const struct proc_ns_operations timens_operations;
+extern const struct proc_ns_operations timens_for_children_operations;
+extern const struct proc_ns_operations userns_operations;
+extern const struct proc_ns_operations utsns_operations;
+
+/*
+ * Namespace lifetimes are managed via a two-tier reference counting model:
+ *
+ * (1) __ns_ref (refcount_t): Main reference count tracking memory
+ * lifetime. Controls when the namespace structure itself is freed.
+ * It also pins the namespace on the namespace trees whereas (2)
+ * only regulates their visibility to userspace.
+ *
+ * (2) __ns_ref_active (atomic_t): Reference count tracking active users.
+ * Controls visibility of the namespace in the namespace trees.
+ * Any live task that uses the namespace (via nsproxy or cred) holds
+ * an active reference. Any open file descriptor or bind-mount of
+ * the namespace holds an active reference. Once all tasks have
+ * called exited their namespaces and all file descriptors and
+ * bind-mounts have been released the active reference count drops
+ * to zero and the namespace becomes inactive. IOW, the namespace
+ * cannot be listed or opened via file handles anymore.
+ *
+ * Note that it is valid to transition from active to inactive and
+ * back from inactive to active e.g., when resurrecting an inactive
+ * namespace tree via the SIOCGSKNS ioctl().
+ *
+ * Relationship and lifecycle states:
+ *
+ * - Active (__ns_ref_active > 0):
+ * Namespace is actively used and visible to userspace. The namespace
+ * can be reopened via /proc/<pid>/ns/<ns_type>, via namespace file
+ * handles, or discovered via listns().
+ *
+ * - Inactive (__ns_ref_active == 0, __ns_ref > 0):
+ * No tasks are actively using the namespace and it isn't pinned by
+ * any bind-mounts or open file descriptors anymore. But the namespace
+ * is still kept alive by internal references. For example, the user
+ * namespace could be pinned by an open file through file->f_cred
+ * references when one of the now defunct tasks had opened a file and
+ * handed the file descriptor off to another process via a UNIX
+ * sockets. Such references keep the namespace structure alive through
+ * __ns_ref but will not hold an active reference.
+ *
+ * - Destroyed (__ns_ref == 0):
+ * No references remain. The namespace is removed from the tree and freed.
+ *
+ * State transitions:
+ *
+ * Active -> Inactive:
+ * When the last task using the namespace exits it drops its active
+ * references to all namespaces. However, user and pid namespaces
+ * remain accessible until the task has been reaped.
+ *
+ * Inactive -> Active:
+ * An inactive namespace tree might be resurrected due to e.g., the
+ * SIOCGSKNS ioctl() on a socket.
+ *
+ * Inactive -> Destroyed:
+ * When __ns_ref drops to zero the namespace is removed from the
+ * namespaces trees and the memory is freed (after RCU grace period).
+ *
+ * Initial namespaces:
+ * Boot-time namespaces (init_net, init_pid_ns, etc.) start with
+ * __ns_ref_active = 1 and remain active forever.
+ *
+ * @ns_type: type of namespace (e.g., CLONE_NEWNET)
+ * @stashed: cached dentry to be used by the vfs
+ * @ops: namespace operations
+ * @inum: namespace inode number (quickly recycled for non-initial namespaces)
+ * @__ns_ref: main reference count (do not use directly)
+ * @ns_tree: namespace tree nodes and active reference count
+ */
+struct ns_common {
+ u32 ns_type;
+ struct dentry *stashed;
+ const struct proc_ns_operations *ops;
+ unsigned int inum;
+ refcount_t __ns_ref; /* do not use directly */
+ union {
+ struct ns_tree;
+ struct rcu_head ns_rcu;
+ };
+};
+
+#define to_ns_common(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: &(__ns)->ns, \
+ const struct cgroup_namespace *: &(__ns)->ns, \
+ struct ipc_namespace *: &(__ns)->ns, \
+ const struct ipc_namespace *: &(__ns)->ns, \
+ struct mnt_namespace *: &(__ns)->ns, \
+ const struct mnt_namespace *: &(__ns)->ns, \
+ struct net *: &(__ns)->ns, \
+ const struct net *: &(__ns)->ns, \
+ struct pid_namespace *: &(__ns)->ns, \
+ const struct pid_namespace *: &(__ns)->ns, \
+ struct time_namespace *: &(__ns)->ns, \
+ const struct time_namespace *: &(__ns)->ns, \
+ struct user_namespace *: &(__ns)->ns, \
+ const struct user_namespace *: &(__ns)->ns, \
+ struct uts_namespace *: &(__ns)->ns, \
+ const struct uts_namespace *: &(__ns)->ns)
+
+#define ns_init_inum(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: CGROUP_NS_INIT_INO, \
+ struct ipc_namespace *: IPC_NS_INIT_INO, \
+ struct mnt_namespace *: MNT_NS_INIT_INO, \
+ struct net *: NET_NS_INIT_INO, \
+ struct pid_namespace *: PID_NS_INIT_INO, \
+ struct time_namespace *: TIME_NS_INIT_INO, \
+ struct user_namespace *: USER_NS_INIT_INO, \
+ struct uts_namespace *: UTS_NS_INIT_INO)
+
+#define ns_init_ns(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: &init_cgroup_ns, \
+ struct ipc_namespace *: &init_ipc_ns, \
+ struct mnt_namespace *: &init_mnt_ns, \
+ struct net *: &init_net, \
+ struct pid_namespace *: &init_pid_ns, \
+ struct time_namespace *: &init_time_ns, \
+ struct user_namespace *: &init_user_ns, \
+ struct uts_namespace *: &init_uts_ns)
+
+#define ns_init_id(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: CGROUP_NS_INIT_ID, \
+ struct ipc_namespace *: IPC_NS_INIT_ID, \
+ struct mnt_namespace *: MNT_NS_INIT_ID, \
+ struct net *: NET_NS_INIT_ID, \
+ struct pid_namespace *: PID_NS_INIT_ID, \
+ struct time_namespace *: TIME_NS_INIT_ID, \
+ struct user_namespace *: USER_NS_INIT_ID, \
+ struct uts_namespace *: UTS_NS_INIT_ID)
+
+#define to_ns_operations(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \
+ struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \
+ struct mnt_namespace *: &mntns_operations, \
+ struct net *: (IS_ENABLED(CONFIG_NET_NS) ? &netns_operations : NULL), \
+ struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \
+ struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \
+ struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \
+ struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL))
+
+#define ns_common_type(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: CLONE_NEWCGROUP, \
+ struct ipc_namespace *: CLONE_NEWIPC, \
+ struct mnt_namespace *: CLONE_NEWNS, \
+ struct net *: CLONE_NEWNET, \
+ struct pid_namespace *: CLONE_NEWPID, \
+ struct time_namespace *: CLONE_NEWTIME, \
+ struct user_namespace *: CLONE_NEWUSER, \
+ struct uts_namespace *: CLONE_NEWUTS)
+
+#endif /* _LINUX_NS_COMMON_TYPES_H */
diff --git a/include/linux/ns/nstree_types.h b/include/linux/ns/nstree_types.h
new file mode 100644
index 000000000000..2fb28ee31efb
--- /dev/null
+++ b/include/linux/ns/nstree_types.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
+#ifndef _LINUX_NSTREE_TYPES_H
+#define _LINUX_NSTREE_TYPES_H
+
+#include <linux/rbtree.h>
+#include <linux/list.h>
+
+/**
+ * struct ns_tree_root - Root of a namespace tree
+ * @ns_rb: Red-black tree root for efficient lookups
+ * @ns_list_head: List head for sequential iteration
+ *
+ * Each namespace tree maintains both an rbtree (for O(log n) lookups)
+ * and a list (for efficient sequential iteration). The list is kept in
+ * the same sorted order as the rbtree.
+ */
+struct ns_tree_root {
+ struct rb_root ns_rb;
+ struct list_head ns_list_head;
+};
+
+/**
+ * struct ns_tree_node - Node in a namespace tree
+ * @ns_node: Red-black tree node
+ * @ns_list_entry: List entry for sequential iteration
+ *
+ * Represents a namespace's position in a tree. Each namespace has
+ * multiple tree nodes for different trees (unified, per-type, owner).
+ */
+struct ns_tree_node {
+ struct rb_node ns_node;
+ struct list_head ns_list_entry;
+};
+
+/**
+ * struct ns_tree - Namespace tree nodes and active reference count
+ * @ns_id: Unique namespace identifier
+ * @__ns_ref_active: Active reference count (do not use directly)
+ * @ns_unified_node: Node in the global namespace tree
+ * @ns_tree_node: Node in the per-type namespace tree
+ * @ns_owner_node: Node in the owner namespace's tree of owned namespaces
+ * @ns_owner_root: Root of the tree of namespaces owned by this namespace
+ * (only used when this namespace is an owner)
+ */
+struct ns_tree {
+ u64 ns_id;
+ atomic_t __ns_ref_active;
+ struct ns_tree_node ns_unified_node;
+ struct ns_tree_node ns_tree_node;
+ struct ns_tree_node ns_owner_node;
+ struct ns_tree_root ns_owner_root;
+};
+
+#endif /* _LINUX_NSTREE_TYPES_H */
diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index f5b68b8abb54..825f5865bfc5 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -2,122 +2,44 @@
#ifndef _LINUX_NS_COMMON_H
#define _LINUX_NS_COMMON_H
+#include <linux/ns/ns_common_types.h>
#include <linux/refcount.h>
-#include <linux/rbtree.h>
+#include <linux/vfsdebug.h>
#include <uapi/linux/sched.h>
+#include <uapi/linux/nsfs.h>
-struct proc_ns_operations;
-
-struct cgroup_namespace;
-struct ipc_namespace;
-struct mnt_namespace;
-struct net;
-struct pid_namespace;
-struct time_namespace;
-struct user_namespace;
-struct uts_namespace;
-
-extern struct cgroup_namespace init_cgroup_ns;
-extern struct ipc_namespace init_ipc_ns;
-extern struct mnt_namespace init_mnt_ns;
-extern struct net init_net;
-extern struct pid_namespace init_pid_ns;
-extern struct time_namespace init_time_ns;
-extern struct user_namespace init_user_ns;
-extern struct uts_namespace init_uts_ns;
-
-extern const struct proc_ns_operations netns_operations;
-extern const struct proc_ns_operations utsns_operations;
-extern const struct proc_ns_operations ipcns_operations;
-extern const struct proc_ns_operations pidns_operations;
-extern const struct proc_ns_operations pidns_for_children_operations;
-extern const struct proc_ns_operations userns_operations;
-extern const struct proc_ns_operations mntns_operations;
-extern const struct proc_ns_operations cgroupns_operations;
-extern const struct proc_ns_operations timens_operations;
-extern const struct proc_ns_operations timens_for_children_operations;
-
-struct ns_common {
- u32 ns_type;
- struct dentry *stashed;
- const struct proc_ns_operations *ops;
- unsigned int inum;
- refcount_t __ns_ref; /* do not use directly */
- union {
- struct {
- u64 ns_id;
- struct rb_node ns_tree_node;
- struct list_head ns_list_node;
- };
- struct rcu_head ns_rcu;
- };
-};
-
+bool is_current_namespace(struct ns_common *ns);
int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum);
void __ns_common_free(struct ns_common *ns);
+struct ns_common *__must_check ns_owner(struct ns_common *ns);
+
+static __always_inline bool is_ns_init_inum(const struct ns_common *ns)
+{
+ VFS_WARN_ON_ONCE(ns->inum == 0);
+ return unlikely(in_range(ns->inum, MNT_NS_INIT_INO,
+ IPC_NS_INIT_INO - MNT_NS_INIT_INO + 1));
+}
+
+static __always_inline bool is_ns_init_id(const struct ns_common *ns)
+{
+ VFS_WARN_ON_ONCE(ns->ns_id == 0);
+ return ns->ns_id <= NS_LAST_INIT_ID;
+}
-#define to_ns_common(__ns) \
- _Generic((__ns), \
- struct cgroup_namespace *: &(__ns)->ns, \
- const struct cgroup_namespace *: &(__ns)->ns, \
- struct ipc_namespace *: &(__ns)->ns, \
- const struct ipc_namespace *: &(__ns)->ns, \
- struct mnt_namespace *: &(__ns)->ns, \
- const struct mnt_namespace *: &(__ns)->ns, \
- struct net *: &(__ns)->ns, \
- const struct net *: &(__ns)->ns, \
- struct pid_namespace *: &(__ns)->ns, \
- const struct pid_namespace *: &(__ns)->ns, \
- struct time_namespace *: &(__ns)->ns, \
- const struct time_namespace *: &(__ns)->ns, \
- struct user_namespace *: &(__ns)->ns, \
- const struct user_namespace *: &(__ns)->ns, \
- struct uts_namespace *: &(__ns)->ns, \
- const struct uts_namespace *: &(__ns)->ns)
-
-#define ns_init_inum(__ns) \
- _Generic((__ns), \
- struct cgroup_namespace *: CGROUP_NS_INIT_INO, \
- struct ipc_namespace *: IPC_NS_INIT_INO, \
- struct mnt_namespace *: MNT_NS_INIT_INO, \
- struct net *: NET_NS_INIT_INO, \
- struct pid_namespace *: PID_NS_INIT_INO, \
- struct time_namespace *: TIME_NS_INIT_INO, \
- struct user_namespace *: USER_NS_INIT_INO, \
- struct uts_namespace *: UTS_NS_INIT_INO)
-
-#define ns_init_ns(__ns) \
- _Generic((__ns), \
- struct cgroup_namespace *: &init_cgroup_ns, \
- struct ipc_namespace *: &init_ipc_ns, \
- struct mnt_namespace *: &init_mnt_ns, \
- struct net *: &init_net, \
- struct pid_namespace *: &init_pid_ns, \
- struct time_namespace *: &init_time_ns, \
- struct user_namespace *: &init_user_ns, \
- struct uts_namespace *: &init_uts_ns)
-
-#define to_ns_operations(__ns) \
- _Generic((__ns), \
- struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \
- struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \
- struct mnt_namespace *: &mntns_operations, \
- struct net *: (IS_ENABLED(CONFIG_NET_NS) ? &netns_operations : NULL), \
- struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \
- struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \
- struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \
- struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL))
-
-#define ns_common_type(__ns) \
- _Generic((__ns), \
- struct cgroup_namespace *: CLONE_NEWCGROUP, \
- struct ipc_namespace *: CLONE_NEWIPC, \
- struct mnt_namespace *: CLONE_NEWNS, \
- struct net *: CLONE_NEWNET, \
- struct pid_namespace *: CLONE_NEWPID, \
- struct time_namespace *: CLONE_NEWTIME, \
- struct user_namespace *: CLONE_NEWUSER, \
- struct uts_namespace *: CLONE_NEWUTS)
+#define NS_COMMON_INIT(nsname) \
+{ \
+ .ns_type = ns_common_type(&nsname), \
+ .ns_id = ns_init_id(&nsname), \
+ .inum = ns_init_inum(&nsname), \
+ .ops = to_ns_operations(&nsname), \
+ .stashed = NULL, \
+ .__ns_ref = REFCOUNT_INIT(1), \
+ .__ns_ref_active = ATOMIC_INIT(1), \
+ .ns_unified_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_unified_node.ns_list_entry), \
+ .ns_tree_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_tree_node.ns_list_entry), \
+ .ns_owner_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_owner_node.ns_list_entry), \
+ .ns_owner_root.ns_list_head = LIST_HEAD_INIT(nsname.ns.ns_owner_root.ns_list_head), \
+}
#define ns_common_init(__ns) \
__ns_common_init(to_ns_common(__ns), \
@@ -133,21 +55,96 @@ void __ns_common_free(struct ns_common *ns);
#define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns)))
+static __always_inline __must_check int __ns_ref_active_read(const struct ns_common *ns)
+{
+ return atomic_read(&ns->__ns_ref_active);
+}
+
+static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns)
+{
+ return refcount_read(&ns->__ns_ref);
+}
+
static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns)
{
- return refcount_dec_and_test(&ns->__ns_ref);
+ if (is_ns_init_id(ns)) {
+ VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1);
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1);
+ return false;
+ }
+ if (refcount_dec_and_test(&ns->__ns_ref)) {
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns));
+ return true;
+ }
+ return false;
}
static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns)
{
- return refcount_inc_not_zero(&ns->__ns_ref);
+ if (is_ns_init_id(ns)) {
+ VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1);
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1);
+ return true;
+ }
+ if (refcount_inc_not_zero(&ns->__ns_ref))
+ return true;
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns));
+ return false;
}
-#define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->__ns_ref)
-#define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->__ns_ref)
-#define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns)))
-#define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns)))
-#define ns_ref_put_and_lock(__ns, __lock) \
- refcount_dec_and_lock(&to_ns_common((__ns))->__ns_ref, (__lock))
+static __always_inline void __ns_ref_inc(struct ns_common *ns)
+{
+ if (is_ns_init_id(ns)) {
+ VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1);
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1);
+ return;
+ }
+ refcount_inc(&ns->__ns_ref);
+}
+
+static __always_inline __must_check bool __ns_ref_dec_and_lock(struct ns_common *ns,
+ spinlock_t *ns_lock)
+{
+ if (is_ns_init_id(ns)) {
+ VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1);
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1);
+ return false;
+ }
+ return refcount_dec_and_lock(&ns->__ns_ref, ns_lock);
+}
+
+#define ns_ref_read(__ns) __ns_ref_read(to_ns_common((__ns)))
+#define ns_ref_inc(__ns) \
+ do { if (__ns) __ns_ref_inc(to_ns_common((__ns))); } while (0)
+#define ns_ref_get(__ns) \
+ ((__ns) ? __ns_ref_get(to_ns_common((__ns))) : false)
+#define ns_ref_put(__ns) \
+ ((__ns) ? __ns_ref_put(to_ns_common((__ns))) : false)
+#define ns_ref_put_and_lock(__ns, __ns_lock) \
+ ((__ns) ? __ns_ref_dec_and_lock(to_ns_common((__ns)), __ns_lock) : false)
+
+#define ns_ref_active_read(__ns) \
+ ((__ns) ? __ns_ref_active_read(to_ns_common(__ns)) : 0)
+
+void __ns_ref_active_put(struct ns_common *ns);
+
+#define ns_ref_active_put(__ns) \
+ do { if (__ns) __ns_ref_active_put(to_ns_common(__ns)); } while (0)
+
+static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns)
+{
+ if (!__ns_ref_active_read(ns)) {
+ VFS_WARN_ON_ONCE(is_ns_init_id(ns));
+ return NULL;
+ }
+ if (!__ns_ref_get(ns))
+ return NULL;
+ return ns;
+}
+
+void __ns_ref_active_get(struct ns_common *ns);
+
+#define ns_ref_active_get(__ns) \
+ do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0)
#endif
diff --git a/include/linux/nsfs.h b/include/linux/nsfs.h
index e5a5fa83d36b..731b67fc2fec 100644
--- a/include/linux/nsfs.h
+++ b/include/linux/nsfs.h
@@ -37,4 +37,7 @@ void nsfs_init(void);
#define current_in_namespace(__ns) (__current_namespace_from_type(__ns) == __ns)
+void nsproxy_ns_active_get(struct nsproxy *ns);
+void nsproxy_ns_active_put(struct nsproxy *ns);
+
#endif /* _LINUX_NSFS_H */
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index bd118a187dec..5a67648721c7 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -93,10 +93,13 @@ static inline struct cred *nsset_cred(struct nsset *set)
*/
int copy_namespaces(u64 flags, struct task_struct *tsk);
-void exit_task_namespaces(struct task_struct *tsk);
+void switch_cred_namespaces(const struct cred *old, const struct cred *new);
+void exit_nsproxy_namespaces(struct task_struct *tsk);
+void get_cred_namespaces(struct task_struct *tsk);
+void exit_cred_namespaces(struct task_struct *tsk);
void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
int exec_task_namespaces(void);
-void free_nsproxy(struct nsproxy *ns);
+void deactivate_nsproxy(struct nsproxy *ns);
int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
struct cred *, struct fs_struct *);
int __init nsproxy_cache_init(void);
@@ -104,7 +107,7 @@ int __init nsproxy_cache_init(void);
static inline void put_nsproxy(struct nsproxy *ns)
{
if (refcount_dec_and_test(&ns->count))
- free_nsproxy(ns);
+ deactivate_nsproxy(ns);
}
static inline void get_nsproxy(struct nsproxy *ns)
diff --git a/include/linux/nstree.h b/include/linux/nstree.h
index 8b8636690473..175e4625bfa6 100644
--- a/include/linux/nstree.h
+++ b/include/linux/nstree.h
@@ -1,22 +1,34 @@
/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
#ifndef _LINUX_NSTREE_H
#define _LINUX_NSTREE_H
-#include <linux/ns_common.h>
+#include <linux/ns/nstree_types.h>
#include <linux/nsproxy.h>
#include <linux/rbtree.h>
#include <linux/seqlock.h>
#include <linux/rculist.h>
#include <linux/cookie.h>
+#include <uapi/linux/nsfs.h>
-extern struct ns_tree cgroup_ns_tree;
-extern struct ns_tree ipc_ns_tree;
-extern struct ns_tree mnt_ns_tree;
-extern struct ns_tree net_ns_tree;
-extern struct ns_tree pid_ns_tree;
-extern struct ns_tree time_ns_tree;
-extern struct ns_tree user_ns_tree;
-extern struct ns_tree uts_ns_tree;
+struct ns_common;
+
+extern struct ns_tree_root cgroup_ns_tree;
+extern struct ns_tree_root ipc_ns_tree;
+extern struct ns_tree_root mnt_ns_tree;
+extern struct ns_tree_root net_ns_tree;
+extern struct ns_tree_root pid_ns_tree;
+extern struct ns_tree_root time_ns_tree;
+extern struct ns_tree_root user_ns_tree;
+extern struct ns_tree_root uts_ns_tree;
+
+void ns_tree_node_init(struct ns_tree_node *node);
+void ns_tree_root_init(struct ns_tree_root *root);
+bool ns_tree_node_empty(const struct ns_tree_node *node);
+struct rb_node *ns_tree_node_add(struct ns_tree_node *node,
+ struct ns_tree_root *root,
+ int (*cmp)(struct rb_node *, const struct rb_node *));
+void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root);
#define to_ns_tree(__ns) \
_Generic((__ns), \
@@ -29,17 +41,21 @@ extern struct ns_tree uts_ns_tree;
struct user_namespace *: &(user_ns_tree), \
struct uts_namespace *: &(uts_ns_tree))
-u64 ns_tree_gen_id(struct ns_common *ns);
-void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree);
-void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree);
+#define ns_tree_gen_id(__ns) \
+ __ns_tree_gen_id(to_ns_common(__ns), \
+ (((__ns) == ns_init_ns(__ns)) ? ns_init_id(__ns) : 0))
+
+u64 __ns_tree_gen_id(struct ns_common *ns, u64 id);
+void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree_root *ns_tree);
+void __ns_tree_remove(struct ns_common *ns, struct ns_tree_root *ns_tree);
struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type);
struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns,
- struct ns_tree *ns_tree,
+ struct ns_tree_root *ns_tree,
bool previous);
-static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree)
+static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree_root *ns_tree, u64 id)
{
- ns_tree_gen_id(ns);
+ __ns_tree_gen_id(ns, id);
__ns_tree_add_raw(ns, ns_tree);
}
@@ -59,7 +75,9 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree)
* This function assigns a new id to the namespace and adds it to the
* appropriate namespace tree and list.
*/
-#define ns_tree_add(__ns) __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns))
+#define ns_tree_add(__ns) \
+ __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns), \
+ (((__ns) == ns_init_ns(__ns)) ? ns_init_id(__ns) : 0))
/**
* ns_tree_remove - Remove a namespace from a namespace tree
@@ -73,6 +91,6 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree)
#define ns_tree_adjoined_rcu(__ns, __previous) \
__ns_tree_adjoined_rcu(to_ns_common(__ns), to_ns_tree(__ns), __previous)
-#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node))
+#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node.ns_node))
#endif /* _LINUX_NSTREE_H */
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 445517a72ad0..0e7ae12c96d2 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -61,8 +61,7 @@ static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
{
- if (ns != &init_pid_ns)
- ns_ref_inc(ns);
+ ns_ref_inc(ns);
return ns;
}
diff --git a/include/linux/pseudo_fs.h b/include/linux/pseudo_fs.h
index 2503f7625d65..a651e60d9410 100644
--- a/include/linux/pseudo_fs.h
+++ b/include/linux/pseudo_fs.h
@@ -9,6 +9,7 @@ struct pseudo_fs_context {
const struct xattr_handler * const *xattr;
const struct dentry_operations *dops;
unsigned long magic;
+ unsigned int s_d_flags;
};
struct pseudo_fs_context *init_pseudo(struct fs_context *fc,
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 66c06fcdfe19..cf84d98964b2 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -77,6 +77,7 @@ struct cachestat_range;
struct cachestat;
struct statmount;
struct mnt_id_req;
+struct ns_id_req;
struct xattr_args;
struct file_attr;
@@ -437,6 +438,9 @@ asmlinkage long sys_statmount(const struct mnt_id_req __user *req,
asmlinkage long sys_listmount(const struct mnt_id_req __user *req,
u64 __user *mnt_ids, size_t nr_mnt_ids,
unsigned int flags);
+asmlinkage long sys_listns(const struct ns_id_req __user *req,
+ u64 __user *ns_ids, size_t nr_ns_ids,
+ unsigned int flags);
asmlinkage long sys_truncate(const char __user *path, long length);
asmlinkage long sys_ftruncate(unsigned int fd, off_t length);
#if BITS_PER_LONG == 32
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 9a9aebbf96b9..9c3be157397e 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -166,13 +166,13 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns,
ns->rlimit_max[type] = max <= LONG_MAX ? max : LONG_MAX;
}
-#ifdef CONFIG_USER_NS
-
static inline struct user_namespace *to_user_ns(struct ns_common *ns)
{
return container_of(ns, struct user_namespace, ns);
}
+#ifdef CONFIG_USER_NS
+
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
{
if (ns)
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 04e0077fb4c9..942370b3f5d2 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -857,9 +857,11 @@ __SYSCALL(__NR_open_tree_attr, sys_open_tree_attr)
__SYSCALL(__NR_file_getattr, sys_file_getattr)
#define __NR_file_setattr 469
__SYSCALL(__NR_file_setattr, sys_file_setattr)
+#define __NR_listns 470
+__SYSCALL(__NR_listns, sys_listns)
#undef __NR_syscalls
-#define __NR_syscalls 470
+#define __NR_syscalls 471
/*
* 32 bit systems traditionally used different
diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
index e098759ec917..a25e38d1c874 100644
--- a/include/uapi/linux/nsfs.h
+++ b/include/uapi/linux/nsfs.h
@@ -67,4 +67,62 @@ struct nsfs_file_handle {
#define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */
#define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */
+enum init_ns_id {
+ IPC_NS_INIT_ID = 1ULL,
+ UTS_NS_INIT_ID = 2ULL,
+ USER_NS_INIT_ID = 3ULL,
+ PID_NS_INIT_ID = 4ULL,
+ CGROUP_NS_INIT_ID = 5ULL,
+ TIME_NS_INIT_ID = 6ULL,
+ NET_NS_INIT_ID = 7ULL,
+ MNT_NS_INIT_ID = 8ULL,
+#ifdef __KERNEL__
+ NS_LAST_INIT_ID = MNT_NS_INIT_ID,
+#endif
+};
+
+enum ns_type {
+ TIME_NS = (1ULL << 7), /* CLONE_NEWTIME */
+ MNT_NS = (1ULL << 17), /* CLONE_NEWNS */
+ CGROUP_NS = (1ULL << 25), /* CLONE_NEWCGROUP */
+ UTS_NS = (1ULL << 26), /* CLONE_NEWUTS */
+ IPC_NS = (1ULL << 27), /* CLONE_NEWIPC */
+ USER_NS = (1ULL << 28), /* CLONE_NEWUSER */
+ PID_NS = (1ULL << 29), /* CLONE_NEWPID */
+ NET_NS = (1ULL << 30), /* CLONE_NEWNET */
+};
+
+/**
+ * struct ns_id_req - namespace ID request structure
+ * @size: size of this structure
+ * @spare: reserved for future use
+ * @filter: filter mask
+ * @ns_id: last namespace id
+ * @user_ns_id: owning user namespace ID
+ *
+ * Structure for passing namespace ID and miscellaneous parameters to
+ * statns(2) and listns(2).
+ *
+ * For statns(2) @param represents the request mask.
+ * For listns(2) @param represents the last listed mount id (or zero).
+ */
+struct ns_id_req {
+ __u32 size;
+ __u32 spare;
+ __u64 ns_id;
+ struct /* listns */ {
+ __u32 ns_type;
+ __u32 spare2;
+ __u64 user_ns_id;
+ };
+};
+
+/*
+ * Special @user_ns_id value that can be passed to listns()
+ */
+#define LISTNS_CURRENT_USER 0xffffffffffffffff /* Caller's userns */
+
+/* List of all ns_id_req versions. */
+#define NS_ID_REQ_SIZE_VER0 32 /* sizeof first published struct */
+
#endif /* __LINUX_NSFS_H */