summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-01-20 10:44:51 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2025-01-20 10:44:51 -0800
commit100ceb4817a2ac650e29f107cf97161ce3e2289a (patch)
tree7bb96d686ec399553dc4ab8941bed5578f7c47f5
parent1a89a6924b581884b1b54bcd3ea790b3668be2e0 (diff)
parent68e6b7d98bc64bbf1a54d963ca85111432f3a0b4 (diff)
Merge tag 'vfs-6.14-rc1.mount.v2' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs mount updates from Christian Brauner: - Add a mountinfo program to demonstrate statmount()/listmount() Add a new "mountinfo" sample userland program that demonstrates how to use statmount() and listmount() to get at the same info that /proc/pid/mountinfo provides - Remove pointless nospec.h include - Prepend statmount.mnt_opts string with security_sb_mnt_opts() Currently these mount options aren't accessible via statmount() - Add new mount namespaces to mount namespace rbtree outside of the namespace semaphore - Lockless mount namespace lookup Currently we take the read lock when looking for a mount namespace to list mounts in. We can make this lockless. The simple search case can just use a sequence counter to detect concurrent changes to the rbtree For walking the list of mount namespaces sequentially via nsfs we keep a separate rcu list as rb_prev() and rb_next() aren't usable safely with rcu. Currently there is no primitive for retrieving the previous list member. To do this we need a new deletion primitive that doesn't poison the prev pointer and a corresponding retrieval helper Since creating mount namespaces is a relatively rare event compared with querying mounts in a foreign mount namespace this is worth it. Once libmount and systemd pick up this mechanism to list mounts in foreign mount namespaces this will be used very frequently - Add extended selftests for lockless mount namespace iteration - Add a sample program to list all mounts on the system, i.e., in all mount namespaces - Improve mount namespace iteration performance Make finding the last or first mount to start iterating the mount namespace from an O(1) operation and add selftests for iterating the mount table starting from the first and last mount - Use an xarray for the old mount id While the ida does use the xarray internally we can use it explicitly which allows us to increment the unique mount id under the xa lock. This allows us to remove the atomic as we're now allocating both ids in one go - Use a shared header for vfs sample programs - Fix build warnings for new sample program to list all mounts * tag 'vfs-6.14-rc1.mount.v2' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: samples/vfs: fix build warnings samples/vfs: use shared header samples/vfs/mountinfo: Use __u64 instead of uint64_t fs: remove useless lockdep assertion fs: use xarray for old mount id selftests: add listmount() iteration tests fs: cache first and last mount samples: add test-list-all-mounts selftests: remove unneeded include selftests: add tests for mntns iteration seltests: move nsfs into filesystems subfolder fs: simplify rwlock to spinlock fs: lockless mntns lookup for nsfs rculist: add list_bidir_{del,prev}_rcu() fs: lockless mntns rbtree lookup fs: add mount namespace to rbtree late fs: prepend statmount.mnt_opts string with security_sb_mnt_opts() mount: remove inlude/nospec.h include samples: add a mountinfo program to demonstrate statmount()/listmount()
-rw-r--r--fs/mount.h31
-rw-r--r--fs/namespace.c200
-rw-r--r--fs/nsfs.c5
-rw-r--r--include/linux/rculist.h44
-rw-r--r--samples/vfs/.gitignore2
-rw-r--r--samples/vfs/Makefile2
-rw-r--r--samples/vfs/mountinfo.c274
-rw-r--r--samples/vfs/samples-vfs.h241
-rw-r--r--samples/vfs/test-list-all-mounts.c150
-rw-r--r--tools/testing/selftests/filesystems/nsfs/.gitignore (renamed from tools/testing/selftests/nsfs/.gitignore)1
-rw-r--r--tools/testing/selftests/filesystems/nsfs/Makefile (renamed from tools/testing/selftests/nsfs/Makefile)4
-rw-r--r--tools/testing/selftests/filesystems/nsfs/config (renamed from tools/testing/selftests/nsfs/config)0
-rw-r--r--tools/testing/selftests/filesystems/nsfs/iterate_mntns.c149
-rw-r--r--tools/testing/selftests/filesystems/nsfs/owner.c (renamed from tools/testing/selftests/nsfs/owner.c)0
-rw-r--r--tools/testing/selftests/filesystems/nsfs/pidns.c (renamed from tools/testing/selftests/nsfs/pidns.c)0
-rw-r--r--tools/testing/selftests/filesystems/statmount/Makefile2
-rw-r--r--tools/testing/selftests/filesystems/statmount/listmount_test.c66
-rw-r--r--tools/testing/selftests/pidfd/pidfd.h1
18 files changed, 1075 insertions, 97 deletions
diff --git a/fs/mount.h b/fs/mount.h
index 179f690a0c72..ffb613cdfeee 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -8,15 +8,23 @@
struct mnt_namespace {
struct ns_common ns;
struct mount * root;
- struct rb_root mounts; /* Protected by namespace_sem */
+ struct {
+ struct rb_root mounts; /* Protected by namespace_sem */
+ struct rb_node *mnt_last_node; /* last (rightmost) mount in the rbtree */
+ struct rb_node *mnt_first_node; /* first (leftmost) mount in the rbtree */
+ };
struct user_namespace *user_ns;
struct ucounts *ucounts;
u64 seq; /* Sequence number to prevent loops */
- wait_queue_head_t poll;
+ union {
+ wait_queue_head_t poll;
+ struct rcu_head mnt_ns_rcu;
+ };
u64 event;
unsigned int nr_mounts; /* # of mounts in the namespace */
unsigned int pending_mounts;
struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
+ struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */
refcount_t passive; /* number references not pinning @mounts */
} __randomize_layout;
@@ -150,22 +158,21 @@ static inline bool mnt_ns_attached(const struct mount *mnt)
static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
{
+ struct mnt_namespace *ns = mnt->mnt_ns;
WARN_ON(!mnt_ns_attached(mnt));
- rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts);
+ if (ns->mnt_last_node == &mnt->mnt_node)
+ ns->mnt_last_node = rb_prev(&mnt->mnt_node);
+ if (ns->mnt_first_node == &mnt->mnt_node)
+ ns->mnt_first_node = rb_next(&mnt->mnt_node);
+ rb_erase(&mnt->mnt_node, &ns->mounts);
RB_CLEAR_NODE(&mnt->mnt_node);
list_add_tail(&mnt->mnt_list, dt_list);
}
bool has_locked_children(struct mount *mnt, struct dentry *dentry);
-struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mnt_ns, bool previous);
-static inline struct mnt_namespace *lookup_next_mnt_ns(struct mnt_namespace *mntns)
-{
- return __lookup_next_mnt_ns(mntns, false);
-}
-static inline struct mnt_namespace *lookup_prev_mnt_ns(struct mnt_namespace *mntns)
-{
- return __lookup_next_mnt_ns(mntns, true);
-}
+struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mnt_ns,
+ bool previous);
+
static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
{
return container_of(ns, struct mnt_namespace, ns);
diff --git a/fs/namespace.c b/fs/namespace.c
index 64deda6f5b2c..4013fbac354a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -33,7 +33,6 @@
#include <linux/shmem_fs.h>
#include <linux/mnt_idmapping.h>
#include <linux/pidfs.h>
-#include <linux/nospec.h>
#include "pnode.h"
#include "internal.h"
@@ -67,12 +66,12 @@ static int __init set_mphash_entries(char *str)
__setup("mphash_entries=", set_mphash_entries);
static u64 event;
-static DEFINE_IDA(mnt_id_ida);
+static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC);
static DEFINE_IDA(mnt_group_ida);
/* Don't allow confusion with old 32bit mount ID */
#define MNT_UNIQUE_ID_OFFSET (1ULL << 31)
-static atomic64_t mnt_id_ctr = ATOMIC64_INIT(MNT_UNIQUE_ID_OFFSET);
+static u64 mnt_id_ctr = MNT_UNIQUE_ID_OFFSET;
static struct hlist_head *mount_hashtable __ro_after_init;
static struct hlist_head *mountpoint_hashtable __ro_after_init;
@@ -80,8 +79,10 @@ static struct kmem_cache *mnt_cache __ro_after_init;
static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
-static DEFINE_RWLOCK(mnt_ns_tree_lock);
+static DEFINE_SEQLOCK(mnt_ns_tree_lock);
+
static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
+static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */
struct mount_kattr {
unsigned int attr_set;
@@ -107,17 +108,6 @@ EXPORT_SYMBOL_GPL(fs_kobj);
*/
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
-static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
-{
- u64 seq_b = ns->seq;
-
- if (seq < seq_b)
- return -1;
- if (seq > seq_b)
- return 1;
- return 0;
-}
-
static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
{
if (!node)
@@ -125,25 +115,52 @@ static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
}
-static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
+static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
{
struct mnt_namespace *ns_a = node_to_mnt_ns(a);
struct mnt_namespace *ns_b = node_to_mnt_ns(b);
u64 seq_a = ns_a->seq;
+ u64 seq_b = ns_b->seq;
- return mnt_ns_cmp(seq_a, ns_b) < 0;
+ if (seq_a < seq_b)
+ return -1;
+ if (seq_a > seq_b)
+ return 1;
+ return 0;
+}
+
+static inline void mnt_ns_tree_write_lock(void)
+{
+ write_seqlock(&mnt_ns_tree_lock);
+}
+
+static inline void mnt_ns_tree_write_unlock(void)
+{
+ write_sequnlock(&mnt_ns_tree_lock);
}
static void mnt_ns_tree_add(struct mnt_namespace *ns)
{
- guard(write_lock)(&mnt_ns_tree_lock);
- rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
+ struct rb_node *node, *prev;
+
+ mnt_ns_tree_write_lock();
+ node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
+ /*
+ * If there's no previous entry simply add it after the
+ * head and if there is add it after the previous entry.
+ */
+ prev = rb_prev(&ns->mnt_ns_tree_node);
+ if (!prev)
+ list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list);
+ else
+ list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list);
+ mnt_ns_tree_write_unlock();
+
+ WARN_ON_ONCE(node);
}
static void mnt_ns_release(struct mnt_namespace *ns)
{
- lockdep_assert_not_held(&mnt_ns_tree_lock);
-
/* keep alive for {list,stat}mount() */
if (refcount_dec_and_test(&ns->passive)) {
put_user_ns(ns->user_ns);
@@ -152,41 +169,34 @@ static void mnt_ns_release(struct mnt_namespace *ns)
}
DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
+static void mnt_ns_release_rcu(struct rcu_head *rcu)
+{
+ mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu));
+}
+
static void mnt_ns_tree_remove(struct mnt_namespace *ns)
{
/* remove from global mount namespace list */
if (!is_anon_ns(ns)) {
- guard(write_lock)(&mnt_ns_tree_lock);
+ mnt_ns_tree_write_lock();
rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
+ list_bidir_del_rcu(&ns->mnt_ns_list);
+ mnt_ns_tree_write_unlock();
}
- mnt_ns_release(ns);
+ call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
}
-/*
- * Returns the mount namespace which either has the specified id, or has the
- * next smallest id afer the specified one.
- */
-static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
+static int mnt_ns_find(const void *key, const struct rb_node *node)
{
- struct rb_node *node = mnt_ns_tree.rb_node;
- struct mnt_namespace *ret = NULL;
-
- lockdep_assert_held(&mnt_ns_tree_lock);
-
- while (node) {
- struct mnt_namespace *n = node_to_mnt_ns(node);
+ const u64 mnt_ns_id = *(u64 *)key;
+ const struct mnt_namespace *ns = node_to_mnt_ns(node);
- if (mnt_ns_id <= n->seq) {
- ret = node_to_mnt_ns(node);
- if (mnt_ns_id == n->seq)
- break;
- node = node->rb_left;
- } else {
- node = node->rb_right;
- }
- }
- return ret;
+ if (mnt_ns_id < ns->seq)
+ return -1;
+ if (mnt_ns_id > ns->seq)
+ return 1;
+ return 0;
}
/*
@@ -196,18 +206,37 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
* namespace the @namespace_sem must first be acquired. If the namespace has
* already shut down before acquiring @namespace_sem, {list,stat}mount() will
* see that the mount rbtree of the namespace is empty.
+ *
+ * Note the lookup is lockless protected by a sequence counter. We only
+ * need to guard against false negatives as false positives aren't
+ * possible. So if we didn't find a mount namespace and the sequence
+ * counter has changed we need to retry. If the sequence counter is
+ * still the same we know the search actually failed.
*/
static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
{
- struct mnt_namespace *ns;
+ struct mnt_namespace *ns;
+ struct rb_node *node;
+ unsigned int seq;
+
+ guard(rcu)();
+ do {
+ seq = read_seqbegin(&mnt_ns_tree_lock);
+ node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
+ if (node)
+ break;
+ } while (read_seqretry(&mnt_ns_tree_lock, seq));
- guard(read_lock)(&mnt_ns_tree_lock);
- ns = mnt_ns_find_id_at(mnt_ns_id);
- if (!ns || ns->seq != mnt_ns_id)
- return NULL;
+ if (!node)
+ return NULL;
- refcount_inc(&ns->passive);
- return ns;
+ /*
+ * The last reference count is put with RCU delay so we can
+ * unconditonally acquire a reference here.
+ */
+ ns = node_to_mnt_ns(node);
+ refcount_inc(&ns->passive);
+ return ns;
}
static inline void lock_mount_hash(void)
@@ -237,18 +266,19 @@ static inline struct hlist_head *mp_hash(struct dentry *dentry)
static int mnt_alloc_id(struct mount *mnt)
{
- int res = ida_alloc(&mnt_id_ida, GFP_KERNEL);
+ int res;
- if (res < 0)
- return res;
- mnt->mnt_id = res;
- mnt->mnt_id_unique = atomic64_inc_return(&mnt_id_ctr);
- return 0;
+ xa_lock(&mnt_id_xa);
+ res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, XA_LIMIT(1, INT_MAX), GFP_KERNEL);
+ if (!res)
+ mnt->mnt_id_unique = ++mnt_id_ctr;
+ xa_unlock(&mnt_id_xa);
+ return res;
}
static void mnt_free_id(struct mount *mnt)
{
- ida_free(&mnt_id_ida, mnt->mnt_id);
+ xa_erase(&mnt_id_xa, mnt->mnt_id);
}
/*
@@ -1125,16 +1155,25 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
{
struct rb_node **link = &ns->mounts.rb_node;
struct rb_node *parent = NULL;
+ bool mnt_first_node = true, mnt_last_node = true;
WARN_ON(mnt_ns_attached(mnt));
mnt->mnt_ns = ns;
while (*link) {
parent = *link;
- if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique)
+ if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) {
link = &parent->rb_left;
- else
+ mnt_last_node = false;
+ } else {
link = &parent->rb_right;
+ mnt_first_node = false;
+ }
}
+
+ if (mnt_last_node)
+ ns->mnt_last_node = &mnt->mnt_node;
+ if (mnt_first_node)
+ ns->mnt_first_node = &mnt->mnt_node;
rb_link_node(&mnt->mnt_node, parent, link);
rb_insert_color(&mnt->mnt_node, &ns->mounts);
}
@@ -2070,30 +2109,34 @@ struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
return &mnt->ns;
}
-struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mntns, bool previous)
+struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous)
{
- guard(read_lock)(&mnt_ns_tree_lock);
+ guard(rcu)();
+
for (;;) {
- struct rb_node *node;
+ struct list_head *list;
if (previous)
- node = rb_prev(&mntns->mnt_ns_tree_node);
+ list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list));
else
- node = rb_next(&mntns->mnt_ns_tree_node);
- if (!node)
+ list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list));
+ if (list_is_head(list, &mnt_ns_list))
return ERR_PTR(-ENOENT);
- mntns = node_to_mnt_ns(node);
- node = &mntns->mnt_ns_tree_node;
+ mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list);
+ /*
+ * The last passive reference count is put with RCU
+ * delay so accessing the mount namespace is not just
+ * safe but all relevant members are still valid.
+ */
if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN))
continue;
/*
- * Holding mnt_ns_tree_lock prevents the mount namespace from
- * being freed but it may well be on it's deathbed. We want an
- * active reference, not just a passive one here as we're
- * persisting the mount namespace.
+ * We need an active reference count as we're persisting
+ * the mount namespace and it might already be on its
+ * deathbed.
*/
if (!refcount_inc_not_zero(&mntns->ns.count))
continue;
@@ -3915,6 +3958,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
refcount_set(&new_ns->ns.count, 1);
refcount_set(&new_ns->passive, 1);
new_ns->mounts = RB_ROOT;
+ INIT_LIST_HEAD(&new_ns->mnt_ns_list);
RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
init_waitqueue_head(&new_ns->poll);
new_ns->user_ns = get_user_ns(user_ns);
@@ -3994,7 +4038,6 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
while (p->mnt.mnt_root != q->mnt.mnt_root)
p = next_mnt(skip_mnt_tree(p), old);
}
- mnt_ns_tree_add(new_ns);
namespace_unlock();
if (rootmnt)
@@ -4002,6 +4045,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
if (pwdmnt)
mntput(pwdmnt);
+ mnt_ns_tree_add(new_ns);
return new_ns;
}
@@ -5048,6 +5092,10 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
if (sb->s_op->show_options) {
size_t start = seq->count;
+ err = security_sb_show_options(seq, sb);
+ if (err)
+ return err;
+
err = sb->s_op->show_options(seq, mnt->mnt_root);
if (err)
return err;
@@ -5535,9 +5583,9 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,
if (!last_mnt_id) {
if (reverse)
- first = node_to_mount(rb_last(&ns->mounts));
+ first = node_to_mount(ns->mnt_last_node);
else
- first = node_to_mount(rb_first(&ns->mounts));
+ first = node_to_mount(ns->mnt_first_node);
} else {
if (reverse)
first = mnt_find_id_at_reverse(ns, last_mnt_id - 1);
diff --git a/fs/nsfs.c b/fs/nsfs.c
index c675fc40ce2d..663f8656158d 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -274,10 +274,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
if (usize < MNT_NS_INFO_SIZE_VER0)
return -EINVAL;
- if (previous)
- mnt_ns = lookup_prev_mnt_ns(to_mnt_ns(ns));
- else
- mnt_ns = lookup_next_mnt_ns(to_mnt_ns(ns));
+ mnt_ns = get_sequential_mnt_ns(to_mnt_ns(ns), previous);
if (IS_ERR(mnt_ns))
return PTR_ERR(mnt_ns);
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 14dfa6008467..1b11926ddd47 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -30,6 +30,17 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
* way, we must not access it directly
*/
#define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next)))
+/*
+ * Return the ->prev pointer of a list_head in an rcu safe way. Don't
+ * access it directly.
+ *
+ * Any list traversed with list_bidir_prev_rcu() must never use
+ * list_del_rcu(). Doing so will poison the ->prev pointer that
+ * list_bidir_prev_rcu() relies on, which will result in segfaults.
+ * To prevent these segfaults, use list_bidir_del_rcu() instead
+ * of list_del_rcu().
+ */
+#define list_bidir_prev_rcu(list) (*((struct list_head __rcu **)(&(list)->prev)))
/**
* list_tail_rcu - returns the prev pointer of the head of the list
@@ -159,6 +170,39 @@ static inline void list_del_rcu(struct list_head *entry)
}
/**
+ * list_bidir_del_rcu - deletes entry from list without re-initialization
+ * @entry: the element to delete from the list.
+ *
+ * In contrast to list_del_rcu() doesn't poison the prev pointer thus
+ * allowing backwards traversal via list_bidir_prev_rcu().
+ *
+ * Note: list_empty() on entry does not return true after this because
+ * the entry is in a special undefined state that permits RCU-based
+ * lockfree reverse traversal. In particular this means that we can not
+ * poison the forward and backwards pointers that may still be used for
+ * walking the list.
+ *
+ * The caller must take whatever precautions are necessary (such as
+ * holding appropriate locks) to avoid racing with another list-mutation
+ * primitive, such as list_bidir_del_rcu() or list_add_rcu(), running on
+ * this same list. However, it is perfectly legal to run concurrently
+ * with the _rcu list-traversal primitives, such as
+ * list_for_each_entry_rcu().
+ *
+ * Note that list_del_rcu() and list_bidir_del_rcu() must not be used on
+ * the same list.
+ *
+ * Note that the caller is not permitted to immediately free
+ * the newly deleted entry. Instead, either synchronize_rcu()
+ * or call_rcu() must be used to defer freeing until an RCU
+ * grace period has elapsed.
+ */
+static inline void list_bidir_del_rcu(struct list_head *entry)
+{
+ __list_del_entry(entry);
+}
+
+/**
* hlist_del_init_rcu - deletes entry from hash list with re-initialization
* @n: the element to delete from the hash list.
*
diff --git a/samples/vfs/.gitignore b/samples/vfs/.gitignore
index 79212d91285b..8708341bc082 100644
--- a/samples/vfs/.gitignore
+++ b/samples/vfs/.gitignore
@@ -1,3 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
/test-fsmount
+/test-list-all-mounts
/test-statx
+/mountinfo
diff --git a/samples/vfs/Makefile b/samples/vfs/Makefile
index 6377a678134a..6554b73a75c8 100644
--- a/samples/vfs/Makefile
+++ b/samples/vfs/Makefile
@@ -1,4 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
-userprogs-always-y += test-fsmount test-statx
+userprogs-always-y += test-fsmount test-statx mountinfo test-list-all-mounts
userccflags += -I usr/include
diff --git a/samples/vfs/mountinfo.c b/samples/vfs/mountinfo.c
new file mode 100644
index 000000000000..bc78275cac69
--- /dev/null
+++ b/samples/vfs/mountinfo.c
@@ -0,0 +1,274 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/*
+ * Use pidfds, nsfds, listmount() and statmount() mimic the
+ * contents of /proc/self/mountinfo.
+ */
+#define _GNU_SOURCE
+#define __SANE_USERSPACE_TYPES__
+#include <stdio.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <alloca.h>
+#include <getopt.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <errno.h>
+
+#include "samples-vfs.h"
+
+/* max mounts per listmount call */
+#define MAXMOUNTS 1024
+
+/* size of struct statmount (including trailing string buffer) */
+#define STATMOUNT_BUFSIZE 4096
+
+static bool ext_format;
+
+#ifndef __NR_pidfd_open
+#define __NR_pidfd_open -1
+#endif
+
+/*
+ * There are no bindings in glibc for listmount() and statmount() (yet),
+ * make our own here.
+ */
+static int statmount(__u64 mnt_id, __u64 mnt_ns_id, __u64 mask,
+ struct statmount *buf, size_t bufsize,
+ unsigned int flags)
+{
+ struct mnt_id_req req = {
+ .size = MNT_ID_REQ_SIZE_VER0,
+ .mnt_id = mnt_id,
+ .param = mask,
+ };
+
+ if (mnt_ns_id) {
+ req.size = MNT_ID_REQ_SIZE_VER1;
+ req.mnt_ns_id = mnt_ns_id;
+ }
+
+ return syscall(__NR_statmount, &req, buf, bufsize, flags);
+}
+
+static ssize_t listmount(__u64 mnt_id, __u64 mnt_ns_id, __u64 last_mnt_id,
+ __u64 list[], size_t num, unsigned int flags)
+{
+ struct mnt_id_req req = {
+ .size = MNT_ID_REQ_SIZE_VER0,
+ .mnt_id = mnt_id,
+ .param = last_mnt_id,
+ };
+
+ if (mnt_ns_id) {
+ req.size = MNT_ID_REQ_SIZE_VER1;
+ req.mnt_ns_id = mnt_ns_id;
+ }
+
+ return syscall(__NR_listmount, &req, list, num, flags);
+}
+
+static void show_mnt_attrs(__u64 flags)
+{
+ printf("%s", flags & MOUNT_ATTR_RDONLY ? "ro" : "rw");
+
+ if (flags & MOUNT_ATTR_NOSUID)
+ printf(",nosuid");
+ if (flags & MOUNT_ATTR_NODEV)
+ printf(",nodev");
+ if (flags & MOUNT_ATTR_NOEXEC)
+ printf(",noexec");
+
+ switch (flags & MOUNT_ATTR__ATIME) {
+ case MOUNT_ATTR_RELATIME:
+ printf(",relatime");
+ break;
+ case MOUNT_ATTR_NOATIME:
+ printf(",noatime");
+ break;
+ case MOUNT_ATTR_STRICTATIME:
+ /* print nothing */
+ break;
+ }
+
+ if (flags & MOUNT_ATTR_NODIRATIME)
+ printf(",nodiratime");
+ if (flags & MOUNT_ATTR_NOSYMFOLLOW)
+ printf(",nosymfollow");
+ if (flags & MOUNT_ATTR_IDMAP)
+ printf(",idmapped");
+}
+
+static void show_propagation(struct statmount *sm)
+{
+ if (sm->mnt_propagation & MS_SHARED)
+ printf(" shared:%llu", sm->mnt_peer_group);
+ if (sm->mnt_propagation & MS_SLAVE) {
+ printf(" master:%llu", sm->mnt_master);
+ if (sm->propagate_from && sm->propagate_from != sm->mnt_master)
+ printf(" propagate_from:%llu", sm->propagate_from);
+ }
+ if (sm->mnt_propagation & MS_UNBINDABLE)
+ printf(" unbindable");
+}
+
+static void show_sb_flags(__u64 flags)
+{
+ printf("%s", flags & MS_RDONLY ? "ro" : "rw");
+ if (flags & MS_SYNCHRONOUS)
+ printf(",sync");
+ if (flags & MS_DIRSYNC)
+ printf(",dirsync");
+ if (flags & MS_MANDLOCK)
+ printf(",mand");
+ if (flags & MS_LAZYTIME)
+ printf(",lazytime");
+}
+
+static int dump_mountinfo(__u64 mnt_id, __u64 mnt_ns_id)
+{
+ int ret;
+ struct statmount *buf = alloca(STATMOUNT_BUFSIZE);
+ const __u64 mask = STATMOUNT_SB_BASIC | STATMOUNT_MNT_BASIC |
+ STATMOUNT_PROPAGATE_FROM | STATMOUNT_FS_TYPE |
+ STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT |
+ STATMOUNT_MNT_OPTS | STATMOUNT_FS_SUBTYPE |
+ STATMOUNT_SB_SOURCE;
+
+ ret = statmount(mnt_id, mnt_ns_id, mask, buf, STATMOUNT_BUFSIZE, 0);
+ if (ret < 0) {
+ perror("statmount");
+ return 1;
+ }
+
+ if (ext_format)
+ printf("0x%llx 0x%llx 0x%llx ", mnt_ns_id, mnt_id, buf->mnt_parent_id);
+
+ printf("%u %u %u:%u %s %s ", buf->mnt_id_old, buf->mnt_parent_id_old,
+ buf->sb_dev_major, buf->sb_dev_minor,
+ &buf->str[buf->mnt_root],
+ &buf->str[buf->mnt_point]);
+ show_mnt_attrs(buf->mnt_attr);
+ show_propagation(buf);
+
+ printf(" - %s", &buf->str[buf->fs_type]);
+ if (buf->mask & STATMOUNT_FS_SUBTYPE)
+ printf(".%s", &buf->str[buf->fs_subtype]);
+ if (buf->mask & STATMOUNT_SB_SOURCE)
+ printf(" %s ", &buf->str[buf->sb_source]);
+ else
+ printf(" :none ");
+
+ show_sb_flags(buf->sb_flags);
+ if (buf->mask & STATMOUNT_MNT_OPTS)
+ printf(",%s", &buf->str[buf->mnt_opts]);
+ printf("\n");
+ return 0;
+}
+
+static int dump_mounts(__u64 mnt_ns_id)
+{
+ __u64 mntid[MAXMOUNTS];
+ __u64 last_mnt_id = 0;
+ ssize_t count;
+ int i;
+
+ /*
+ * Get a list of all mntids in mnt_ns_id. If it returns MAXMOUNTS
+ * mounts, then go again until we get everything.
+ */
+ do {
+ count = listmount(LSMT_ROOT, mnt_ns_id, last_mnt_id, mntid, MAXMOUNTS, 0);
+ if (count < 0 || count > MAXMOUNTS) {
+ errno = count < 0 ? errno : count;
+ perror("listmount");
+ return 1;
+ }
+
+ /* Walk the returned mntids and print info about each */
+ for (i = 0; i < count; ++i) {
+ int ret = dump_mountinfo(mntid[i], mnt_ns_id);
+
+ if (ret != 0)
+ return ret;
+ }
+ /* Set up last_mnt_id to pick up where we left off */
+ last_mnt_id = mntid[count - 1];
+ } while (count == MAXMOUNTS);
+ return 0;
+}
+
+static void usage(const char * const prog)
+{
+ printf("Usage:\n");
+ printf("%s [-e] [-p pid] [-r] [-h]\n", prog);
+ printf(" -e: extended format\n");
+ printf(" -h: print usage message\n");
+ printf(" -p: get mount namespace from given pid\n");
+ printf(" -r: recursively print all mounts in all child namespaces\n");
+}
+
+int main(int argc, char * const *argv)
+{
+ struct mnt_ns_info mni = { .size = MNT_NS_INFO_SIZE_VER0 };
+ int pidfd, mntns, ret, opt;
+ pid_t pid = getpid();
+ bool recursive = false;
+
+ while ((opt = getopt(argc, argv, "ehp:r")) != -1) {
+ switch (opt) {
+ case 'e':
+ ext_format = true;
+ break;
+ case 'h':
+ usage(argv[0]);
+ return 0;
+ case 'p':
+ pid = atoi(optarg);
+ break;
+ case 'r':
+ recursive = true;
+ break;
+ }
+ }
+
+ /* Get a pidfd for pid */
+ pidfd = syscall(__NR_pidfd_open, pid, 0);
+ if (pidfd < 0) {
+ perror("pidfd_open");
+ return 1;
+ }
+
+ /* Get the mnt namespace for pidfd */
+ mntns = ioctl(pidfd, PIDFD_GET_MNT_NAMESPACE, NULL);
+ if (mntns < 0) {
+ perror("PIDFD_GET_MNT_NAMESPACE");
+ return 1;
+ }
+ close(pidfd);
+
+ /* get info about mntns. In particular, the mnt_ns_id */
+ ret = ioctl(mntns, NS_MNT_GET_INFO, &mni);
+ if (ret < 0) {
+ perror("NS_MNT_GET_INFO");
+ return 1;
+ }
+
+ do {
+ int ret;
+
+ ret = dump_mounts(mni.mnt_ns_id);
+ if (ret)
+ return ret;
+
+ if (!recursive)
+ break;
+
+ /* get the next mntns (and overwrite the old mount ns info) */
+ ret = ioctl(mntns, NS_MNT_GET_NEXT, &mni);
+ close(mntns);
+ mntns = ret;
+ } while (mntns >= 0);
+
+ return 0;
+}
diff --git a/samples/vfs/samples-vfs.h b/samples/vfs/samples-vfs.h
new file mode 100644
index 000000000000..103e1e7c4cec
--- /dev/null
+++ b/samples/vfs/samples-vfs.h
@@ -0,0 +1,241 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __SAMPLES_VFS_H
+#define __SAMPLES_VFS_H
+
+#include <errno.h>
+#include <linux/types.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+
+#define die_errno(format, ...) \
+ do { \
+ fprintf(stderr, "%m | %s: %d: %s: " format "\n", __FILE__, \
+ __LINE__, __func__, ##__VA_ARGS__); \
+ exit(EXIT_FAILURE); \
+ } while (0)
+
+struct statmount {
+ __u32 size; /* Total size, including strings */
+ __u32 mnt_opts; /* [str] Options (comma separated, escaped) */
+ __u64 mask; /* What results were written */
+ __u32 sb_dev_major; /* Device ID */
+ __u32 sb_dev_minor;
+ __u64 sb_magic; /* ..._SUPER_MAGIC */
+ __u32 sb_flags; /* SB_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */
+ __u32 fs_type; /* [str] Filesystem type */
+ __u64 mnt_id; /* Unique ID of mount */
+ __u64 mnt_parent_id; /* Unique ID of parent (for root == mnt_id) */
+ __u32 mnt_id_old; /* Reused IDs used in proc/.../mountinfo */
+ __u32 mnt_parent_id_old;
+ __u64 mnt_attr; /* MOUNT_ATTR_... */
+ __u64 mnt_propagation; /* MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */
+ __u64 mnt_peer_group; /* ID of shared peer group */
+ __u64 mnt_master; /* Mount receives propagation from this ID */
+ __u64 propagate_from; /* Propagation from in current namespace */
+ __u32 mnt_root; /* [str] Root of mount relative to root of fs */
+ __u32 mnt_point; /* [str] Mountpoint relative to current root */
+ __u64 mnt_ns_id; /* ID of the mount namespace */
+ __u32 fs_subtype; /* [str] Subtype of fs_type (if any) */
+ __u32 sb_source; /* [str] Source string of the mount */
+ __u32 opt_num; /* Number of fs options */
+ __u32 opt_array; /* [str] Array of nul terminated fs options */
+ __u32 opt_sec_num; /* Number of security options */
+ __u32 opt_sec_array; /* [str] Array of nul terminated security options */
+ __u64 __spare2[46];
+ char str[]; /* Variable size part containing strings */
+};
+
+struct mnt_id_req {
+ __u32 size;
+ __u32 spare;
+ __u64 mnt_id;
+ __u64 param;
+ __u64 mnt_ns_id;
+};
+
+#ifndef MNT_ID_REQ_SIZE_VER0
+#define MNT_ID_REQ_SIZE_VER0 24 /* sizeof first published struct */
+#endif
+
+#ifndef MNT_ID_REQ_SIZE_VER1
+#define MNT_ID_REQ_SIZE_VER1 32 /* sizeof second published struct */
+#endif
+
+/* Get the id for a mount namespace */
+#ifndef NS_GET_MNTNS_ID
+#define NS_GET_MNTNS_ID _IO(0xb7, 0x5)
+#endif
+
+struct mnt_ns_info {
+ __u32 size;
+ __u32 nr_mounts;
+ __u64 mnt_ns_id;
+};
+
+#ifndef MNT_NS_INFO_SIZE_VER0
+#define MNT_NS_INFO_SIZE_VER0 16 /* size of first published struct */
+#endif
+
+#ifndef NS_MNT_GET_INFO
+#define NS_MNT_GET_INFO _IOR(0xb7, 10, struct mnt_ns_info)
+#endif
+
+#ifndef NS_MNT_GET_NEXT
+#define NS_MNT_GET_NEXT _IOR(0xb7, 11, struct mnt_ns_info)
+#endif
+
+#ifndef NS_MNT_GET_PREV
+#define NS_MNT_GET_PREV _IOR(0xb7, 12, struct mnt_ns_info)
+#endif
+
+#ifndef PIDFD_GET_MNT_NAMESPACE
+#define PIDFD_GET_MNT_NAMESPACE _IO(0xFF, 3)
+#endif
+
+#ifndef __NR_listmount
+#define __NR_listmount 458
+#endif
+
+#ifndef __NR_statmount
+#define __NR_statmount 457
+#endif
+
+#ifndef LSMT_ROOT
+#define LSMT_ROOT 0xffffffffffffffff /* root mount */
+#endif
+
+/* @mask bits for statmount(2) */
+#ifndef STATMOUNT_SB_BASIC
+#define STATMOUNT_SB_BASIC 0x00000001U /* Want/got sb_... */
+#endif
+
+#ifndef STATMOUNT_MNT_BASIC
+#define STATMOUNT_MNT_BASIC 0x00000002U /* Want/got mnt_... */
+#endif
+
+#ifndef STATMOUNT_PROPAGATE_FROM
+#define STATMOUNT_PROPAGATE_FROM 0x00000004U /* Want/got propagate_from */
+#endif
+
+#ifndef STATMOUNT_MNT_ROOT
+#define STATMOUNT_MNT_ROOT 0x00000008U /* Want/got mnt_root */
+#endif
+
+#ifndef STATMOUNT_MNT_POINT
+#define STATMOUNT_MNT_POINT 0x00000010U /* Want/got mnt_point */
+#endif
+
+#ifndef STATMOUNT_FS_TYPE
+#define STATMOUNT_FS_TYPE 0x00000020U /* Want/got fs_type */
+#endif
+
+#ifndef STATMOUNT_MNT_NS_ID
+#define STATMOUNT_MNT_NS_ID 0x00000040U /* Want/got mnt_ns_id */
+#endif
+
+#ifndef STATMOUNT_MNT_OPTS
+#define STATMOUNT_MNT_OPTS 0x00000080U /* Want/got mnt_opts */
+#endif
+
+#ifndef STATMOUNT_FS_SUBTYPE
+#define STATMOUNT_FS_SUBTYPE 0x00000100U /* Want/got fs_subtype */
+#endif
+
+#ifndef STATMOUNT_SB_SOURCE
+#define STATMOUNT_SB_SOURCE 0x00000200U /* Want/got sb_source */
+#endif
+
+#ifndef STATMOUNT_OPT_ARRAY
+#define STATMOUNT_OPT_ARRAY 0x00000400U /* Want/got opt_... */
+#endif
+
+#ifndef STATMOUNT_OPT_SEC_ARRAY
+#define STATMOUNT_OPT_SEC_ARRAY 0x00000800U /* Want/got opt_sec... */
+#endif
+
+#ifndef STATX_MNT_ID_UNIQUE
+#define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */
+#endif
+
+#ifndef MOUNT_ATTR_RDONLY
+#define MOUNT_ATTR_RDONLY 0x00000001 /* Mount read-only */
+#endif
+
+#ifndef MOUNT_ATTR_NOSUID
+#define MOUNT_ATTR_NOSUID 0x00000002 /* Ignore suid and sgid bits */
+#endif
+
+#ifndef MOUNT_ATTR_NODEV
+#define MOUNT_ATTR_NODEV 0x00000004 /* Disallow access to device special files */
+#endif
+
+#ifndef MOUNT_ATTR_NOEXEC
+#define MOUNT_ATTR_NOEXEC 0x00000008 /* Disallow program execution */
+#endif
+
+#ifndef MOUNT_ATTR__ATIME
+#define MOUNT_ATTR__ATIME 0x00000070 /* Setting on how atime should be updated */
+#endif
+
+#ifndef MOUNT_ATTR_RELATIME
+#define MOUNT_ATTR_RELATIME 0x00000000 /* - Update atime relative to mtime/ctime. */
+#endif
+
+#ifndef MOUNT_ATTR_NOATIME
+#define MOUNT_ATTR_NOATIME 0x00000010 /* - Do not update access times. */
+#endif
+
+#ifndef MOUNT_ATTR_STRICTATIME
+#define MOUNT_ATTR_STRICTATIME 0x00000020 /* - Always perform atime updates */
+#endif
+
+#ifndef MOUNT_ATTR_NODIRATIME
+#define MOUNT_ATTR_NODIRATIME 0x00000080 /* Do not update directory access times */
+#endif
+
+#ifndef MOUNT_ATTR_IDMAP
+#define MOUNT_ATTR_IDMAP 0x00100000 /* Idmap mount to @userns_fd in struct mount_attr. */
+#endif
+
+#ifndef MOUNT_ATTR_NOSYMFOLLOW
+#define MOUNT_ATTR_NOSYMFOLLOW 0x00200000 /* Do not follow symlinks */
+#endif
+
+#ifndef MS_RDONLY
+#define MS_RDONLY 1 /* Mount read-only */
+#endif
+
+#ifndef MS_SYNCHRONOUS
+#define MS_SYNCHRONOUS 16 /* Writes are synced at once */
+#endif
+
+#ifndef MS_MANDLOCK
+#define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */
+#endif
+
+#ifndef MS_DIRSYNC
+#define MS_DIRSYNC 128 /* Directory modifications are synchronous */
+#endif
+
+#ifndef MS_UNBINDABLE
+#define MS_UNBINDABLE (1<<17) /* change to unbindable */
+#endif
+
+#ifndef MS_PRIVATE
+#define MS_PRIVATE (1<<18) /* change to private */
+#endif
+
+#ifndef MS_SLAVE
+#define MS_SLAVE (1<<19) /* change to slave */
+#endif
+
+#ifndef MS_SHARED
+#define MS_SHARED (1<<20) /* change to shared */
+#endif
+
+#ifndef MS_LAZYTIME
+#define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
+#endif
+
+#endif /* __SAMPLES_VFS_H */
diff --git a/samples/vfs/test-list-all-mounts.c b/samples/vfs/test-list-all-mounts.c
new file mode 100644
index 000000000000..1a02ea4593e3
--- /dev/null
+++ b/samples/vfs/test-list-all-mounts.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright (c) 2024 Christian Brauner <brauner@kernel.org>
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <limits.h>
+#include <linux/types.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#include "../../tools/testing/selftests/pidfd/pidfd.h"
+#include "samples-vfs.h"
+
+static int __statmount(__u64 mnt_id, __u64 mnt_ns_id, __u64 mask,
+ struct statmount *stmnt, size_t bufsize,
+ unsigned int flags)
+{
+ struct mnt_id_req req = {
+ .size = MNT_ID_REQ_SIZE_VER1,
+ .mnt_id = mnt_id,
+ .param = mask,
+ .mnt_ns_id = mnt_ns_id,
+ };
+
+ return syscall(__NR_statmount, &req, stmnt, bufsize, flags);
+}
+
+static struct statmount *sys_statmount(__u64 mnt_id, __u64 mnt_ns_id,
+ __u64 mask, unsigned int flags)
+{
+ size_t bufsize = 1 << 15;
+ struct statmount *stmnt = NULL, *tmp = NULL;
+ int ret;
+
+ for (;;) {
+ tmp = realloc(stmnt, bufsize);
+ if (!tmp)
+ goto out;
+
+ stmnt = tmp;
+ ret = __statmount(mnt_id, mnt_ns_id, mask, stmnt, bufsize, flags);
+ if (!ret)
+ return stmnt;
+
+ if (errno != EOVERFLOW)
+ goto out;
+
+ bufsize <<= 1;
+ if (bufsize >= UINT_MAX / 2)
+ goto out;
+ }
+
+out:
+ free(stmnt);
+ return NULL;
+}
+
+static ssize_t sys_listmount(__u64 mnt_id, __u64 last_mnt_id, __u64 mnt_ns_id,
+ __u64 list[], size_t num, unsigned int flags)
+{
+ struct mnt_id_req req = {
+ .size = MNT_ID_REQ_SIZE_VER1,
+ .mnt_id = mnt_id,
+ .param = last_mnt_id,
+ .mnt_ns_id = mnt_ns_id,
+ };
+
+ return syscall(__NR_listmount, &req, list, num, flags);
+}
+
+int main(int argc, char *argv[])
+{
+#define LISTMNT_BUFFER 10
+ __u64 list[LISTMNT_BUFFER], last_mnt_id = 0;
+ int ret, pidfd, fd_mntns;
+ struct mnt_ns_info info = {};
+
+ pidfd = sys_pidfd_open(getpid(), 0);
+ if (pidfd < 0)
+ die_errno("pidfd_open failed");
+
+ fd_mntns = ioctl(pidfd, PIDFD_GET_MNT_NAMESPACE, 0);
+ if (fd_mntns < 0)
+ die_errno("ioctl(PIDFD_GET_MNT_NAMESPACE) failed");
+
+ ret = ioctl(fd_mntns, NS_MNT_GET_INFO, &info);
+ if (ret < 0)
+ die_errno("ioctl(NS_GET_MNTNS_ID) failed");
+
+ printf("Listing %u mounts for mount namespace %" PRIu64 "\n",
+ info.nr_mounts, (uint64_t)info.mnt_ns_id);
+ for (;;) {
+ ssize_t nr_mounts;
+next:
+ nr_mounts = sys_listmount(LSMT_ROOT, last_mnt_id,
+ info.mnt_ns_id, list, LISTMNT_BUFFER,
+ 0);
+ if (nr_mounts <= 0) {
+ int fd_mntns_next;
+
+ printf("Finished listing %u mounts for mount namespace %" PRIu64 "\n\n",
+ info.nr_mounts, (uint64_t)info.mnt_ns_id);
+ fd_mntns_next = ioctl(fd_mntns, NS_MNT_GET_NEXT, &info);
+ if (fd_mntns_next < 0) {
+ if (errno == ENOENT) {
+ printf("Finished listing all mount namespaces\n");
+ exit(0);
+ }
+ die_errno("ioctl(NS_MNT_GET_NEXT) failed");
+ }
+ close(fd_mntns);
+ fd_mntns = fd_mntns_next;
+ last_mnt_id = 0;
+ printf("Listing %u mounts for mount namespace %" PRIu64 "\n",
+ info.nr_mounts, (uint64_t)info.mnt_ns_id);
+ goto next;
+ }
+
+ for (size_t cur = 0; cur < nr_mounts; cur++) {
+ struct statmount *stmnt;
+
+ last_mnt_id = list[cur];
+
+ stmnt = sys_statmount(last_mnt_id, info.mnt_ns_id,
+ STATMOUNT_SB_BASIC |
+ STATMOUNT_MNT_BASIC |
+ STATMOUNT_MNT_ROOT |
+ STATMOUNT_MNT_POINT |
+ STATMOUNT_MNT_NS_ID |
+ STATMOUNT_MNT_OPTS |
+ STATMOUNT_FS_TYPE, 0);
+ if (!stmnt) {
+ printf("Failed to statmount(%" PRIu64 ") in mount namespace(%" PRIu64 ")\n",
+ (uint64_t)last_mnt_id, (uint64_t)info.mnt_ns_id);
+ continue;
+ }
+
+ printf("mnt_id:\t\t%" PRIu64 "\nmnt_parent_id:\t%" PRIu64 "\nfs_type:\t%s\nmnt_root:\t%s\nmnt_point:\t%s\nmnt_opts:\t%s\n\n",
+ (uint64_t)stmnt->mnt_id,
+ (uint64_t)stmnt->mnt_parent_id,
+ stmnt->str + stmnt->fs_type,
+ stmnt->str + stmnt->mnt_root,
+ stmnt->str + stmnt->mnt_point,
+ stmnt->str + stmnt->mnt_opts);
+ free(stmnt);
+ }
+ }
+
+ exit(0);
+}
diff --git a/tools/testing/selftests/nsfs/.gitignore b/tools/testing/selftests/filesystems/nsfs/.gitignore
index ed79ebdf286e..92a8249006d1 100644
--- a/tools/testing/selftests/nsfs/.gitignore
+++ b/tools/testing/selftests/filesystems/nsfs/.gitignore
@@ -1,3 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
owner
pidns
+iterate_mntns
diff --git a/tools/testing/selftests/nsfs/Makefile b/tools/testing/selftests/filesystems/nsfs/Makefile
index dd9bd50b7b93..231aaa7dfd95 100644
--- a/tools/testing/selftests/nsfs/Makefile
+++ b/tools/testing/selftests/filesystems/nsfs/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
-TEST_GEN_PROGS := owner pidns
+TEST_GEN_PROGS := owner pidns iterate_mntns
CFLAGS := -Wall -Werror
-include ../lib.mk
+include ../../lib.mk
diff --git a/tools/testing/selftests/nsfs/config b/tools/testing/selftests/filesystems/nsfs/config
index 598d0a225fc9..598d0a225fc9 100644
--- a/tools/testing/selftests/nsfs/config
+++ b/tools/testing/selftests/filesystems/nsfs/config
diff --git a/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c b/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c
new file mode 100644
index 000000000000..457cf76f3c5f
--- /dev/null
+++ b/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright (c) 2024 Christian Brauner <brauner@kernel.org>
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <unistd.h>
+
+#include "../../kselftest_harness.h"
+
+#define MNT_NS_COUNT 11
+#define MNT_NS_LAST_INDEX 10
+
+struct mnt_ns_info {
+ __u32 size;
+ __u32 nr_mounts;
+ __u64 mnt_ns_id;
+};
+
+#define MNT_NS_INFO_SIZE_VER0 16 /* size of first published struct */
+
+/* Get information about namespace. */
+#define NS_MNT_GET_INFO _IOR(0xb7, 10, struct mnt_ns_info)
+/* Get next namespace. */
+#define NS_MNT_GET_NEXT _IOR(0xb7, 11, struct mnt_ns_info)
+/* Get previous namespace. */
+#define NS_MNT_GET_PREV _IOR(0xb7, 12, struct mnt_ns_info)
+
+FIXTURE(iterate_mount_namespaces) {
+ int fd_mnt_ns[MNT_NS_COUNT];
+ __u64 mnt_ns_id[MNT_NS_COUNT];
+};
+
+FIXTURE_SETUP(iterate_mount_namespaces)
+{
+ for (int i = 0; i < MNT_NS_COUNT; i++)
+ self->fd_mnt_ns[i] = -EBADF;
+
+ /*
+ * Creating a new user namespace let's us guarantee that we only see
+ * mount namespaces that we did actually create.
+ */
+ ASSERT_EQ(unshare(CLONE_NEWUSER), 0);
+
+ for (int i = 0; i < MNT_NS_COUNT; i++) {
+ struct mnt_ns_info info = {};
+
+ ASSERT_EQ(unshare(CLONE_NEWNS), 0);
+ self->fd_mnt_ns[i] = open("/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC);
+ ASSERT_GE(self->fd_mnt_ns[i], 0);
+ ASSERT_EQ(ioctl(self->fd_mnt_ns[i], NS_MNT_GET_INFO, &info), 0);
+ self->mnt_ns_id[i] = info.mnt_ns_id;
+ }
+}
+
+FIXTURE_TEARDOWN(iterate_mount_namespaces)
+{
+ for (int i = 0; i < MNT_NS_COUNT; i++) {
+ if (self->fd_mnt_ns[i] < 0)
+ continue;
+ ASSERT_EQ(close(self->fd_mnt_ns[i]), 0);
+ }
+}
+
+TEST_F(iterate_mount_namespaces, iterate_all_forward)
+{
+ int fd_mnt_ns_cur, count = 0;
+
+ fd_mnt_ns_cur = fcntl(self->fd_mnt_ns[0], F_DUPFD_CLOEXEC);
+ ASSERT_GE(fd_mnt_ns_cur, 0);
+
+ for (;; count++) {
+ struct mnt_ns_info info = {};
+ int fd_mnt_ns_next;
+
+ fd_mnt_ns_next = ioctl(fd_mnt_ns_cur, NS_MNT_GET_NEXT, &info);
+ if (fd_mnt_ns_next < 0 && errno == ENOENT)
+ break;
+ ASSERT_GE(fd_mnt_ns_next, 0);
+ ASSERT_EQ(close(fd_mnt_ns_cur), 0);
+ fd_mnt_ns_cur = fd_mnt_ns_next;
+ }
+ ASSERT_EQ(count, MNT_NS_LAST_INDEX);
+}
+
+TEST_F(iterate_mount_namespaces, iterate_all_backwards)
+{
+ int fd_mnt_ns_cur, count = 0;
+
+ fd_mnt_ns_cur = fcntl(self->fd_mnt_ns[MNT_NS_LAST_INDEX], F_DUPFD_CLOEXEC);
+ ASSERT_GE(fd_mnt_ns_cur, 0);
+
+ for (;; count++) {
+ struct mnt_ns_info info = {};
+ int fd_mnt_ns_prev;
+
+ fd_mnt_ns_prev = ioctl(fd_mnt_ns_cur, NS_MNT_GET_PREV, &info);
+ if (fd_mnt_ns_prev < 0 && errno == ENOENT)
+ break;
+ ASSERT_GE(fd_mnt_ns_prev, 0);
+ ASSERT_EQ(close(fd_mnt_ns_cur), 0);
+ fd_mnt_ns_cur = fd_mnt_ns_prev;
+ }
+ ASSERT_EQ(count, MNT_NS_LAST_INDEX);
+}
+
+TEST_F(iterate_mount_namespaces, iterate_forward)
+{
+ int fd_mnt_ns_cur;
+
+ ASSERT_EQ(setns(self->fd_mnt_ns[0], CLONE_NEWNS), 0);
+
+ fd_mnt_ns_cur = self->fd_mnt_ns[0];
+ for (int i = 1; i < MNT_NS_COUNT; i++) {
+ struct mnt_ns_info info = {};
+ int fd_mnt_ns_next;
+
+ fd_mnt_ns_next = ioctl(fd_mnt_ns_cur, NS_MNT_GET_NEXT, &info);
+ ASSERT_GE(fd_mnt_ns_next, 0);
+ ASSERT_EQ(close(fd_mnt_ns_cur), 0);
+ fd_mnt_ns_cur = fd_mnt_ns_next;
+ ASSERT_EQ(info.mnt_ns_id, self->mnt_ns_id[i]);
+ }
+}
+
+TEST_F(iterate_mount_namespaces, iterate_backward)
+{
+ int fd_mnt_ns_cur;
+
+ ASSERT_EQ(setns(self->fd_mnt_ns[MNT_NS_LAST_INDEX], CLONE_NEWNS), 0);
+
+ fd_mnt_ns_cur = self->fd_mnt_ns[MNT_NS_LAST_INDEX];
+ for (int i = MNT_NS_LAST_INDEX - 1; i >= 0; i--) {
+ struct mnt_ns_info info = {};
+ int fd_mnt_ns_prev;
+
+ fd_mnt_ns_prev = ioctl(fd_mnt_ns_cur, NS_MNT_GET_PREV, &info);
+ ASSERT_GE(fd_mnt_ns_prev, 0);
+ ASSERT_EQ(close(fd_mnt_ns_cur), 0);
+ fd_mnt_ns_cur = fd_mnt_ns_prev;
+ ASSERT_EQ(info.mnt_ns_id, self->mnt_ns_id[i]);
+ }
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/nsfs/owner.c b/tools/testing/selftests/filesystems/nsfs/owner.c
index 96a976c74550..96a976c74550 100644
--- a/tools/testing/selftests/nsfs/owner.c
+++ b/tools/testing/selftests/filesystems/nsfs/owner.c
diff --git a/tools/testing/selftests/nsfs/pidns.c b/tools/testing/selftests/filesystems/nsfs/pidns.c
index e3c772c6a7c7..e3c772c6a7c7 100644
--- a/tools/testing/selftests/nsfs/pidns.c
+++ b/tools/testing/selftests/filesystems/nsfs/pidns.c
diff --git a/tools/testing/selftests/filesystems/statmount/Makefile b/tools/testing/selftests/filesystems/statmount/Makefile
index 3af3136e35a4..14ee91a41650 100644
--- a/tools/testing/selftests/filesystems/statmount/Makefile
+++ b/tools/testing/selftests/filesystems/statmount/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-or-later
CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES)
-TEST_GEN_PROGS := statmount_test statmount_test_ns
+TEST_GEN_PROGS := statmount_test statmount_test_ns listmount_test
include ../../lib.mk
diff --git a/tools/testing/selftests/filesystems/statmount/listmount_test.c b/tools/testing/selftests/filesystems/statmount/listmount_test.c
new file mode 100644
index 000000000000..15f0834f7557
--- /dev/null
+++ b/tools/testing/selftests/filesystems/statmount/listmount_test.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright (c) 2024 Christian Brauner <brauner@kernel.org>
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <unistd.h>
+
+#include "statmount.h"
+#include "../../kselftest_harness.h"
+
+#ifndef LISTMOUNT_REVERSE
+#define LISTMOUNT_REVERSE (1 << 0) /* List later mounts first */
+#endif
+
+#define LISTMNT_BUFFER 10
+
+/* Check that all mount ids are in increasing order. */
+TEST(listmount_forward)
+{
+ uint64_t list[LISTMNT_BUFFER], last_mnt_id = 0;
+
+ for (;;) {
+ ssize_t nr_mounts;
+
+ nr_mounts = listmount(LSMT_ROOT, 0, last_mnt_id,
+ list, LISTMNT_BUFFER, 0);
+ ASSERT_GE(nr_mounts, 0);
+ if (nr_mounts == 0)
+ break;
+
+ for (size_t cur = 0; cur < nr_mounts; cur++) {
+ if (cur < nr_mounts - 1)
+ ASSERT_LT(list[cur], list[cur + 1]);
+ last_mnt_id = list[cur];
+ }
+ }
+}
+
+/* Check that all mount ids are in decreasing order. */
+TEST(listmount_backward)
+{
+ uint64_t list[LISTMNT_BUFFER], last_mnt_id = 0;
+
+ for (;;) {
+ ssize_t nr_mounts;
+
+ nr_mounts = listmount(LSMT_ROOT, 0, last_mnt_id,
+ list, LISTMNT_BUFFER, LISTMOUNT_REVERSE);
+ ASSERT_GE(nr_mounts, 0);
+ if (nr_mounts == 0)
+ break;
+
+ for (size_t cur = 0; cur < nr_mounts; cur++) {
+ if (cur < nr_mounts - 1)
+ ASSERT_GT(list[cur], list[cur + 1]);
+ last_mnt_id = list[cur];
+ }
+ }
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h
index 28a471c88c51..0b96ac4b8ce5 100644
--- a/tools/testing/selftests/pidfd/pidfd.h
+++ b/tools/testing/selftests/pidfd/pidfd.h
@@ -12,7 +12,6 @@
#include <stdlib.h>
#include <string.h>
#include <syscall.h>
-#include <sys/mount.h>
#include <sys/types.h>
#include <sys/wait.h>