From 98d2b43081972abeb5bb5a087bc3e3197531c46e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 25 Oct 2023 16:01:59 +0200 Subject: add unique mount ID If a mount is released then its mnt_id can immediately be reused. This is bad news for user interfaces that want to uniquely identify a mount. Implementing a unique mount ID is trivial (use a 64bit counter). Unfortunately userspace assumes a 32bit size and would overflow after the counter reaches 2^32. Introduce a new 64bit ID alongside the old one. Initialize the counter to 2^32; this guarantees that the old and new IDs are never mixed up. Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/r/20231025140205.3586473-2-mszeredi@redhat.com Reviewed-by: Ian Kent Signed-off-by: Christian Brauner --- fs/namespace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index fbf0e596fcd3..0bcba81402b5 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -68,6 +68,9 @@ static u64 event; static DEFINE_IDA(mnt_id_ida); static DEFINE_IDA(mnt_group_ida); +/* Don't allow confusion with old 32bit mount ID */ +static atomic64_t mnt_id_ctr = ATOMIC64_INIT(1ULL << 32); + static struct hlist_head *mount_hashtable __ro_after_init; static struct hlist_head *mountpoint_hashtable __ro_after_init; static struct kmem_cache *mnt_cache __ro_after_init; @@ -131,6 +134,7 @@ static int mnt_alloc_id(struct mount *mnt) if (res < 0) return res; mnt->mnt_id = res; + mnt->mnt_id_unique = atomic64_inc_return(&mnt_id_ctr); return 0; } -- cgit From 2eea9ce4310d8c0f8ef1dbe7b0e7d9219ff02b97 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 25 Oct 2023 16:02:00 +0200 Subject: mounts: keep list of mounts in an rbtree When adding a mount to a namespace, insert it into an rbtree rooted in the mnt_namespace instead of a linear list. The mnt.mnt_list is still used to set up the mount tree and for propagation, but not after the mount has been added to a namespace. Hence mnt_list can live in a union with rb_node. Use the MNT_ONRB mount flag to validate that the mount is on the correct list. This allows removing the cursor used for reading /proc/$PID/mountinfo. The mnt_id_unique of the next mount can be used as an index into the seq file. Tested by inserting 100k bind mounts, unsharing the mount namespace, and unmounting. No performance regressions have been observed. For the last mount in the 100k list, the statmount() call was more than 100x faster due to the mount ID lookup not having to do a linear search. This patch makes the overhead of mount ID lookup non-observable in this range. Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/r/20231025140205.3586473-3-mszeredi@redhat.com Reviewed-by: Ian Kent Signed-off-by: Christian Brauner --- fs/namespace.c | 190 +++++++++++++++++++++++++++------------------------- 1 file changed, 89 insertions(+), 101 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index 0bcba81402b5..bbe94096e262 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -734,21 +734,6 @@ struct vfsmount *lookup_mnt(const struct path *path) return m; } -static inline void lock_ns_list(struct mnt_namespace *ns) -{ - spin_lock(&ns->ns_lock); -} - -static inline void unlock_ns_list(struct mnt_namespace *ns) -{ - spin_unlock(&ns->ns_lock); -} - -static inline bool mnt_is_cursor(struct mount *mnt) -{ - return mnt->mnt.mnt_flags & MNT_CURSOR; -} - /* * __is_local_mountpoint - Test to see if dentry is a mountpoint in the * current mount namespace.
@@ -767,19 +752,15 @@ static inline bool mnt_is_cursor(struct mount *mnt) bool __is_local_mountpoint(struct dentry *dentry) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; - struct mount *mnt; + struct mount *mnt, *n; bool is_covered = false; down_read(&namespace_sem); - lock_ns_list(ns); - list_for_each_entry(mnt, &ns->list, mnt_list) { - if (mnt_is_cursor(mnt)) - continue; + rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) { is_covered = (mnt->mnt_mountpoint == dentry); if (is_covered) break; } - unlock_ns_list(ns); up_read(&namespace_sem); return is_covered; @@ -1026,6 +1007,30 @@ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct m mnt_add_count(old_parent, -1); } +static inline struct mount *node_to_mount(struct rb_node *node) +{ + return rb_entry(node, struct mount, mnt_node); +} + +static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) +{ + struct rb_node **link = &ns->mounts.rb_node; + struct rb_node *parent = NULL; + + WARN_ON(mnt->mnt.mnt_flags & MNT_ONRB); + mnt->mnt_ns = ns; + while (*link) { + parent = *link; + if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) + link = &parent->rb_left; + else + link = &parent->rb_right; + } + rb_link_node(&mnt->mnt_node, parent, link); + rb_insert_color(&mnt->mnt_node, &ns->mounts); + mnt->mnt.mnt_flags |= MNT_ONRB; +} + /* * vfsmount lock must be held for write */ @@ -1039,12 +1044,13 @@ static void commit_tree(struct mount *mnt) BUG_ON(parent == mnt); list_add_tail(&head, &mnt->mnt_list); - list_for_each_entry(m, &head, mnt_list) - m->mnt_ns = n; + while (!list_empty(&head)) { + m = list_first_entry(&head, typeof(*m), mnt_list); + list_del(&m->mnt_list); - list_splice(&head, n->list.prev); - - n->mounts += n->pending_mounts; + mnt_add_to_ns(n, m); + } + n->nr_mounts += n->pending_mounts; n->pending_mounts = 0; __attach_mnt(mnt, parent); @@ -1192,7 +1198,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, } mnt->mnt.mnt_flags = old->mnt.mnt_flags; - mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); + mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_ONRB); atomic_inc(&sb->s_active); mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); @@ -1417,65 +1423,57 @@ struct vfsmount *mnt_clone_internal(const struct path *path) return &p->mnt; } -#ifdef CONFIG_PROC_FS -static struct mount *mnt_list_next(struct mnt_namespace *ns, - struct list_head *p) +/* + * Returns the mount which either has the specified mnt_id, or has the next + * smallest id after the specified one. + */ +static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id) { - struct mount *mnt, *ret = NULL; + struct rb_node *node = ns->mounts.rb_node; + struct mount *ret = NULL; - lock_ns_list(ns); - list_for_each_continue(p, &ns->list) { - mnt = list_entry(p, typeof(*mnt), mnt_list); - if (!mnt_is_cursor(mnt)) { - ret = mnt; - break; + while (node) { + struct mount *m = node_to_mount(node); + + if (mnt_id <= m->mnt_id_unique) { + ret = node_to_mount(node); + if (mnt_id == m->mnt_id_unique) + break; + node = node->rb_left; + } else { + node = node->rb_right; } } - unlock_ns_list(ns); - return ret; } +#ifdef CONFIG_PROC_FS + /* iterator; we want it to have access to namespace_sem, thus here...
*/ static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_mounts *p = m->private; - struct list_head *prev; down_read(&namespace_sem); - if (!*pos) { - prev = &p->ns->list; - } else { - prev = &p->cursor.mnt_list; - /* Read after we'd reached the end? */ - if (list_empty(prev)) - return NULL; - } - - return mnt_list_next(p->ns, prev); + return mnt_find_id_at(p->ns, *pos); } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { - struct proc_mounts *p = m->private; - struct mount *mnt = v; + struct mount *next = NULL, *mnt = v; + struct rb_node *node = rb_next(&mnt->mnt_node); ++*pos; - return mnt_list_next(p->ns, &mnt->mnt_list); + if (node) { + next = node_to_mount(node); + *pos = next->mnt_id_unique; + } + return next; } static void m_stop(struct seq_file *m, void *v) { - struct proc_mounts *p = m->private; - struct mount *mnt = v; - - lock_ns_list(p->ns); - if (mnt) - list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list); - else - list_del_init(&p->cursor.mnt_list); - unlock_ns_list(p->ns); up_read(&namespace_sem); } @@ -1493,14 +1491,6 @@ const struct seq_operations mounts_op = { .show = m_show, }; -void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor) -{ - down_read(&namespace_sem); - lock_ns_list(ns); - list_del(&cursor->mnt_list); - unlock_ns_list(ns); - up_read(&namespace_sem); -} #endif /* CONFIG_PROC_FS */ /** @@ -1642,7 +1632,10 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) /* Gather the mounts to umount */ for (p = mnt; p; p = next_mnt(p, mnt)) { p->mnt.mnt_flags |= MNT_UMOUNT; - list_move(&p->mnt_list, &tmp_list); + if (p->mnt.mnt_flags & MNT_ONRB) + move_from_ns(p, &tmp_list); + else + list_move(&p->mnt_list, &tmp_list); } /* Hide the mounts from mnt_mounts */ @@ -1662,7 +1655,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) list_del_init(&p->mnt_list); ns = p->mnt_ns; if (ns) { - ns->mounts--; + ns->nr_mounts--; __touch_mnt_namespace(ns); } p->mnt_ns = NULL; @@ -1788,14 +1781,16 @@ static int do_umount(struct mount *mnt, int flags) event++; if (flags & MNT_DETACH) { - if (!list_empty(&mnt->mnt_list)) + if (mnt->mnt.mnt_flags & MNT_ONRB || + !list_empty(&mnt->mnt_list)) umount_tree(mnt, UMOUNT_PROPAGATE); retval = 0; } else { shrink_submounts(mnt); retval = -EBUSY; if (!propagate_mount_busy(mnt, 2)) { - if (!list_empty(&mnt->mnt_list)) + if (mnt->mnt.mnt_flags & MNT_ONRB || + !list_empty(&mnt->mnt_list)) umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); retval = 0; } @@ -2213,9 +2208,9 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt) unsigned int mounts = 0; struct mount *p; - if (ns->mounts >= max) + if (ns->nr_mounts >= max) return -ENOSPC; - max -= ns->mounts; + max -= ns->nr_mounts; if (ns->pending_mounts >= max) return -ENOSPC; max -= ns->pending_mounts; @@ -2359,8 +2354,12 @@ static int attach_recursive_mnt(struct mount *source_mnt, touch_mnt_namespace(source_mnt->mnt_ns); } else { if (source_mnt->mnt_ns) { + LIST_HEAD(head); + /* move from anon - the caller will destroy */ - list_del_init(&source_mnt->mnt_ns->list); + for (p = source_mnt; p; p = next_mnt(p, source_mnt)) + move_from_ns(p, &head); + list_del_init(&head); } if (beneath) mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp); @@ -2671,11 +2670,10 @@ static struct file *open_detached_copy(struct path *path, bool recursive) lock_mount_hash(); for (p = mnt; p; p = next_mnt(p, mnt)) { - p->mnt_ns = ns; - ns->mounts++; + mnt_add_to_ns(ns, p); + ns->nr_mounts++; } ns->root = mnt; - list_add_tail(&ns->list, 
&mnt->mnt_list); mntget(&mnt->mnt); unlock_mount_hash(); namespace_unlock(); @@ -3738,9 +3736,8 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a if (!anon) new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); refcount_set(&new_ns->ns.count, 1); - INIT_LIST_HEAD(&new_ns->list); + new_ns->mounts = RB_ROOT; init_waitqueue_head(&new_ns->poll); - spin_lock_init(&new_ns->ns_lock); new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; return new_ns; @@ -3787,7 +3784,6 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, unlock_mount_hash(); } new_ns->root = new; - list_add_tail(&new_ns->list, &new->mnt_list); /* * Second pass: switch the tsk->fs->* elements and mark new vfsmounts @@ -3797,8 +3793,8 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, p = old; q = new; while (p) { - q->mnt_ns = new_ns; - new_ns->mounts++; + mnt_add_to_ns(new_ns, q); + new_ns->nr_mounts++; if (new_fs) { if (&p->mnt == new_fs->root.mnt) { new_fs->root.mnt = mntget(&q->mnt); @@ -3840,10 +3836,9 @@ struct dentry *mount_subtree(struct vfsmount *m, const char *name) mntput(m); return ERR_CAST(ns); } - mnt->mnt_ns = ns; ns->root = mnt; - ns->mounts++; - list_add(&mnt->mnt_list, &ns->list); + ns->nr_mounts++; + mnt_add_to_ns(ns, mnt); err = vfs_path_lookup(m->mnt_root, m, name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); @@ -4021,10 +4016,9 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, goto err_path; } mnt = real_mount(newmount.mnt); - mnt->mnt_ns = ns; ns->root = mnt; - ns->mounts = 1; - list_add(&mnt->mnt_list, &ns->list); + ns->nr_mounts = 1; + mnt_add_to_ns(ns, mnt); mntget(newmount.mnt); /* Attach to an apparent O_PATH fd with a note that we need to unmount @@ -4695,10 +4689,9 @@ static void __init init_mount_tree(void) if (IS_ERR(ns)) panic("Can't allocate initial namespace"); m = real_mount(mnt); - m->mnt_ns = ns; ns->root = m; - ns->mounts = 1; - list_add(&m->mnt_list, &ns->list); + ns->nr_mounts = 1; + mnt_add_to_ns(ns, m); init_task.nsproxy->mnt_ns = ns; get_mnt_ns(ns); @@ -4825,18 +4818,14 @@ static bool mnt_already_visible(struct mnt_namespace *ns, int *new_mnt_flags) { int new_flags = *new_mnt_flags; - struct mount *mnt; + struct mount *mnt, *n; bool visible = false; down_read(&namespace_sem); - lock_ns_list(ns); - list_for_each_entry(mnt, &ns->list, mnt_list) { + rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) { struct mount *child; int mnt_flags; - if (mnt_is_cursor(mnt)) - continue; - if (mnt->mnt.mnt_sb->s_type != sb->s_type) continue; @@ -4884,7 +4873,6 @@ static bool mnt_already_visible(struct mnt_namespace *ns, next: ; } found: - unlock_ns_list(ns); up_read(&namespace_sem); return visible; } -- cgit From 56c94c626785001dbb683312725b7d87c6ec6a91 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 25 Oct 2023 16:02:01 +0200 Subject: namespace: extract show_path() helper To be used by the statmount(2) syscall as well. 
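A note before the next patch: the rbtree above is keyed by the ever-growing mnt_id_unique, so any iteration over a namespace can be resumed with a "smallest ID at or after X" descent, which is exactly what mnt_find_id_at() implements. The following stand-alone C sketch reproduces that descent on a plain, unbalanced binary search tree; the kernel uses a proper rbtree, and every name below is illustrative rather than kernel API.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	uint64_t id;
	struct node *left, *right;
};

/* Descend like mnt_find_id_at(): remember the best candidate every time
 * we go left, stop early on an exact match. Returns the node with the
 * smallest id >= the requested one, or NULL if none exists. */
static struct node *find_id_at(struct node *root, uint64_t id)
{
	struct node *ret = NULL;

	while (root) {
		if (id <= root->id) {
			ret = root;
			if (id == root->id)
				break;
			root = root->left;
		} else {
			root = root->right;
		}
	}
	return ret;
}

/* Sorted insert, mirroring mnt_add_to_ns() minus the rebalancing. */
static struct node *insert(struct node *root, struct node *n)
{
	struct node **link = &root;

	while (*link)
		link = n->id < (*link)->id ? &(*link)->left : &(*link)->right;
	*link = n;
	return root;
}

int main(void)
{
	/* IDs start at 2^32 = 4294967296, as the first patch arranges */
	uint64_t ids[] = { 4294967298ULL, 4294967296ULL, 4294967301ULL };
	struct node *root = NULL;

	for (size_t i = 0; i < sizeof(ids) / sizeof(ids[0]); i++) {
		struct node *n = calloc(1, sizeof(*n));

		if (!n)
			return 1;
		n->id = ids[i];
		root = insert(root, n);
	}

	/* Resume iteration "at or after" an ID, as m_start() does with *pos */
	struct node *hit = find_id_at(root, 4294967297ULL);

	printf("next id: %llu\n", hit ? (unsigned long long)hit->id : 0ULL);
	return 0;
}

Going left records a candidate, so the search naturally lands on the smallest ID greater than or equal to the target; this is what lets the /proc/$PID/mountinfo iterator treat the seq-file position as a mount ID instead of keeping a cursor in the list.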
Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/r/20231025140205.3586473-4-mszeredi@redhat.com Reviewed-by: Ian Kent Signed-off-by: Christian Brauner --- fs/namespace.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index bbe94096e262..d3665d025acb 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4674,6 +4674,15 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, return err; } +int show_path(struct seq_file *m, struct dentry *root) +{ + if (root->d_sb->s_op->show_path) + return root->d_sb->s_op->show_path(m, root); + + seq_dentry(m, root, " \t\n\\"); + return 0; +} + static void __init init_mount_tree(void) { struct vfsmount *mnt; -- cgit From 46eae99ef73302f9fb3dddcd67c374b3dffe8fd6 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 25 Oct 2023 16:02:02 +0200 Subject: add statmount(2) syscall Add a way to query attributes of a single mount instead of having to parse the complete /proc/$PID/mountinfo, which might be huge. Look up the mount via the new 64bit mount ID. If a mount needs to be queried based on path, then statx(2) can be used to first query the mount ID belonging to the path. Design is based on a suggestion by Linus: "So I'd suggest something that is very much like "statfsat()", which gets a buffer and a length, and returns an extended "struct statfs" *AND* just a string description at the end." The interface closely mimics that of statx. Handle ASCII attributes by appending after the end of the structure (as per above suggestion). Pointers to strings are stored in u64 members to make the structure the same regardless of pointer size. Strings are nul terminated. Link: https://lore.kernel.org/all/CAHk-=wh5YifP7hzKSbwJj94+DZ2czjrZsczy6GBimiogZws=rg@mail.gmail.com/ Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/r/20231025140205.3586473-5-mszeredi@redhat.com Reviewed-by: Ian Kent [Christian Brauner : various minor changes] Signed-off-by: Christian Brauner --- fs/namespace.c | 281 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 281 insertions(+) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index d3665d025acb..ae35d8b6aca8 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4683,6 +4683,287 @@ int show_path(struct seq_file *m, struct dentry *root) return 0; } +static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns) +{ + struct mount *mnt = mnt_find_id_at(ns, id); + + if (!mnt || mnt->mnt_id_unique != id) + return NULL; + + return &mnt->mnt; +} + +struct kstatmount { + struct statmount __user *const buf; + size_t const bufsize; + struct vfsmount *const mnt; + u64 const mask; + struct seq_file seq; + struct path root; + struct statmount sm; + size_t pos; + int err; +}; + +typedef int (*statmount_func_t)(struct kstatmount *); + +static int statmount_string_seq(struct kstatmount *s, statmount_func_t func) +{ + size_t rem = s->bufsize - s->pos - sizeof(s->sm); + struct seq_file *seq = &s->seq; + int ret; + + seq->count = 0; + seq->size = min(seq->size, rem); + seq->buf = kvmalloc(seq->size, GFP_KERNEL_ACCOUNT); + if (!seq->buf) + return -ENOMEM; + + ret = func(s); + if (ret) + return ret; + + if (seq_has_overflowed(seq)) { + if (seq->size == rem) + return -EOVERFLOW; + seq->size *= 2; + if (seq->size > MAX_RW_COUNT) + return -ENOMEM; + kvfree(seq->buf); + return 0; + } + + /* Done */ + return 1; +} + +static void statmount_string(struct kstatmount *s, u64 mask, statmount_func_t func, + u32
*str) +{ + int ret = s->pos + sizeof(s->sm) >= s->bufsize ? -EOVERFLOW : 0; + struct statmount *sm = &s->sm; + struct seq_file *seq = &s->seq; + + if (s->err || !(s->mask & mask)) + return; + + seq->size = PAGE_SIZE; + while (!ret) + ret = statmount_string_seq(s, func); + + if (ret < 0) { + s->err = ret; + } else { + seq->buf[seq->count++] = '\0'; + if (copy_to_user(s->buf->str + s->pos, seq->buf, seq->count)) { + s->err = -EFAULT; + } else { + *str = s->pos; + s->pos += seq->count; + } + } + kvfree(seq->buf); + sm->mask |= mask; +} + +static void statmount_numeric(struct kstatmount *s, u64 mask, statmount_func_t func) +{ + if (s->err || !(s->mask & mask)) + return; + + s->err = func(s); + s->sm.mask |= mask; +} + +static u64 mnt_to_attr_flags(struct vfsmount *mnt) +{ + unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags); + u64 attr_flags = 0; + + if (mnt_flags & MNT_READONLY) + attr_flags |= MOUNT_ATTR_RDONLY; + if (mnt_flags & MNT_NOSUID) + attr_flags |= MOUNT_ATTR_NOSUID; + if (mnt_flags & MNT_NODEV) + attr_flags |= MOUNT_ATTR_NODEV; + if (mnt_flags & MNT_NOEXEC) + attr_flags |= MOUNT_ATTR_NOEXEC; + if (mnt_flags & MNT_NODIRATIME) + attr_flags |= MOUNT_ATTR_NODIRATIME; + if (mnt_flags & MNT_NOSYMFOLLOW) + attr_flags |= MOUNT_ATTR_NOSYMFOLLOW; + + if (mnt_flags & MNT_NOATIME) + attr_flags |= MOUNT_ATTR_NOATIME; + else if (mnt_flags & MNT_RELATIME) + attr_flags |= MOUNT_ATTR_RELATIME; + else + attr_flags |= MOUNT_ATTR_STRICTATIME; + + if (is_idmapped_mnt(mnt)) + attr_flags |= MOUNT_ATTR_IDMAP; + + return attr_flags; +} + +static u64 mnt_to_propagation_flags(struct mount *m) +{ + u64 propagation = 0; + + if (IS_MNT_SHARED(m)) + propagation |= MS_SHARED; + if (IS_MNT_SLAVE(m)) + propagation |= MS_SLAVE; + if (IS_MNT_UNBINDABLE(m)) + propagation |= MS_UNBINDABLE; + if (!propagation) + propagation |= MS_PRIVATE; + + return propagation; +} + +static int statmount_sb_basic(struct kstatmount *s) +{ + struct super_block *sb = s->mnt->mnt_sb; + + s->sm.sb_dev_major = MAJOR(sb->s_dev); + s->sm.sb_dev_minor = MINOR(sb->s_dev); + s->sm.sb_magic = sb->s_magic; + s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME); + + return 0; +} + +static int statmount_mnt_basic(struct kstatmount *s) +{ + struct mount *m = real_mount(s->mnt); + + s->sm.mnt_id = m->mnt_id_unique; + s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique; + s->sm.mnt_id_old = m->mnt_id; + s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id; + s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt); + s->sm.mnt_propagation = mnt_to_propagation_flags(m); + s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0; + s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0; + + return 0; +} + +static int statmount_propagate_from(struct kstatmount *s) +{ + struct mount *m = real_mount(s->mnt); + + if (!IS_MNT_SLAVE(m)) + return 0; + + s->sm.propagate_from = get_dominating_id(m, ¤t->fs->root); + + return 0; +} + +static int statmount_mnt_root(struct kstatmount *s) +{ + struct seq_file *seq = &s->seq; + int err = show_path(seq, s->mnt->mnt_root); + + if (!err && !seq_has_overflowed(seq)) { + seq->buf[seq->count] = '\0'; + seq->count = string_unescape_inplace(seq->buf, UNESCAPE_OCTAL); + } + return err; +} + +static int statmount_mnt_point(struct kstatmount *s) +{ + struct vfsmount *mnt = s->mnt; + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + int err = seq_path_root(&s->seq, &mnt_path, &s->root, ""); + + return err == SEQ_SKIP ? 
0 : err; +} + +static int statmount_fs_type(struct kstatmount *s) +{ + struct seq_file *seq = &s->seq; + struct super_block *sb = s->mnt->mnt_sb; + + seq_puts(seq, sb->s_type->name); + return 0; +} + +static int do_statmount(struct kstatmount *s) +{ + struct statmount *sm = &s->sm; + struct mount *m = real_mount(s->mnt); + size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm)); + int err; + + /* + * Don't trigger audit denials. We just want to determine what + * mounts to show users. + */ + if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) && + !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + err = security_sb_statfs(s->mnt->mnt_root); + if (err) + return err; + + statmount_numeric(s, STATMOUNT_SB_BASIC, statmount_sb_basic); + statmount_numeric(s, STATMOUNT_MNT_BASIC, statmount_mnt_basic); + statmount_numeric(s, STATMOUNT_PROPAGATE_FROM, statmount_propagate_from); + statmount_string(s, STATMOUNT_FS_TYPE, statmount_fs_type, &sm->fs_type); + statmount_string(s, STATMOUNT_MNT_ROOT, statmount_mnt_root, &sm->mnt_root); + statmount_string(s, STATMOUNT_MNT_POINT, statmount_mnt_point, &sm->mnt_point); + + if (s->err) + return s->err; + + /* Return the number of bytes copied to the buffer */ + sm->size = copysize + s->pos; + + if (copy_to_user(s->buf, sm, copysize)) + return -EFAULT; + + return 0; +} + +SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, + struct statmount __user *, buf, size_t, bufsize, + unsigned int, flags) +{ + struct vfsmount *mnt; + struct mnt_id_req kreq; + int ret; + + if (flags) + return -EINVAL; + + if (copy_from_user(&kreq, req, sizeof(kreq))) + return -EFAULT; + + down_read(&namespace_sem); + mnt = lookup_mnt_in_ns(kreq.mnt_id, current->nsproxy->mnt_ns); + ret = -ENOENT; + if (mnt) { + struct kstatmount s = { + .mask = kreq.request_mask, + .buf = buf, + .bufsize = bufsize, + .mnt = mnt, + }; + + get_fs_root(current->fs, &s.root); + ret = do_statmount(&s); + path_put(&s.root); + } + up_read(&namespace_sem); + + return ret; +} + static void __init init_mount_tree(void) { struct vfsmount *mnt; -- cgit From 6971beb4ec52ad7b26f5b5eedf19d4963be7df1b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 19 Nov 2023 19:03:33 +0100 Subject: statmount: simplify numeric option retrieval Don't use all of this indirection, which makes it really hard to follow code that is very basic. Error handling is also not really needed here at all.
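For orientation, here is how userspace might drive statmount() end to end. This is a hedged sketch rather than part of the series: it uses the UAPI names the interface finally settled on in <linux/mount.h> (later patches below add a @size field to struct mnt_id_req and rename @request_mask to @param), and it assumes the x86_64 syscall number 457; both are assumptions to adjust for other architectures or header versions.

#define _GNU_SOURCE
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mount.h>

#ifndef __NR_statmount
#define __NR_statmount 457	/* x86_64; an assumption for older headers */
#endif

/* Query one mount by its 64-bit ID, growing the buffer on EOVERFLOW. */
static struct statmount *query_mount(uint64_t mnt_id, uint64_t mask)
{
	struct mnt_id_req req = {
		.size = MNT_ID_REQ_SIZE_VER0,
		.mnt_id = mnt_id,
		.param = mask,	/* request mask; @request_mask before the rename */
	};
	size_t bufsize = 1 << 10;
	struct statmount *buf;

	for (;;) {
		buf = calloc(1, bufsize);
		if (!buf)
			return NULL;
		if (syscall(__NR_statmount, &req, buf, bufsize, 0) == 0)
			return buf;
		free(buf);
		if (errno != EOVERFLOW)
			return NULL;
		bufsize <<= 1;	/* requested strings didn't fit; retry bigger */
	}
}

int main(int argc, char *argv[])
{
	struct statmount *sm;

	if (argc < 2)
		return 1;
	sm = query_mount(strtoull(argv[1], NULL, 0),
			 STATMOUNT_MNT_BASIC | STATMOUNT_FS_TYPE | STATMOUNT_MNT_POINT);
	if (!sm) {
		perror("statmount");
		return 1;
	}
	/* String fields are offsets into the trailing sm->str area */
	if (sm->mask & STATMOUNT_FS_TYPE)
		printf("fs type:  %s\n", sm->str + sm->fs_type);
	if (sm->mask & STATMOUNT_MNT_POINT)
		printf("mount on: %s\n", sm->str + sm->mnt_point);
	if (sm->mask & STATMOUNT_MNT_BASIC)
		printf("id %llu (parent %llu)\n",
		       (unsigned long long)sm->mnt_id,
		       (unsigned long long)sm->mnt_parent_id);
	free(sm);
	return 0;
}

The retry loop mirrors the kernel contract: -EOVERFLOW means the fixed part of the struct fit but the requested strings did not, so the caller grows the buffer and calls again.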
Signed-off-by: Christian Brauner --- fs/namespace.c | 42 +++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 25 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index ae35d8b6aca8..16c2f84c6570 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4766,15 +4766,6 @@ static void statmount_string(struct kstatmount *s, u64 mask, statmount_func_t fu sm->mask |= mask; } -static void statmount_numeric(struct kstatmount *s, u64 mask, statmount_func_t func) -{ - if (s->err || !(s->mask & mask)) - return; - - s->err = func(s); - s->sm.mask |= mask; -} - static u64 mnt_to_attr_flags(struct vfsmount *mnt) { unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags); @@ -4822,22 +4813,22 @@ static u64 mnt_to_propagation_flags(struct mount *m) return propagation; } -static int statmount_sb_basic(struct kstatmount *s) +static void statmount_sb_basic(struct kstatmount *s) { struct super_block *sb = s->mnt->mnt_sb; + s->sm.mask |= STATMOUNT_SB_BASIC; s->sm.sb_dev_major = MAJOR(sb->s_dev); s->sm.sb_dev_minor = MINOR(sb->s_dev); s->sm.sb_magic = sb->s_magic; s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME); - - return 0; } -static int statmount_mnt_basic(struct kstatmount *s) +static void statmount_mnt_basic(struct kstatmount *s) { struct mount *m = real_mount(s->mnt); + s->sm.mask |= STATMOUNT_MNT_BASIC; s->sm.mnt_id = m->mnt_id_unique; s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique; s->sm.mnt_id_old = m->mnt_id; @@ -4846,20 +4837,15 @@ static int statmount_mnt_basic(struct kstatmount *s) s->sm.mnt_propagation = mnt_to_propagation_flags(m); s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0; s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0; - - return 0; } -static int statmount_propagate_from(struct kstatmount *s) +static void statmount_propagate_from(struct kstatmount *s) { struct mount *m = real_mount(s->mnt); - if (!IS_MNT_SLAVE(m)) - return 0; - - s->sm.propagate_from = get_dominating_id(m, ¤t->fs->root); - - return 0; + s->sm.mask |= STATMOUNT_PROPAGATE_FROM; + if (IS_MNT_SLAVE(m)) + s->sm.propagate_from = get_dominating_id(m, ¤t->fs->root); } static int statmount_mnt_root(struct kstatmount *s) @@ -4911,9 +4897,15 @@ static int do_statmount(struct kstatmount *s) if (err) return err; - statmount_numeric(s, STATMOUNT_SB_BASIC, statmount_sb_basic); - statmount_numeric(s, STATMOUNT_MNT_BASIC, statmount_mnt_basic); - statmount_numeric(s, STATMOUNT_PROPAGATE_FROM, statmount_propagate_from); + if (s->mask & STATMOUNT_SB_BASIC) + statmount_sb_basic(s); + + if (s->mask & STATMOUNT_MNT_BASIC) + statmount_mnt_basic(s); + + if (s->mask & STATMOUNT_PROPAGATE_FROM) + statmount_propagate_from(s); + statmount_string(s, STATMOUNT_FS_TYPE, statmount_fs_type, &sm->fs_type); statmount_string(s, STATMOUNT_MNT_ROOT, statmount_mnt_root, &sm->mnt_root); statmount_string(s, STATMOUNT_MNT_POINT, statmount_mnt_point, &sm->mnt_point); -- cgit From 68385d77c05b401f748acecc0e0a8eff10489334 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 19 Nov 2023 20:58:49 +0100 Subject: statmount: simplify string option retrieval The previous code was a bit too clever for what we currently support. A few minor changes: * Avoid indirect function calls and use a simple switch statement. We really only have three cases to handle so it's not like it's massively complex. We can switch to something more elaborate should we introduce more complex options. 
* Defer all copy_to_user() calls until after we've given up the namespace semaphore. On kernels with userfaultfd it's possible to abuse copy_from/to_user() calls to indefinitely block on page faults. That's usually a privileged operation but may be made available to unprivileged users. Independent of userfaultfd it's better to not do all the copy_to_user() work while holding the namespace semaphore. Instead collect the information and then copy it out after we've given up all locks. * This also folds a change from Arnd to reduce the stack size in prepare_kstatmount() to avoid warnings such as: fs/namespace.c:4995:1: error: stack frame size (1536) exceeds limit (1024) in '__se_sys_statmount' [-Werror,-Wframe-larger-than] 4995 | SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, Reviewed-by: Ian Kent Link: https://lore.kernel.org/r/20231213090015.518044-1-arnd@kernel.org Signed-off-by: Christian Brauner --- fs/namespace.c | 247 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 145 insertions(+), 102 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index 16c2f84c6570..7f1618ed2aba 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4694,78 +4694,15 @@ static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns) } struct kstatmount { - struct statmount __user *const buf; - size_t const bufsize; - struct vfsmount *const mnt; - u64 const mask; - struct seq_file seq; + struct statmount __user *buf; + size_t bufsize; + struct vfsmount *mnt; + u64 mask; struct path root; struct statmount sm; - size_t pos; - int err; + struct seq_file seq; }; -typedef int (*statmount_func_t)(struct kstatmount *); - -static int statmount_string_seq(struct kstatmount *s, statmount_func_t func) -{ - size_t rem = s->bufsize - s->pos - sizeof(s->sm); - struct seq_file *seq = &s->seq; - int ret; - - seq->count = 0; - seq->size = min(seq->size, rem); - seq->buf = kvmalloc(seq->size, GFP_KERNEL_ACCOUNT); - if (!seq->buf) - return -ENOMEM; - - ret = func(s); - if (ret) - return ret; - - if (seq_has_overflowed(seq)) { - if (seq->size == rem) - return -EOVERFLOW; - seq->size *= 2; - if (seq->size > MAX_RW_COUNT) - return -ENOMEM; - kvfree(seq->buf); - return 0; - } - - /* Done */ - return 1; -} - -static void statmount_string(struct kstatmount *s, u64 mask, statmount_func_t func, - u32 *str) -{ - int ret = s->pos + sizeof(s->sm) >= s->bufsize ?
-EOVERFLOW : 0; - struct statmount *sm = &s->sm; - struct seq_file *seq = &s->seq; - - if (s->err || !(s->mask & mask)) - return; - - seq->size = PAGE_SIZE; - while (!ret) - ret = statmount_string_seq(s, func); - - if (ret < 0) { - s->err = ret; - } else { - seq->buf[seq->count++] = '\0'; - if (copy_to_user(s->buf->str + s->pos, seq->buf, seq->count)) { - s->err = -EFAULT; - } else { - *str = s->pos; - s->pos += seq->count; - } - } - kvfree(seq->buf); - sm->mask |= mask; -} - static u64 mnt_to_attr_flags(struct vfsmount *mnt) { unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags); @@ -4848,41 +4785,109 @@ static void statmount_propagate_from(struct kstatmount *s) s->sm.propagate_from = get_dominating_id(m, ¤t->fs->root); } -static int statmount_mnt_root(struct kstatmount *s) +static int statmount_mnt_root(struct kstatmount *s, struct seq_file *seq) { - struct seq_file *seq = &s->seq; - int err = show_path(seq, s->mnt->mnt_root); + int ret; + size_t start = seq->count; - if (!err && !seq_has_overflowed(seq)) { - seq->buf[seq->count] = '\0'; - seq->count = string_unescape_inplace(seq->buf, UNESCAPE_OCTAL); - } - return err; + ret = show_path(seq, s->mnt->mnt_root); + if (ret) + return ret; + + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; + + /* + * Unescape the result. It would be better if supplied string was not + * escaped in the first place, but that's a pretty invasive change. + */ + seq->buf[seq->count] = '\0'; + seq->count = start; + seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL)); + return 0; } -static int statmount_mnt_point(struct kstatmount *s) +static int statmount_mnt_point(struct kstatmount *s, struct seq_file *seq) { struct vfsmount *mnt = s->mnt; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; - int err = seq_path_root(&s->seq, &mnt_path, &s->root, ""); + int err; + err = seq_path_root(seq, &mnt_path, &s->root, ""); return err == SEQ_SKIP ? 
0 : err; } -static int statmount_fs_type(struct kstatmount *s) +static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq) { - struct seq_file *seq = &s->seq; struct super_block *sb = s->mnt->mnt_sb; seq_puts(seq, sb->s_type->name); return 0; } -static int do_statmount(struct kstatmount *s) +static int statmount_string(struct kstatmount *s, u64 flag) { + int ret; + size_t kbufsize; + struct seq_file *seq = &s->seq; struct statmount *sm = &s->sm; - struct mount *m = real_mount(s->mnt); + + switch (flag) { + case STATMOUNT_FS_TYPE: + sm->fs_type = seq->count; + ret = statmount_fs_type(s, seq); + break; + case STATMOUNT_MNT_ROOT: + sm->mnt_root = seq->count; + ret = statmount_mnt_root(s, seq); + break; + case STATMOUNT_MNT_POINT: + sm->mnt_point = seq->count; + ret = statmount_mnt_point(s, seq); + break; + default: + WARN_ON_ONCE(true); + return -EINVAL; + } + + if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize))) + return -EOVERFLOW; + if (kbufsize >= s->bufsize) + return -EOVERFLOW; + + /* signal a retry */ + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; + + if (ret) + return ret; + + seq->buf[seq->count++] = '\0'; + sm->mask |= flag; + return 0; +} + +static int copy_statmount_to_user(struct kstatmount *s) +{ + struct statmount *sm = &s->sm; + struct seq_file *seq = &s->seq; + char __user *str = ((char __user *)s->buf) + sizeof(*sm); size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm)); + + if (seq->count && copy_to_user(str, seq->buf, seq->count)) + return -EFAULT; + + /* Return the number of bytes copied to the buffer */ + sm->size = copysize + seq->count; + if (copy_to_user(s->buf, sm, copysize)) + return -EFAULT; + + return 0; +} + +static int do_statmount(struct kstatmount *s) +{ + struct mount *m = real_mount(s->mnt); int err; /* @@ -4906,19 +4911,47 @@ static int do_statmount(struct kstatmount *s) if (s->mask & STATMOUNT_PROPAGATE_FROM) statmount_propagate_from(s); - statmount_string(s, STATMOUNT_FS_TYPE, statmount_fs_type, &sm->fs_type); - statmount_string(s, STATMOUNT_MNT_ROOT, statmount_mnt_root, &sm->mnt_root); - statmount_string(s, STATMOUNT_MNT_POINT, statmount_mnt_point, &sm->mnt_point); + if (s->mask & STATMOUNT_FS_TYPE) + err = statmount_string(s, STATMOUNT_FS_TYPE); - if (s->err) - return s->err; + if (!err && s->mask & STATMOUNT_MNT_ROOT) + err = statmount_string(s, STATMOUNT_MNT_ROOT); - /* Return the number of bytes copied to the buffer */ - sm->size = copysize + s->pos; + if (!err && s->mask & STATMOUNT_MNT_POINT) + err = statmount_string(s, STATMOUNT_MNT_POINT); - if (copy_to_user(s->buf, sm, copysize)) + if (err) + return err; + + return 0; +} + +static inline bool retry_statmount(const long ret, size_t *seq_size) +{ + if (likely(ret != -EAGAIN)) + return false; + if (unlikely(check_mul_overflow(*seq_size, 2, seq_size))) + return false; + if (unlikely(*seq_size > MAX_RW_COUNT)) + return false; + return true; +} + +static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, + struct statmount __user *buf, size_t bufsize, + size_t seq_size) +{ + if (!access_ok(buf, bufsize)) return -EFAULT; + memset(ks, 0, sizeof(*ks)); + ks->mask = kreq->request_mask; + ks->buf = buf; + ks->bufsize = bufsize; + ks->seq.size = seq_size; + ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT); + if (!ks->seq.buf) + return -ENOMEM; return 0; } @@ -4928,6 +4961,9 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, { struct vfsmount *mnt; struct mnt_id_req kreq; + struct kstatmount ks; + /* We currently support 
retrieval of 3 strings. */ + size_t seq_size = 3 * PATH_MAX; int ret; if (flags) @@ -4936,23 +4972,30 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, if (copy_from_user(&kreq, req, sizeof(kreq))) return -EFAULT; +retry: + ret = prepare_kstatmount(&ks, &kreq, buf, bufsize, seq_size); + if (ret) + return ret; + down_read(&namespace_sem); mnt = lookup_mnt_in_ns(kreq.mnt_id, current->nsproxy->mnt_ns); - ret = -ENOENT; - if (mnt) { - struct kstatmount s = { - .mask = kreq.request_mask, - .buf = buf, - .bufsize = bufsize, - .mnt = mnt, - }; - - get_fs_root(current->fs, &s.root); - ret = do_statmount(&s); - path_put(&s.root); + if (!mnt) { + up_read(&namespace_sem); + kvfree(ks.seq.buf); + return -ENOENT; } + + ks.mnt = mnt; + get_fs_root(current->fs, &ks.root); + ret = do_statmount(&ks); + path_put(&ks.root); up_read(&namespace_sem); + if (!ret) + ret = copy_statmount_to_user(&ks); + kvfree(ks.seq.buf); + if (retry_statmount(ret, &seq_size)) + goto retry; return ret; } -- cgit From b4c2bea8ceaa50cd42a8f73667389d801a3ecf2d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 25 Oct 2023 16:02:03 +0200 Subject: add listmount(2) syscall Add a way to query the children of a particular mount. This is a more flexible way to iterate the mount tree than having to parse /proc/self/mountinfo. Look up the mount by the new 64bit mount ID. If a mount needs to be queried based on path, then statx(2) can be used to first query the mount ID belonging to the path. Return an array of new (64bit) mount IDs. Without privileges, only mounts reachable from the task's root are listed. Folded into this patch are several later improvements. Keeping them separate would make the history pointlessly confusing: * Recursive listing of mounts is the default now (cf. [1]). * Remove explicit LISTMOUNT_UNREACHABLE flag (cf. [1]) and fail if the mount is unreachable from the current root. This also makes permission checking consistent with statmount() (cf. [3]). * Start listing mounts in unique mount ID order (cf. [2]) to allow continuing listmount() from a midpoint. * Allow continuing listmount(). The @request_mask parameter is renamed to @param to be usable by both statmount() and listmount(). If @param is set to a mount ID then listmount() will continue listing mounts from that ID on. This allows listing mounts in multiple listmount invocations without having to resize the buffer. If @param is zero then the listing starts from the beginning (cf. [4]). * Don't return EOVERFLOW, instead return the buffer size, which allows detecting a full buffer as well (cf. [4]).
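A hedged sketch of the resulting userspace loop, again assuming the merged <linux/mount.h> definitions (LSMT_ROOT, MNT_ID_REQ_SIZE_VER0) and the x86_64 syscall number 458: it prints every mount ID visible under the caller's root, resuming from the last ID seen whenever the buffer fills, which is exactly the continuation that @param enables.

#define _GNU_SOURCE
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mount.h>

#ifndef __NR_listmount
#define __NR_listmount 458	/* x86_64; an assumption for older headers */
#endif

#define NR_IDS 32

int main(void)
{
	uint64_t ids[NR_IDS];
	uint64_t last = 0;	/* 0: start from the beginning */

	for (;;) {
		struct mnt_id_req req = {
			.size = MNT_ID_REQ_SIZE_VER0,
			.mnt_id = LSMT_ROOT,	/* everything under the current root */
			.param = last,		/* continue after this ID */
		};
		ssize_t n = syscall(__NR_listmount, &req, ids, NR_IDS, 0);

		if (n < 0) {
			perror("listmount");
			return 1;
		}
		for (ssize_t i = 0; i < n; i++)
			printf("%llu\n", (unsigned long long)ids[i]);
		if (n < NR_IDS)
			break;	/* short result: no more entries */
		last = ids[NR_IDS - 1];	/* resume after the last ID we saw */
	}
	return 0;
}

The monotonic 64-bit IDs make the continuation stable: a mount created mid-iteration always receives an ID larger than any already listed, so it shows up in a later call rather than being skipped or duplicated.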
Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/r/20231025140205.3586473-6-mszeredi@redhat.com Reviewed-by: Ian Kent Link: https://lore.kernel.org/r/20231128160337.29094-2-mszeredi@redhat.com [1] (folded) Link: https://lore.kernel.org/r/20231128160337.29094-3-mszeredi@redhat.com [2] (folded) Link: https://lore.kernel.org/r/20231128160337.29094-4-mszeredi@redhat.com [3] (folded) Link: https://lore.kernel.org/r/20231128160337.29094-5-mszeredi@redhat.com [4] (folded) [Christian Brauner : various smaller fixes] Signed-off-by: Christian Brauner --- fs/namespace.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 84 insertions(+), 2 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index 7f1618ed2aba..873185b8a84b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "pnode.h" #include "internal.h" @@ -1009,7 +1010,7 @@ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct m static inline struct mount *node_to_mount(struct rb_node *node) { - return rb_entry(node, struct mount, mnt_node); + return node ? rb_entry(node, struct mount, mnt_node) : NULL; } static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) @@ -4945,7 +4946,7 @@ static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, return -EFAULT; memset(ks, 0, sizeof(*ks)); - ks->mask = kreq->request_mask; + ks->mask = kreq->param; ks->buf = buf; ks->bufsize = bufsize; ks->seq.size = seq_size; @@ -4999,6 +5000,87 @@ retry: return ret; } +static struct mount *listmnt_next(struct mount *curr) +{ + return node_to_mount(rb_next(&curr->mnt_node)); +} + +static ssize_t do_listmount(struct mount *first, struct path *orig, u64 mnt_id, + u64 __user *buf, size_t bufsize, + const struct path *root) +{ + struct mount *r; + ssize_t ctr; + int err; + + /* + * Don't trigger audit denials. We just want to determine what + * mounts to show users. 
+ */ + if (!is_path_reachable(real_mount(orig->mnt), orig->dentry, root) && + !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + err = security_sb_statfs(orig->dentry); + if (err) + return err; + + for (ctr = 0, r = first; r && ctr < bufsize; r = listmnt_next(r)) { + if (r->mnt_id_unique == mnt_id) + continue; + if (!is_path_reachable(r, r->mnt.mnt_root, orig)) + continue; + ctr = array_index_nospec(ctr, bufsize); + if (put_user(r->mnt_id_unique, buf + ctr)) + return -EFAULT; + if (check_add_overflow(ctr, 1, &ctr)) + return -ERANGE; + } + return ctr; +} + +SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, + u64 __user *, buf, size_t, bufsize, unsigned int, flags) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + struct mnt_id_req kreq; + struct mount *first; + struct path root, orig; + u64 mnt_id, last_mnt_id; + ssize_t ret; + + if (flags) + return -EINVAL; + + if (copy_from_user(&kreq, req, sizeof(kreq))) + return -EFAULT; + mnt_id = kreq.mnt_id; + last_mnt_id = kreq.param; + + down_read(&namespace_sem); + get_fs_root(current->fs, &root); + if (mnt_id == LSMT_ROOT) { + orig = root; + } else { + ret = -ENOENT; + orig.mnt = lookup_mnt_in_ns(mnt_id, ns); + if (!orig.mnt) + goto err; + orig.dentry = orig.mnt->mnt_root; + } + if (!last_mnt_id) + first = node_to_mount(rb_first(&ns->mounts)); + else + first = mnt_find_id_at(ns, last_mnt_id + 1); + + ret = do_listmount(first, &orig, mnt_id, buf, bufsize, &root); +err: + path_put(&root); + up_read(&namespace_sem); + return ret; +} + + static void __init init_mount_tree(void) { struct vfsmount *mnt; -- cgit From 35e27a5744131996061e6e323f1bcb4c827ae867 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 29 Nov 2023 12:27:15 +0100 Subject: fs: keep struct mnt_id_req extensible Make it extensible so that we have the liberty to reuse it in future mount-id based apis. Treat zero size as the first published struct. 
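The extensibility rule adopted here is the usual copy_struct_from_user() convention. As a rough user-space illustration (plain C, not kernel code; the real helper also copes with faulting user memory), the version negotiation reduces to:

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

/* Illustrative analogue of the copy_struct_from_user() rule that makes
 * struct mnt_id_req extensible across kernel versions. */
static bool tail_is_zero(const unsigned char *p, size_t len)
{
	while (len--)
		if (*p++)
			return false;
	return true;
}

static int copy_versioned(void *kdst, size_t ksize, const void *usrc, size_t usize)
{
	memset(kdst, 0, ksize);	/* fields the caller doesn't know about read as 0 */
	if (usize > ksize) {
		/* newer caller: accept only if the extra fields are unused */
		if (!tail_is_zero((const unsigned char *)usrc + ksize, usize - ksize))
			return -1;	/* the kernel returns -E2BIG here */
		usize = ksize;
	}
	memcpy(kdst, usrc, usize);
	return 0;
}

An old binary passing a short struct sees the new fields zero-filled, i.e. "feature absent"; a new binary on an old kernel is accepted only if every byte the kernel does not understand is zero, and otherwise fails with -E2BIG instead of having part of its request silently ignored.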
Signed-off-by: Christian Brauner --- fs/namespace.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index 873185b8a84b..918e8f89ce35 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4956,6 +4956,30 @@ static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, return 0; } +static int copy_mnt_id_req(const struct mnt_id_req __user *req, + struct mnt_id_req *kreq) +{ + int ret; + size_t usize; + + BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER0); + + ret = get_user(usize, &req->size); + if (ret) + return -EFAULT; + if (unlikely(usize > PAGE_SIZE)) + return -E2BIG; + if (unlikely(usize < MNT_ID_REQ_SIZE_VER0)) + return -EINVAL; + memset(kreq, 0, sizeof(*kreq)); + ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize); + if (ret) + return ret; + if (kreq->spare != 0) + return -EINVAL; + return 0; +} + SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, struct statmount __user *, buf, size_t, bufsize, unsigned int, flags) @@ -4970,8 +4994,9 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, if (flags) return -EINVAL; - if (copy_from_user(&kreq, req, sizeof(kreq))) - return -EFAULT; + ret = copy_mnt_id_req(req, &kreq); + if (ret) + return ret; retry: ret = prepare_kstatmount(&ks, &kreq, buf, bufsize, seq_size); @@ -5052,8 +5077,9 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, if (flags) return -EINVAL; - if (copy_from_user(&kreq, req, sizeof(kreq))) - return -EFAULT; + ret = copy_mnt_id_req(req, &kreq); + if (ret) + return ret; mnt_id = kreq.mnt_id; last_mnt_id = kreq.param; -- cgit
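Closing note: both new syscalls are keyed by the 64-bit mount ID, and the commit messages above point to statx(2) as the way to obtain it for a path. A minimal sketch, assuming the STATX_MNT_ID_UNIQUE mask (0x00004000) that was added alongside this series and a glibc recent enough to provide a statx() wrapper:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

#ifndef STATX_MNT_ID_UNIQUE
#define STATX_MNT_ID_UNIQUE 0x00004000U	/* added alongside this series */
#endif

int main(int argc, char *argv[])
{
	struct statx sx;

	if (argc < 2)
		return 1;
	if (statx(AT_FDCWD, argv[1], 0, STATX_MNT_ID_UNIQUE, &sx) != 0) {
		perror("statx");
		return 1;
	}
	if (sx.stx_mask & STATX_MNT_ID_UNIQUE)
		printf("unique mount id: %llu\n",
		       (unsigned long long)sx.stx_mnt_id);
	return 0;
}

Because the unique counter starts at 2^32, any value at or above 2^32 also tells the caller it is holding a new-style ID rather than a legacy, reusable mnt_id.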