diff options
Diffstat (limited to 'fs/namespace.c')
-rw-r--r-- | fs/namespace.c | 822 |
1 files changed, 357 insertions, 465 deletions
diff --git a/fs/namespace.c b/fs/namespace.c index e13d9ab4f564..ddfd4457d338 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -79,6 +79,7 @@ static struct kmem_cache *mnt_cache __ro_after_init; static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ +static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */ static DEFINE_SEQLOCK(mnt_ns_tree_lock); #ifdef CONFIG_FSNOTIFY @@ -380,10 +381,9 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_list); INIT_LIST_HEAD(&mnt->mnt_expire); INIT_LIST_HEAD(&mnt->mnt_share); - INIT_LIST_HEAD(&mnt->mnt_slave_list); - INIT_LIST_HEAD(&mnt->mnt_slave); + INIT_HLIST_HEAD(&mnt->mnt_slave_list); + INIT_HLIST_NODE(&mnt->mnt_slave); INIT_HLIST_NODE(&mnt->mnt_mp_list); - INIT_LIST_HEAD(&mnt->mnt_umounting); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); RB_CLEAR_NODE(&mnt->mnt_node); mnt->mnt.mnt_idmap = &nop_mnt_idmap; @@ -894,7 +894,7 @@ struct vfsmount *lookup_mnt(const struct path *path) * namespace not just a mount that happens to have some specified * parent mount. */ -bool __is_local_mountpoint(struct dentry *dentry) +bool __is_local_mountpoint(const struct dentry *dentry) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct mount *mnt, *n; @@ -911,42 +911,48 @@ bool __is_local_mountpoint(struct dentry *dentry) return is_covered; } -static struct mountpoint *lookup_mountpoint(struct dentry *dentry) +struct pinned_mountpoint { + struct hlist_node node; + struct mountpoint *mp; +}; + +static bool lookup_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m) { struct hlist_head *chain = mp_hash(dentry); struct mountpoint *mp; hlist_for_each_entry(mp, chain, m_hash) { if (mp->m_dentry == dentry) { - mp->m_count++; - return mp; + hlist_add_head(&m->node, &mp->m_list); + m->mp = mp; + return true; } } - return NULL; + return false; } -static struct mountpoint *get_mountpoint(struct dentry *dentry) +static int get_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m) { - struct mountpoint *mp, *new = NULL; + struct mountpoint *mp __free(kfree) = NULL; + bool found; int ret; if (d_mountpoint(dentry)) { /* might be worth a WARN_ON() */ if (d_unlinked(dentry)) - return ERR_PTR(-ENOENT); + return -ENOENT; mountpoint: read_seqlock_excl(&mount_lock); - mp = lookup_mountpoint(dentry); + found = lookup_mountpoint(dentry, m); read_sequnlock_excl(&mount_lock); - if (mp) - goto done; + if (found) + return 0; } - if (!new) - new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); - if (!new) - return ERR_PTR(-ENOMEM); - + if (!mp) + mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); + if (!mp) + return -ENOMEM; /* Exactly one processes may set d_mounted */ ret = d_set_mounted(dentry); @@ -956,34 +962,28 @@ mountpoint: goto mountpoint; /* The dentry is not available as a mountpoint? */ - mp = ERR_PTR(ret); if (ret) - goto done; + return ret; /* Add the new mountpoint to the hash table */ read_seqlock_excl(&mount_lock); - new->m_dentry = dget(dentry); - new->m_count = 1; - hlist_add_head(&new->m_hash, mp_hash(dentry)); - INIT_HLIST_HEAD(&new->m_list); + mp->m_dentry = dget(dentry); + hlist_add_head(&mp->m_hash, mp_hash(dentry)); + INIT_HLIST_HEAD(&mp->m_list); + hlist_add_head(&m->node, &mp->m_list); + m->mp = no_free_ptr(mp); read_sequnlock_excl(&mount_lock); - - mp = new; - new = NULL; -done: - kfree(new); - return mp; + return 0; } /* * vfsmount lock must be held. Additionally, the caller is responsible * for serializing calls for given disposal list. */ -static void __put_mountpoint(struct mountpoint *mp, struct list_head *list) +static void maybe_free_mountpoint(struct mountpoint *mp, struct list_head *list) { - if (!--mp->m_count) { + if (hlist_empty(&mp->m_list)) { struct dentry *dentry = mp->m_dentry; - BUG_ON(!hlist_empty(&mp->m_list)); spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_MOUNTED; spin_unlock(&dentry->d_lock); @@ -993,10 +993,15 @@ static void __put_mountpoint(struct mountpoint *mp, struct list_head *list) } } -/* called with namespace_lock and vfsmount lock */ -static void put_mountpoint(struct mountpoint *mp) +/* + * locks: mount_lock [read_seqlock_excl], namespace_sem [excl] + */ +static void unpin_mountpoint(struct pinned_mountpoint *m) { - __put_mountpoint(mp, &ex_mountpoints); + if (m->mp) { + hlist_del(&m->node); + maybe_free_mountpoint(m->mp, &ex_mountpoints); + } } static inline int check_mnt(struct mount *mnt) @@ -1038,11 +1043,14 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns) } /* - * vfsmount lock must be held for write + * locks: mount_lock[write_seqlock] */ -static struct mountpoint *unhash_mnt(struct mount *mnt) +static void __umount_mnt(struct mount *mnt, struct list_head *shrink_list) { struct mountpoint *mp; + struct mount *parent = mnt->mnt_parent; + if (unlikely(parent->overmount == mnt)) + parent->overmount = NULL; mnt->mnt_parent = mnt; mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); @@ -1050,15 +1058,15 @@ static struct mountpoint *unhash_mnt(struct mount *mnt) hlist_del_init(&mnt->mnt_mp_list); mp = mnt->mnt_mp; mnt->mnt_mp = NULL; - return mp; + maybe_free_mountpoint(mp, shrink_list); } /* - * vfsmount lock must be held for write + * locks: mount_lock[write_seqlock], namespace_sem[excl] (for ex_mountpoints) */ static void umount_mnt(struct mount *mnt) { - put_mountpoint(unhash_mnt(mnt)); + __umount_mnt(mnt, &ex_mountpoints); } /* @@ -1068,43 +1076,17 @@ void mnt_set_mountpoint(struct mount *mnt, struct mountpoint *mp, struct mount *child_mnt) { - mp->m_count++; - mnt_add_count(mnt, 1); /* essentially, that's mntget */ child_mnt->mnt_mountpoint = mp->m_dentry; child_mnt->mnt_parent = mnt; child_mnt->mnt_mp = mp; hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); } -/** - * mnt_set_mountpoint_beneath - mount a mount beneath another one - * - * @new_parent: the source mount - * @top_mnt: the mount beneath which @new_parent is mounted - * @new_mp: the new mountpoint of @top_mnt on @new_parent - * - * Remove @top_mnt from its current mountpoint @top_mnt->mnt_mp and - * parent @top_mnt->mnt_parent and mount it on top of @new_parent at - * @new_mp. And mount @new_parent on the old parent and old - * mountpoint of @top_mnt. - * - * Context: This function expects namespace_lock() and lock_mount_hash() - * to have been acquired in that order. - */ -static void mnt_set_mountpoint_beneath(struct mount *new_parent, - struct mount *top_mnt, - struct mountpoint *new_mp) -{ - struct mount *old_top_parent = top_mnt->mnt_parent; - struct mountpoint *old_top_mp = top_mnt->mnt_mp; - - mnt_set_mountpoint(old_top_parent, old_top_mp, new_parent); - mnt_change_mountpoint(new_parent, new_mp, top_mnt); -} - - -static void __attach_mnt(struct mount *mnt, struct mount *parent) +static void make_visible(struct mount *mnt) { + struct mount *parent = mnt->mnt_parent; + if (unlikely(mnt->mnt_mountpoint == parent->mnt.mnt_root)) + parent->overmount = mnt; hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mnt->mnt_mountpoint)); list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); @@ -1116,51 +1098,34 @@ static void __attach_mnt(struct mount *mnt, struct mount *parent) * @parent: the parent * @mnt: the new mount * @mp: the new mountpoint - * @beneath: whether to mount @mnt beneath or on top of @parent * - * If @beneath is false, mount @mnt at @mp on @parent. Then attach @mnt + * Mount @mnt at @mp on @parent. Then attach @mnt * to @parent's child mount list and to @mount_hashtable. * - * If @beneath is true, remove @mnt from its current parent and - * mountpoint and mount it on @mp on @parent, and mount @parent on the - * old parent and old mountpoint of @mnt. Finally, attach @parent to - * @mnt_hashtable and @parent->mnt_parent->mnt_mounts. - * - * Note, when __attach_mnt() is called @mnt->mnt_parent already points + * Note, when make_visible() is called @mnt->mnt_parent already points * to the correct parent. * * Context: This function expects namespace_lock() and lock_mount_hash() * to have been acquired in that order. */ static void attach_mnt(struct mount *mnt, struct mount *parent, - struct mountpoint *mp, bool beneath) + struct mountpoint *mp) { - if (beneath) - mnt_set_mountpoint_beneath(mnt, parent, mp); - else - mnt_set_mountpoint(parent, mp, mnt); - /* - * Note, @mnt->mnt_parent has to be used. If @mnt was mounted - * beneath @parent then @mnt will need to be attached to - * @parent's old parent, not @parent. IOW, @mnt->mnt_parent - * isn't the same mount as @parent. - */ - __attach_mnt(mnt, mnt->mnt_parent); + mnt_set_mountpoint(parent, mp, mnt); + make_visible(mnt); } void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt) { struct mountpoint *old_mp = mnt->mnt_mp; - struct mount *old_parent = mnt->mnt_parent; list_del_init(&mnt->mnt_child); hlist_del_init(&mnt->mnt_mp_list); hlist_del_init_rcu(&mnt->mnt_hash); - attach_mnt(mnt, parent, mp, false); + attach_mnt(mnt, parent, mp); - put_mountpoint(old_mp); - mnt_add_count(old_parent, -1); + maybe_free_mountpoint(old_mp, &ex_mountpoints); } static inline struct mount *node_to_mount(struct rb_node *node) @@ -1197,32 +1162,6 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) mnt_notify_add(mnt); } -/* - * vfsmount lock must be held for write - */ -static void commit_tree(struct mount *mnt) -{ - struct mount *parent = mnt->mnt_parent; - struct mount *m; - LIST_HEAD(head); - struct mnt_namespace *n = parent->mnt_ns; - - BUG_ON(parent == mnt); - - list_add_tail(&head, &mnt->mnt_list); - while (!list_empty(&head)) { - m = list_first_entry(&head, typeof(*m), mnt_list); - list_del(&m->mnt_list); - - mnt_add_to_ns(n, m); - } - n->nr_mounts += n->pending_mounts; - n->pending_mounts = 0; - - __attach_mnt(mnt, parent); - touch_mnt_namespace(n); -} - static struct mount *next_mnt(struct mount *p, struct mount *root) { struct list_head *next = p->mnt_mounts.next; @@ -1249,6 +1188,27 @@ static struct mount *skip_mnt_tree(struct mount *p) return p; } +/* + * vfsmount lock must be held for write + */ +static void commit_tree(struct mount *mnt) +{ + struct mnt_namespace *n = mnt->mnt_parent->mnt_ns; + + if (!mnt_ns_attached(mnt)) { + for (struct mount *m = mnt; m; m = next_mnt(m, mnt)) + if (unlikely(mnt_ns_attached(m))) + m = skip_mnt_tree(m); + else + mnt_add_to_ns(n, m); + n->nr_mounts += n->pending_mounts; + n->pending_mounts = 0; + } + + make_visible(mnt); + touch_mnt_namespace(n); +} + /** * vfs_create_mount - Create a mount for a configured superblock * @fc: The configuration context with the superblock attached @@ -1296,6 +1256,15 @@ struct vfsmount *fc_mount(struct fs_context *fc) } EXPORT_SYMBOL(fc_mount); +struct vfsmount *fc_mount_longterm(struct fs_context *fc) +{ + struct vfsmount *mnt = fc_mount(fc); + if (!IS_ERR(mnt)) + real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; + return mnt; +} +EXPORT_SYMBOL(fc_mount_longterm); + struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) @@ -1337,7 +1306,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, if (!mnt) return ERR_PTR(-ENOMEM); - if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE)) + mnt->mnt.mnt_flags = READ_ONCE(old->mnt.mnt_flags) & + ~MNT_INTERNAL_FLAGS; + + if (flag & (CL_SLAVE | CL_PRIVATE)) mnt->mnt_group_id = 0; /* not a peer of original */ else mnt->mnt_group_id = old->mnt_group_id; @@ -1348,8 +1320,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, goto out_free; } - mnt->mnt.mnt_flags = old->mnt.mnt_flags; - mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); + if (mnt->mnt_group_id) + set_mnt_shared(mnt); atomic_inc(&sb->s_active); mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); @@ -1362,30 +1334,19 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, list_add_tail(&mnt->mnt_instance, &sb->s_mounts); unlock_mount_hash(); - if ((flag & CL_SLAVE) || - ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) { - list_add(&mnt->mnt_slave, &old->mnt_slave_list); + if (flag & CL_PRIVATE) // we are done with it + return mnt; + + if (peers(mnt, old)) + list_add(&mnt->mnt_share, &old->mnt_share); + + if ((flag & CL_SLAVE) && old->mnt_group_id) { + hlist_add_head(&mnt->mnt_slave, &old->mnt_slave_list); mnt->mnt_master = old; - CLEAR_MNT_SHARED(mnt); - } else if (!(flag & CL_PRIVATE)) { - if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) - list_add(&mnt->mnt_share, &old->mnt_share); - if (IS_MNT_SLAVE(old)) - list_add(&mnt->mnt_slave, &old->mnt_slave); + } else if (IS_MNT_SLAVE(old)) { + hlist_add_behind(&mnt->mnt_slave, &old->mnt_slave); mnt->mnt_master = old->mnt_master; - } else { - CLEAR_MNT_SHARED(mnt); } - if (flag & CL_MAKE_SHARED) - set_mnt_shared(mnt); - - /* stick the duplicate mount on the same expiry list - * as the original if that was on one */ - if (flag & CL_EXPIRE) { - if (!list_empty(&old->mnt_expire)) - list_add(&mnt->mnt_expire, &old->mnt_expire); - } - return mnt; out_free: @@ -1478,11 +1439,13 @@ static void mntput_no_expire(struct mount *mnt) rcu_read_unlock(); list_del(&mnt->mnt_instance); + if (unlikely(!list_empty(&mnt->mnt_expire))) + list_del(&mnt->mnt_expire); if (unlikely(!list_empty(&mnt->mnt_mounts))) { struct mount *p, *tmp; list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) { - __put_mountpoint(unhash_mnt(p), &list); + __umount_mnt(p, &list); hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children); } } @@ -1679,23 +1642,19 @@ const struct seq_operations mounts_op = { int may_umount_tree(struct vfsmount *m) { struct mount *mnt = real_mount(m); - int actual_refs = 0; - int minimum_refs = 0; - struct mount *p; - BUG_ON(!m); + bool busy = false; /* write lock needed for mnt_get_count */ lock_mount_hash(); - for (p = mnt; p; p = next_mnt(p, mnt)) { - actual_refs += mnt_get_count(p); - minimum_refs += 2; + for (struct mount *p = mnt; p; p = next_mnt(p, mnt)) { + if (mnt_get_count(p) > (p == mnt ? 2 : 1)) { + busy = true; + break; + } } unlock_mount_hash(); - if (actual_refs > minimum_refs) - return 0; - - return 1; + return !busy; } EXPORT_SYMBOL(may_umount_tree); @@ -1771,15 +1730,18 @@ static bool need_notify_mnt_list(void) } #endif +static void free_mnt_ns(struct mnt_namespace *); static void namespace_unlock(void) { struct hlist_head head; struct hlist_node *p; struct mount *m; + struct mnt_namespace *ns = emptied_ns; LIST_HEAD(list); hlist_move_list(&unmounted, &head); list_splice_init(&ex_mountpoints, &list); + emptied_ns = NULL; if (need_notify_mnt_list()) { /* @@ -1793,6 +1755,11 @@ static void namespace_unlock(void) } else { up_write(&namespace_sem); } + if (unlikely(ns)) { + /* Make sure we notice when we leak mounts. */ + VFS_WARN_ON_ONCE(!mnt_ns_empty(ns)); + free_mnt_ns(ns); + } shrink_dentry_list(&list); @@ -1865,9 +1832,8 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) for (p = mnt; p; p = next_mnt(p, mnt)) { p->mnt.mnt_flags |= MNT_UMOUNT; if (mnt_ns_attached(p)) - move_from_ns(p, &tmp_list); - else - list_move(&p->mnt_list, &tmp_list); + move_from_ns(p); + list_add_tail(&p->mnt_list, &tmp_list); } /* Hide the mounts from mnt_mounts */ @@ -1896,7 +1862,6 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) disconnect = disconnect_mount(p, how); if (mnt_has_parent(p)) { - mnt_add_count(p->mnt_parent, -1); if (!disconnect) { /* Don't forget about p */ list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts); @@ -1973,7 +1938,7 @@ static int do_umount(struct mount *mnt, int flags) * all race cases, but it's a slowpath. */ lock_mount_hash(); - if (mnt_get_count(mnt) != 2) { + if (!list_empty(&mnt->mnt_mounts) || mnt_get_count(mnt) != 2) { unlock_mount_hash(); return -EBUSY; } @@ -2019,23 +1984,27 @@ static int do_umount(struct mount *mnt, int flags) namespace_lock(); lock_mount_hash(); - /* Recheck MNT_LOCKED with the locks held */ + /* Repeat the earlier racy checks, now that we are holding the locks */ retval = -EINVAL; + if (!check_mnt(mnt)) + goto out; + if (mnt->mnt.mnt_flags & MNT_LOCKED) goto out; + if (!mnt_has_parent(mnt)) /* not the absolute root */ + goto out; + event++; if (flags & MNT_DETACH) { - if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list)) - umount_tree(mnt, UMOUNT_PROPAGATE); + umount_tree(mnt, UMOUNT_PROPAGATE); retval = 0; } else { smp_mb(); // paired with __legitimize_mnt() shrink_submounts(mnt); retval = -EBUSY; if (!propagate_mount_busy(mnt, 2)) { - if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list)) - umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); + umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); retval = 0; } } @@ -2053,29 +2022,28 @@ out: * detach_mounts allows lazily unmounting those mounts instead of * leaking them. * - * The caller may hold dentry->d_inode->i_mutex. + * The caller may hold dentry->d_inode->i_rwsem. */ void __detach_mounts(struct dentry *dentry) { - struct mountpoint *mp; + struct pinned_mountpoint mp = {}; struct mount *mnt; namespace_lock(); lock_mount_hash(); - mp = lookup_mountpoint(dentry); - if (!mp) + if (!lookup_mountpoint(dentry, &mp)) goto out_unlock; event++; - while (!hlist_empty(&mp->m_list)) { - mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); + while (mp.node.next) { + mnt = hlist_entry(mp.node.next, struct mount, mnt_mp_list); if (mnt->mnt.mnt_flags & MNT_UMOUNT) { umount_mnt(mnt); hlist_add_head(&mnt->mnt_umount, &unmounted); } else umount_tree(mnt, UMOUNT_CONNECTED); } - put_mountpoint(mp); + unpin_mountpoint(&mp); out_unlock: unlock_mount_hash(); namespace_unlock(); @@ -2259,7 +2227,6 @@ struct mount *copy_tree(struct mount *src_root, struct dentry *dentry, return dst_mnt; src_parent = src_root; - dst_mnt->mnt_mountpoint = src_root->mnt_mountpoint; list_for_each_entry(src_root_child, &src_root->mnt_mounts, mnt_child) { if (!is_subdir(src_root_child->mnt_mountpoint, dentry)) @@ -2294,8 +2261,16 @@ struct mount *copy_tree(struct mount *src_root, struct dentry *dentry, if (IS_ERR(dst_mnt)) goto out; lock_mount_hash(); - list_add_tail(&dst_mnt->mnt_list, &res->mnt_list); - attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp, false); + if (src_mnt->mnt.mnt_flags & MNT_LOCKED) + dst_mnt->mnt.mnt_flags |= MNT_LOCKED; + if (unlikely(flag & CL_EXPIRE)) { + /* stick the duplicate mount on the same expiry + * list as the original if that was on one */ + if (!list_empty(&src_mnt->mnt_expire)) + list_add(&dst_mnt->mnt_expire, + &src_mnt->mnt_expire); + } + attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp); unlock_mount_hash(); } } @@ -2310,104 +2285,94 @@ out: return dst_mnt; } -/* Caller should check returned pointer for errors */ - -struct vfsmount *collect_mounts(const struct path *path) +static inline bool extend_array(struct path **res, struct path **to_free, + unsigned n, unsigned *count, unsigned new_count) { - struct mount *tree; - namespace_lock(); - if (!check_mnt(real_mount(path->mnt))) - tree = ERR_PTR(-EINVAL); - else - tree = copy_tree(real_mount(path->mnt), path->dentry, - CL_COPY_ALL | CL_PRIVATE); - namespace_unlock(); - if (IS_ERR(tree)) - return ERR_CAST(tree); - return &tree->mnt; -} + struct path *p; -static void free_mnt_ns(struct mnt_namespace *); -static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool); + if (likely(n < *count)) + return true; + p = kmalloc_array(new_count, sizeof(struct path), GFP_KERNEL); + if (p && *count) + memcpy(p, *res, *count * sizeof(struct path)); + *count = new_count; + kfree(*to_free); + *to_free = *res = p; + return p; +} -static inline bool must_dissolve(struct mnt_namespace *mnt_ns) +struct path *collect_paths(const struct path *path, + struct path *prealloc, unsigned count) { - /* - * This mount belonged to an anonymous mount namespace - * but was moved to a non-anonymous mount namespace and - * then unmounted. - */ - if (unlikely(!mnt_ns)) - return false; + struct mount *root = real_mount(path->mnt); + struct mount *child; + struct path *res = prealloc, *to_free = NULL; + unsigned n = 0; - /* - * This mount belongs to a non-anonymous mount namespace - * and we know that such a mount can never transition to - * an anonymous mount namespace again. - */ - if (!is_anon_ns(mnt_ns)) { - /* - * A detached mount either belongs to an anonymous mount - * namespace or a non-anonymous mount namespace. It - * should never belong to something purely internal. - */ - VFS_WARN_ON_ONCE(mnt_ns == MNT_NS_INTERNAL); - return false; + guard(rwsem_read)(&namespace_sem); + + if (!check_mnt(root)) + return ERR_PTR(-EINVAL); + if (!extend_array(&res, &to_free, 0, &count, 32)) + return ERR_PTR(-ENOMEM); + res[n++] = *path; + list_for_each_entry(child, &root->mnt_mounts, mnt_child) { + if (!is_subdir(child->mnt_mountpoint, path->dentry)) + continue; + for (struct mount *m = child; m; m = next_mnt(m, child)) { + if (!extend_array(&res, &to_free, n, &count, 2 * count)) + return ERR_PTR(-ENOMEM); + res[n].mnt = &m->mnt; + res[n].dentry = m->mnt.mnt_root; + n++; + } } + if (!extend_array(&res, &to_free, n, &count, count + 1)) + return ERR_PTR(-ENOMEM); + memset(res + n, 0, (count - n) * sizeof(struct path)); + for (struct path *p = res; p->mnt; p++) + path_get(p); + return res; +} - return true; +void drop_collected_paths(struct path *paths, struct path *prealloc) +{ + for (struct path *p = paths; p->mnt; p++) + path_put(p); + if (paths != prealloc) + kfree(paths); } +static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool); + void dissolve_on_fput(struct vfsmount *mnt) { - struct mnt_namespace *ns; struct mount *m = real_mount(mnt); + /* + * m used to be the root of anon namespace; if it still is one, + * we need to dissolve the mount tree and free that namespace. + * Let's try to avoid taking namespace_sem if we can determine + * that there's nothing to do without it - rcu_read_lock() is + * enough to make anon_ns_root() memory-safe and once m has + * left its namespace, it's no longer our concern, since it will + * never become a root of anon ns again. + */ + scoped_guard(rcu) { - if (!must_dissolve(READ_ONCE(m->mnt_ns))) + if (!anon_ns_root(m)) return; } scoped_guard(namespace_lock, &namespace_sem) { - ns = m->mnt_ns; - if (!must_dissolve(ns)) - return; - - /* - * After must_dissolve() we know that this is a detached - * mount in an anonymous mount namespace. - * - * Now when mnt_has_parent() reports that this mount - * tree has a parent, we know that this anonymous mount - * tree has been moved to another anonymous mount - * namespace. - * - * So when closing this file we cannot unmount the mount - * tree. This will be done when the file referring to - * the root of the anonymous mount namespace will be - * closed (It could already be closed but it would sync - * on @namespace_sem and wait for us to finish.). - */ - if (mnt_has_parent(m)) + if (!anon_ns_root(m)) return; + emptied_ns = m->mnt_ns; lock_mount_hash(); umount_tree(m, UMOUNT_CONNECTED); unlock_mount_hash(); } - - /* Make sure we notice when we leak mounts. */ - VFS_WARN_ON_ONCE(!mnt_ns_empty(ns)); - free_mnt_ns(ns); -} - -void drop_collected_mounts(struct vfsmount *mnt) -{ - namespace_lock(); - lock_mount_hash(); - umount_tree(real_mount(mnt), 0); - unlock_mount_hash(); - namespace_unlock(); } static bool __has_locked_children(struct mount *mnt, struct dentry *dentry) @@ -2486,9 +2451,7 @@ struct vfsmount *clone_private_mount(const struct path *path) * loops get created. */ if (!check_mnt(old_mnt)) { - if (!is_mounted(&old_mnt->mnt) || - !is_anon_ns(old_mnt->mnt_ns) || - mnt_has_parent(old_mnt)) + if (!anon_ns_root(old_mnt)) return ERR_PTR(-EINVAL); if (!check_for_nsfs_mounts(old_mnt)) @@ -2511,21 +2474,6 @@ struct vfsmount *clone_private_mount(const struct path *path) } EXPORT_SYMBOL_GPL(clone_private_mount); -int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, - struct vfsmount *root) -{ - struct mount *mnt; - int res = f(root, arg); - if (res) - return res; - list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) { - res = f(&mnt->mnt, arg); - if (res) - return res; - } - return 0; -} - static void lock_mnt_tree(struct mount *mnt) { struct mount *p; @@ -2547,7 +2495,7 @@ static void lock_mnt_tree(struct mount *mnt) if (flags & MNT_NOEXEC) flags |= MNT_LOCK_NOEXEC; /* Don't allow unprivileged users to reveal what is under a mount */ - if (list_empty(&p->mnt_expire)) + if (list_empty(&p->mnt_expire) && p != mnt) flags |= MNT_LOCKED; p->mnt.mnt_flags = flags; } @@ -2568,7 +2516,7 @@ static int invent_group_ids(struct mount *mnt, bool recurse) struct mount *p; for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { - if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { + if (!p->mnt_group_id) { int err = mnt_alloc_group_id(p); if (err) { cleanup_group_ids(mnt, p); @@ -2604,17 +2552,15 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt) } enum mnt_tree_flags_t { - MNT_TREE_MOVE = BIT(0), - MNT_TREE_BENEATH = BIT(1), - MNT_TREE_PROPAGATION = BIT(2), + MNT_TREE_BENEATH = BIT(0), + MNT_TREE_PROPAGATION = BIT(1), }; /** * attach_recursive_mnt - attach a source mount tree * @source_mnt: mount tree to be attached - * @top_mnt: mount that @source_mnt will be mounted on or mounted beneath + * @dest_mnt: mount that @source_mnt will be mounted on * @dest_mp: the mountpoint @source_mnt will be mounted at - * @flags: modify how @source_mnt is supposed to be attached * * NOTE: in the table below explains the semantics when a source mount * of a given type is attached to a destination mount of a given type. @@ -2677,26 +2623,31 @@ enum mnt_tree_flags_t { * Otherwise a negative error code is returned. */ static int attach_recursive_mnt(struct mount *source_mnt, - struct mount *top_mnt, - struct mountpoint *dest_mp, - enum mnt_tree_flags_t flags) + struct mount *dest_mnt, + struct mountpoint *dest_mp) { struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; HLIST_HEAD(tree_list); - struct mnt_namespace *ns = top_mnt->mnt_ns; - struct mountpoint *smp; - struct mount *child, *dest_mnt, *p; + struct mnt_namespace *ns = dest_mnt->mnt_ns; + struct pinned_mountpoint root = {}; + struct mountpoint *shorter = NULL; + struct mount *child, *p; + struct mount *top; struct hlist_node *n; int err = 0; - bool moving = flags & MNT_TREE_MOVE, beneath = flags & MNT_TREE_BENEATH; + bool moving = mnt_has_parent(source_mnt); /* * Preallocate a mountpoint in case the new mounts need to be * mounted beneath mounts on the same mountpoint. */ - smp = get_mountpoint(source_mnt->mnt.mnt_root); - if (IS_ERR(smp)) - return PTR_ERR(smp); + for (top = source_mnt; unlikely(top->overmount); top = top->overmount) { + if (!shorter && is_mnt_ns_file(top->mnt.mnt_root)) + shorter = top->mnt_mp; + } + err = get_mountpoint(top->mnt.mnt_root, &root); + if (err) + return err; /* Is there space to add these mounts to the mount namespace? */ if (!moving) { @@ -2705,11 +2656,6 @@ static int attach_recursive_mnt(struct mount *source_mnt, goto out; } - if (beneath) - dest_mnt = top_mnt->mnt_parent; - else - dest_mnt = top_mnt; - if (IS_MNT_SHARED(dest_mnt)) { err = invent_group_ids(source_mnt, true); if (err) @@ -2726,42 +2672,50 @@ static int attach_recursive_mnt(struct mount *source_mnt, } if (moving) { - if (beneath) - dest_mp = smp; - unhash_mnt(source_mnt); - attach_mnt(source_mnt, top_mnt, dest_mp, beneath); + umount_mnt(source_mnt); mnt_notify_add(source_mnt); - touch_mnt_namespace(source_mnt->mnt_ns); + /* if the mount is moved, it should no longer be expired + * automatically */ + list_del_init(&source_mnt->mnt_expire); } else { if (source_mnt->mnt_ns) { - LIST_HEAD(head); - /* move from anon - the caller will destroy */ + emptied_ns = source_mnt->mnt_ns; for (p = source_mnt; p; p = next_mnt(p, source_mnt)) - move_from_ns(p, &head); - list_del_init(&head); + move_from_ns(p); } - if (beneath) - mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp); - else - mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); - commit_tree(source_mnt); } + mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); + /* + * Now the original copy is in the same state as the secondaries - + * its root attached to mountpoint, but not hashed and all mounts + * in it are either in our namespace or in no namespace at all. + * Add the original to the list of copies and deal with the + * rest of work for all of them uniformly. + */ + hlist_add_head(&source_mnt->mnt_hash, &tree_list); + hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { struct mount *q; hlist_del_init(&child->mnt_hash); - q = __lookup_mnt(&child->mnt_parent->mnt, - child->mnt_mountpoint); - if (q) - mnt_change_mountpoint(child, smp, q); /* Notice when we are propagating across user namespaces */ if (child->mnt_parent->mnt_ns->user_ns != user_ns) lock_mnt_tree(child); - child->mnt.mnt_flags &= ~MNT_LOCKED; + q = __lookup_mnt(&child->mnt_parent->mnt, + child->mnt_mountpoint); + if (q) { + struct mountpoint *mp = root.mp; + struct mount *r = child; + while (unlikely(r->overmount)) + r = r->overmount; + if (unlikely(shorter) && child != source_mnt) + mp = shorter; + mnt_change_mountpoint(r, mp, q); + } commit_tree(child); } - put_mountpoint(smp); + unpin_mountpoint(&root); unlock_mount_hash(); return 0; @@ -2778,7 +2732,7 @@ static int attach_recursive_mnt(struct mount *source_mnt, ns->pending_mounts = 0; read_seqlock_excl(&mount_lock); - put_mountpoint(smp); + unpin_mountpoint(&root); read_sequnlock_excl(&mount_lock); return err; @@ -2818,12 +2772,12 @@ static int attach_recursive_mnt(struct mount *source_mnt, * Return: Either the target mountpoint on the top mount or the top * mount's mountpoint. */ -static struct mountpoint *do_lock_mount(struct path *path, bool beneath) +static int do_lock_mount(struct path *path, struct pinned_mountpoint *pinned, bool beneath) { struct vfsmount *mnt = path->mnt; struct dentry *dentry; - struct mountpoint *mp = ERR_PTR(-ENOENT); struct path under = {}; + int err = -ENOENT; for (;;) { struct mount *m = real_mount(mnt); @@ -2861,8 +2815,8 @@ static struct mountpoint *do_lock_mount(struct path *path, bool beneath) path->dentry = dget(mnt->mnt_root); continue; // got overmounted } - mp = get_mountpoint(dentry); - if (IS_ERR(mp)) + err = get_mountpoint(dentry, pinned); + if (err) break; if (beneath) { /* @@ -2873,25 +2827,25 @@ static struct mountpoint *do_lock_mount(struct path *path, bool beneath) */ path_put(&under); } - return mp; + return 0; } namespace_unlock(); inode_unlock(dentry->d_inode); if (beneath) path_put(&under); - return mp; + return err; } -static inline struct mountpoint *lock_mount(struct path *path) +static inline int lock_mount(struct path *path, struct pinned_mountpoint *m) { - return do_lock_mount(path, false); + return do_lock_mount(path, m, false); } -static void unlock_mount(struct mountpoint *where) +static void unlock_mount(struct pinned_mountpoint *m) { - inode_unlock(where->m_dentry->d_inode); + inode_unlock(m->mp->m_dentry->d_inode); read_seqlock_excl(&mount_lock); - put_mountpoint(where); + unpin_mountpoint(m); read_sequnlock_excl(&mount_lock); namespace_unlock(); } @@ -2905,7 +2859,7 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) d_is_dir(mnt->mnt.mnt_root)) return -ENOTDIR; - return attach_recursive_mnt(mnt, p, mp, 0); + return attach_recursive_mnt(mnt, p, mp); } /* @@ -2954,10 +2908,8 @@ static int do_change_type(struct path *path, int ms_flags) goto out_unlock; } - lock_mount_hash(); for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) change_mnt_propagation(m, type); - unlock_mount_hash(); out_unlock: namespace_unlock(); @@ -3031,26 +2983,21 @@ static inline bool may_copy_tree(struct path *path) static struct mount *__do_loopback(struct path *old_path, int recurse) { - struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt); + struct mount *old = real_mount(old_path->mnt); if (IS_MNT_UNBINDABLE(old)) - return mnt; + return ERR_PTR(-EINVAL); if (!may_copy_tree(old_path)) - return mnt; + return ERR_PTR(-EINVAL); if (!recurse && __has_locked_children(old, old_path->dentry)) - return mnt; + return ERR_PTR(-EINVAL); if (recurse) - mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE); + return copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE); else - mnt = clone_mnt(old, old_path->dentry, 0); - - if (!IS_ERR(mnt)) - mnt->mnt.mnt_flags &= ~MNT_LOCKED; - - return mnt; + return clone_mnt(old, old_path->dentry, 0); } /* @@ -3061,7 +3008,7 @@ static int do_loopback(struct path *path, const char *old_name, { struct path old_path; struct mount *mnt = NULL, *parent; - struct mountpoint *mp; + struct pinned_mountpoint mp = {}; int err; if (!old_name || !*old_name) return -EINVAL; @@ -3073,11 +3020,9 @@ static int do_loopback(struct path *path, const char *old_name, if (mnt_ns_loop(old_path.dentry)) goto out; - mp = lock_mount(path); - if (IS_ERR(mp)) { - err = PTR_ERR(mp); + err = lock_mount(path, &mp); + if (err) goto out; - } parent = real_mount(path->mnt); if (!check_mnt(parent)) @@ -3089,14 +3034,14 @@ static int do_loopback(struct path *path, const char *old_name, goto out2; } - err = graft_tree(mnt, parent, mp); + err = graft_tree(mnt, parent, mp.mp); if (err) { lock_mount_hash(); umount_tree(mnt, UMOUNT_SYNC); unlock_mount_hash(); } out2: - unlock_mount(mp); + unlock_mount(&mp); out: path_put(&old_path); return err; @@ -3444,18 +3389,14 @@ static int do_set_group(struct path *from_path, struct path *to_path) goto out; if (IS_MNT_SLAVE(from)) { - struct mount *m = from->mnt_master; - - list_add(&to->mnt_slave, &from->mnt_slave); - to->mnt_master = m; + hlist_add_behind(&to->mnt_slave, &from->mnt_slave); + to->mnt_master = from->mnt_master; } if (IS_MNT_SHARED(from)) { to->mnt_group_id = from->mnt_group_id; list_add(&to->mnt_share, &from->mnt_share); - lock_mount_hash(); set_mnt_shared(to); - unlock_mount_hash(); } err = 0; @@ -3492,6 +3433,17 @@ static inline bool path_overmounted(const struct path *path) return unlikely(!no_child); } +/* + * Check if there is a possibly empty chain of descent from p1 to p2. + * Locks: namespace_sem (shared) or mount_lock (read_seqlock_excl). + */ +static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2) +{ + while (p2 != p1 && mnt_has_parent(p2)) + p2 = p2->mnt_parent; + return p2 == p1; +} + /** * can_move_mount_beneath - check that we can mount beneath the top mount * @from: mount to mount beneath @@ -3543,9 +3495,8 @@ static int can_move_mount_beneath(const struct path *from, if (parent_mnt_to == current->nsproxy->mnt_ns->root) return -EINVAL; - for (struct mount *p = mnt_from; mnt_has_parent(p); p = p->mnt_parent) - if (p == mnt_to) - return -EINVAL; + if (mount_is_ancestor(mnt_to, mnt_from)) + return -EINVAL; /* * If the parent mount propagates to the child mount this would @@ -3630,27 +3581,20 @@ static int do_move_mount(struct path *old_path, struct mount *p; struct mount *old; struct mount *parent; - struct mountpoint *mp, *old_mp; + struct pinned_mountpoint mp; int err; - bool attached, beneath = flags & MNT_TREE_BENEATH; + bool beneath = flags & MNT_TREE_BENEATH; - mp = do_lock_mount(new_path, beneath); - if (IS_ERR(mp)) - return PTR_ERR(mp); + err = do_lock_mount(new_path, &mp, beneath); + if (err) + return err; old = real_mount(old_path->mnt); p = real_mount(new_path->mnt); parent = old->mnt_parent; - attached = mnt_has_parent(old); - if (attached) - flags |= MNT_TREE_MOVE; - old_mp = old->mnt_mp; ns = old->mnt_ns; err = -EINVAL; - /* The thing moved must be mounted... */ - if (!is_mounted(&old->mnt)) - goto out; if (check_mnt(old)) { /* if the source is in our namespace... */ @@ -3660,13 +3604,14 @@ static int do_move_mount(struct path *old_path, /* ... and the target should be in our namespace */ if (!check_mnt(p)) goto out; + /* parent of the source should not be shared */ + if (IS_MNT_SHARED(parent)) + goto out; } else { /* * otherwise the source must be the root of some anon namespace. - * AV: check for mount being root of an anon namespace is worth - * an inlined predicate... */ - if (!is_anon_ns(ns) || mnt_has_parent(old)) + if (!anon_ns_root(old)) goto out; /* * Bail out early if the target is within the same namespace - @@ -3689,20 +3634,14 @@ static int do_move_mount(struct path *old_path, if (d_is_dir(new_path->dentry) != d_is_dir(old_path->dentry)) goto out; - /* - * Don't move a mount residing in a shared parent. - */ - if (attached && IS_MNT_SHARED(parent)) - goto out; if (beneath) { - err = can_move_mount_beneath(old_path, new_path, mp); + err = can_move_mount_beneath(old_path, new_path, mp.mp); if (err) goto out; err = -EINVAL; p = p->mnt_parent; - flags |= MNT_TREE_BENEATH; } /* @@ -3714,30 +3653,12 @@ static int do_move_mount(struct path *old_path, err = -ELOOP; if (!check_for_nsfs_mounts(old)) goto out; - for (; mnt_has_parent(p); p = p->mnt_parent) - if (p == old) - goto out; - - err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, flags); - if (err) + if (mount_is_ancestor(old, p)) goto out; - /* if the mount is moved, it should no longer be expire - * automatically */ - list_del_init(&old->mnt_expire); - if (attached) - put_mountpoint(old_mp); + err = attach_recursive_mnt(old, p, mp.mp); out: - unlock_mount(mp); - if (!err) { - if (attached) { - mntput_no_expire(parent); - } else { - /* Make sure we notice when we leak mounts. */ - VFS_WARN_ON_ONCE(!mnt_ns_empty(ns)); - free_mnt_ns(ns); - } - } + unlock_mount(&mp); return err; } @@ -3798,7 +3719,7 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, unsigned int mnt_flags) { struct vfsmount *mnt; - struct mountpoint *mp; + struct pinned_mountpoint mp = {}; struct super_block *sb = fc->root->d_sb; int error; @@ -3819,13 +3740,12 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, mnt_warn_timestamp_expiry(mountpoint, mnt); - mp = lock_mount(mountpoint); - if (IS_ERR(mp)) { - mntput(mnt); - return PTR_ERR(mp); + error = lock_mount(mountpoint, &mp); + if (!error) { + error = do_add_mount(real_mount(mnt), mp.mp, + mountpoint, mnt_flags); + unlock_mount(&mp); } - error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags); - unlock_mount(mp); if (error < 0) mntput(mnt); return error; @@ -3893,7 +3813,7 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, int finish_automount(struct vfsmount *m, const struct path *path) { struct dentry *dentry = path->dentry; - struct mountpoint *mp; + struct pinned_mountpoint mp = {}; struct mount *mnt; int err; @@ -3925,14 +3845,13 @@ int finish_automount(struct vfsmount *m, const struct path *path) err = 0; goto discard_locked; } - mp = get_mountpoint(dentry); - if (IS_ERR(mp)) { - err = PTR_ERR(mp); + err = get_mountpoint(dentry, &mp); + if (err) goto discard_locked; - } - err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE); - unlock_mount(mp); + err = do_add_mount(mnt, mp.mp, path, + path->mnt->mnt_flags | MNT_SHRINKABLE); + unlock_mount(&mp); if (unlikely(err)) goto discard; return 0; @@ -3941,12 +3860,6 @@ discard_locked: namespace_unlock(); inode_unlock(dentry->d_inode); discard: - /* remove m from any expiration list it may be on */ - if (!list_empty(&mnt->mnt_expire)) { - namespace_lock(); - list_del_init(&mnt->mnt_expire); - namespace_unlock(); - } mntput(m); return err; } @@ -3958,11 +3871,9 @@ discard: */ void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) { - namespace_lock(); - + read_seqlock_excl(&mount_lock); list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); - - namespace_unlock(); + read_sequnlock_excl(&mount_lock); } EXPORT_SYMBOL(mnt_set_expiry); @@ -4316,7 +4227,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, /* First pass: copy the tree topology */ copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; if (user_ns != ns->user_ns) - copy_flags |= CL_SHARED_TO_SLAVE; + copy_flags |= CL_SLAVE; new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { namespace_unlock(); @@ -4741,7 +4652,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, { struct path new, old, root; struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent; - struct mountpoint *old_mp, *root_mp; + struct pinned_mountpoint old_mp = {}; int error; if (!may_mount()) @@ -4762,9 +4673,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, goto out2; get_fs_root(current->fs, &root); - old_mp = lock_mount(&old); - error = PTR_ERR(old_mp); - if (IS_ERR(old_mp)) + error = lock_mount(&old, &old_mp); + if (error) goto out3; error = -EINVAL; @@ -4791,11 +4701,11 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, if (!path_mounted(&root)) goto out4; /* not a mountpoint */ if (!mnt_has_parent(root_mnt)) - goto out4; /* not attached */ + goto out4; /* absolute root */ if (!path_mounted(&new)) goto out4; /* not a mountpoint */ if (!mnt_has_parent(new_mnt)) - goto out4; /* not attached */ + goto out4; /* absolute root */ /* make sure we can reach put_old from new_root */ if (!is_path_reachable(old_mnt, old.dentry, &new)) goto out4; @@ -4804,29 +4714,25 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, goto out4; lock_mount_hash(); umount_mnt(new_mnt); - root_mp = unhash_mnt(root_mnt); /* we'll need its mountpoint */ if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { new_mnt->mnt.mnt_flags |= MNT_LOCKED; root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; } - /* mount old root on put_old */ - attach_mnt(root_mnt, old_mnt, old_mp, false); /* mount new_root on / */ - attach_mnt(new_mnt, root_parent, root_mp, false); - mnt_add_count(root_parent, -1); + attach_mnt(new_mnt, root_parent, root_mnt->mnt_mp); + umount_mnt(root_mnt); + /* mount old root on put_old */ + attach_mnt(root_mnt, old_mnt, old_mp.mp); touch_mnt_namespace(current->nsproxy->mnt_ns); /* A moved mount should not expire automatically */ list_del_init(&new_mnt->mnt_expire); - put_mountpoint(root_mp); unlock_mount_hash(); mnt_notify_add(root_mnt); mnt_notify_add(new_mnt); chroot_fs_refs(&root, &new); error = 0; out4: - unlock_mount(old_mp); - if (!error) - mntput_no_expire(ex_parent); + unlock_mount(&old_mp); out3: path_put(&root); out2: @@ -5028,22 +4934,7 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) err = -EINVAL; lock_mount_hash(); - /* Ensure that this isn't anything purely vfs internal. */ - if (!is_mounted(&mnt->mnt)) - goto out; - - /* - * If this is an attached mount make sure it's located in the callers - * mount namespace. If it's not don't let the caller interact with it. - * - * If this mount doesn't have a parent it's most often simply a - * detached mount with an anonymous mount namespace. IOW, something - * that's simply not attached yet. But there are apparently also users - * that do change mount properties on the rootfs itself. That obviously - * neither has a parent nor is it a detached mount so we cannot - * unconditionally check for detached mounts. - */ - if ((mnt_has_parent(mnt) || !is_anon_ns(mnt->mnt_ns)) && !check_mnt(mnt)) + if (!anon_ns_root(mnt) && !check_mnt(mnt)) goto out; /* @@ -5290,16 +5181,12 @@ SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename, kattr.kflags |= MOUNT_KATTR_RECURSE; ret = wants_mount_setattr(uattr, usize, &kattr); - if (ret < 0) - return ret; - - if (ret) { + if (ret > 0) { ret = do_mount_setattr(&file->f_path, &kattr); - if (ret) - return ret; - finish_mount_kattr(&kattr); } + if (ret) + return ret; } fd = get_unused_fd_flags(flags & O_CLOEXEC); @@ -5411,7 +5298,7 @@ static void statmount_mnt_basic(struct kstatmount *s) s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id; s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt); s->sm.mnt_propagation = mnt_to_propagation_flags(m); - s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0; + s->sm.mnt_peer_group = m->mnt_group_id; s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0; } @@ -6203,9 +6090,11 @@ static void __init init_mount_tree(void) if (IS_ERR(mnt)) panic("Can't create rootfs"); - ns = alloc_mnt_ns(&init_user_ns, false); + ns = alloc_mnt_ns(&init_user_ns, true); if (IS_ERR(ns)) panic("Can't allocate initial namespace"); + ns->seq = atomic64_inc_return(&mnt_ns_seq); + ns->ns.inum = PROC_MNT_INIT_INO; m = real_mount(mnt); ns->root = m; ns->nr_mounts = 1; @@ -6215,7 +6104,6 @@ static void __init init_mount_tree(void) root.mnt = mnt; root.dentry = mnt->mnt_root; - mnt->mnt_flags |= MNT_LOCKED; set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); @@ -6262,8 +6150,12 @@ void put_mnt_ns(struct mnt_namespace *ns) { if (!refcount_dec_and_test(&ns->ns.count)) return; - drop_collected_mounts(&ns->root->mnt); - free_mnt_ns(ns); + namespace_lock(); + emptied_ns = ns; + lock_mount_hash(); + umount_tree(ns->root, 0); + unlock_mount_hash(); + namespace_unlock(); } struct vfsmount *kern_mount(struct file_system_type *type) |