diff options
Diffstat (limited to 'fs/namei.c')
| -rw-r--r-- | fs/namei.c | 3153 |
1 files changed, 2202 insertions, 951 deletions
diff --git a/fs/namei.c b/fs/namei.c index b867a92c078e..bf0f66f0e9b9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -17,15 +17,16 @@ #include <linux/init.h> #include <linux/export.h> -#include <linux/kernel.h> #include <linux/slab.h> +#include <linux/wordpart.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/namei.h> #include <linux/pagemap.h> +#include <linux/sched/mm.h> #include <linux/fsnotify.h> #include <linux/personality.h> #include <linux/security.h> -#include <linux/ima.h> #include <linux/syscalls.h> #include <linux/mount.h> #include <linux/audit.h> @@ -124,8 +125,15 @@ #define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname)) +static inline void initname(struct filename *name, const char __user *uptr) +{ + name->uptr = uptr; + name->aname = NULL; + atomic_set(&name->refcnt, 1); +} + struct filename * -getname_flags(const char __user *filename, int flags, int *empty) +getname_flags(const char __user *filename, int flags) { struct filename *result; char *kname; @@ -147,9 +155,20 @@ getname_flags(const char __user *filename, int flags, int *empty) result->name = kname; len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX); - if (unlikely(len < 0)) { - __putname(result); - return ERR_PTR(len); + /* + * Handle both empty path and copy failure in one go. + */ + if (unlikely(len <= 0)) { + if (unlikely(len < 0)) { + __putname(result); + return ERR_PTR(len); + } + + /* The empty path is special. */ + if (!(flags & LOOKUP_EMPTY)) { + __putname(result); + return ERR_PTR(-ENOENT); + } } /* @@ -179,46 +198,50 @@ getname_flags(const char __user *filename, int flags, int *empty) kfree(result); return ERR_PTR(len); } + /* The empty path is special. */ + if (unlikely(!len) && !(flags & LOOKUP_EMPTY)) { + __putname(kname); + kfree(result); + return ERR_PTR(-ENOENT); + } if (unlikely(len == PATH_MAX)) { __putname(kname); kfree(result); return ERR_PTR(-ENAMETOOLONG); } } - - result->refcnt = 1; - /* The empty path is special. */ - if (unlikely(!len)) { - if (empty) - *empty = 1; - if (!(flags & LOOKUP_EMPTY)) { - putname(result); - return ERR_PTR(-ENOENT); - } - } - - result->uptr = filename; - result->aname = NULL; + initname(result, filename); audit_getname(result); return result; } -struct filename * -getname_uflags(const char __user *filename, int uflags) +struct filename *getname_uflags(const char __user *filename, int uflags) { int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; - return getname_flags(filename, flags, NULL); + return getname_flags(filename, flags); } -struct filename * -getname(const char __user * filename) +struct filename *__getname_maybe_null(const char __user *pathname) { - return getname_flags(filename, 0, NULL); + struct filename *name; + char c; + + /* try to save on allocations; loss on um, though */ + if (get_user(c, pathname)) + return ERR_PTR(-EFAULT); + if (!c) + return NULL; + + name = getname_flags(pathname, LOOKUP_EMPTY); + if (!IS_ERR(name) && !(name->name[0])) { + putname(name); + name = NULL; + } + return name; } -struct filename * -getname_kernel(const char * filename) +struct filename *getname_kernel(const char * filename) { struct filename *result; int len = strlen(filename) + 1; @@ -245,34 +268,39 @@ getname_kernel(const char * filename) return ERR_PTR(-ENAMETOOLONG); } memcpy((char *)result->name, filename, len); - result->uptr = NULL; - result->aname = NULL; - result->refcnt = 1; + initname(result, NULL); audit_getname(result); - return result; } +EXPORT_SYMBOL(getname_kernel); void putname(struct filename *name) { - if (IS_ERR(name)) + int refcnt; + + if (IS_ERR_OR_NULL(name)) return; - BUG_ON(name->refcnt <= 0); + refcnt = atomic_read(&name->refcnt); + if (unlikely(refcnt != 1)) { + if (WARN_ON_ONCE(!refcnt)) + return; - if (--name->refcnt > 0) - return; + if (!atomic_dec_and_test(&name->refcnt)) + return; + } - if (name->name != name->iname) { + if (unlikely(name->name != name->iname)) { __putname(name->name); kfree(name); } else __putname(name); } +EXPORT_SYMBOL(putname); /** * check_acl - perform ACL permission checking - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * @@ -280,13 +308,13 @@ void putname(struct filename *name) * retrieve POSIX acls it needs to know whether it is called from a blocking or * non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. */ -static int check_acl(struct user_namespace *mnt_userns, +static int check_acl(struct mnt_idmap *idmap, struct inode *inode, int mask) { #ifdef CONFIG_FS_POSIX_ACL @@ -296,17 +324,17 @@ static int check_acl(struct user_namespace *mnt_userns, acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS); if (!acl) return -EAGAIN; - /* no ->get_acl() calls in RCU mode... */ + /* no ->get_inode_acl() calls in RCU mode... */ if (is_uncached_acl(acl)) return -ECHILD; - return posix_acl_permission(mnt_userns, inode, acl, mask); + return posix_acl_permission(idmap, inode, acl, mask); } - acl = get_acl(inode, ACL_TYPE_ACCESS); + acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { - int error = posix_acl_permission(mnt_userns, inode, acl, mask); + int error = posix_acl_permission(idmap, inode, acl, mask); posix_acl_release(acl); return error; } @@ -315,9 +343,28 @@ static int check_acl(struct user_namespace *mnt_userns, return -EAGAIN; } +/* + * Very quick optimistic "we know we have no ACL's" check. + * + * Note that this is purely for ACL_TYPE_ACCESS, and purely + * for the "we have cached that there are no ACLs" case. + * + * If this returns true, we know there are no ACLs. But if + * it returns false, we might still not have ACLs (it could + * be the is_uncached_acl() case). + */ +static inline bool no_acl_inode(struct inode *inode) +{ +#ifdef CONFIG_FS_POSIX_ACL + return likely(!READ_ONCE(inode->i_acl)); +#else + return true; +#endif +} + /** * acl_permission_check - perform basic UNIX permission checking - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * @@ -325,21 +372,43 @@ static int check_acl(struct user_namespace *mnt_userns, * function may retrieve POSIX acls it needs to know whether it is called from a * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. */ -static int acl_permission_check(struct user_namespace *mnt_userns, +static int acl_permission_check(struct mnt_idmap *idmap, struct inode *inode, int mask) { unsigned int mode = inode->i_mode; - kuid_t i_uid; + vfsuid_t vfsuid; + + /* + * Common cheap case: everybody has the requested + * rights, and there are no ACLs to check. No need + * to do any owner/group checks in that case. + * + * - 'mask&7' is the requested permission bit set + * - multiplying by 0111 spreads them out to all of ugo + * - '& ~mode' looks for missing inode permission bits + * - the '!' is for "no missing permissions" + * + * After that, we just need to check that there are no + * ACL's on the inode - do the 'IS_POSIXACL()' check last + * because it will dereference the ->i_sb pointer and we + * want to avoid that if at all possible. + */ + if (!((mask & 7) * 0111 & ~mode)) { + if (no_acl_inode(inode)) + return 0; + if (!IS_POSIXACL(inode)) + return 0; + } /* Are we the owner? If so, ACL's don't matter */ - i_uid = i_uid_into_mnt(mnt_userns, inode); - if (likely(uid_eq(current_fsuid(), i_uid))) { + vfsuid = i_uid_into_vfsuid(idmap, inode); + if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) { mask &= 7; mode >>= 6; return (mask & ~mode) ? -EACCES : 0; @@ -347,7 +416,7 @@ static int acl_permission_check(struct user_namespace *mnt_userns, /* Do we have ACL's? */ if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { - int error = check_acl(mnt_userns, inode, mask); + int error = check_acl(idmap, inode, mask); if (error != -EAGAIN) return error; } @@ -361,8 +430,8 @@ static int acl_permission_check(struct user_namespace *mnt_userns, * about? Need to check group ownership if so. */ if (mask & (mode ^ (mode >> 3))) { - kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); - if (in_group_p(kgid)) + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); + if (vfsgid_in_group_p(vfsgid)) mode >>= 3; } @@ -372,7 +441,7 @@ static int acl_permission_check(struct user_namespace *mnt_userns, /** * generic_permission - check for access rights on a Posix-like filesystem - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: inode to check access rights for * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, * %MAY_NOT_BLOCK ...) @@ -386,13 +455,13 @@ static int acl_permission_check(struct user_namespace *mnt_userns, * request cannot be satisfied (eg. requires blocking or too much complexity). * It would then be called again in ref-walk mode. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. */ -int generic_permission(struct user_namespace *mnt_userns, struct inode *inode, +int generic_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int ret; @@ -400,17 +469,17 @@ int generic_permission(struct user_namespace *mnt_userns, struct inode *inode, /* * Do the basic permission checks. */ - ret = acl_permission_check(mnt_userns, inode, mask); + ret = acl_permission_check(idmap, inode, mask); if (ret != -EACCES) return ret; if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ if (!(mask & MAY_WRITE)) - if (capable_wrt_inode_uidgid(mnt_userns, inode, + if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH)) return 0; - if (capable_wrt_inode_uidgid(mnt_userns, inode, + if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE)) return 0; return -EACCES; @@ -421,7 +490,7 @@ int generic_permission(struct user_namespace *mnt_userns, struct inode *inode, */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ) - if (capable_wrt_inode_uidgid(mnt_userns, inode, + if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH)) return 0; /* @@ -430,7 +499,7 @@ int generic_permission(struct user_namespace *mnt_userns, struct inode *inode, * at least one exec bit set. */ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) - if (capable_wrt_inode_uidgid(mnt_userns, inode, + if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE)) return 0; @@ -440,7 +509,7 @@ EXPORT_SYMBOL(generic_permission); /** * do_inode_permission - UNIX permission checking - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * @@ -449,19 +518,19 @@ EXPORT_SYMBOL(generic_permission); * flag in inode->i_opflags, that says "this has not special * permission function, use the fast case". */ -static inline int do_inode_permission(struct user_namespace *mnt_userns, +static inline int do_inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { if (likely(inode->i_op->permission)) - return inode->i_op->permission(mnt_userns, inode, mask); + return inode->i_op->permission(idmap, inode, mask); /* This gets set once for the inode lifetime */ spin_lock(&inode->i_lock); inode->i_opflags |= IOP_FASTPERM; spin_unlock(&inode->i_lock); } - return generic_permission(mnt_userns, inode, mask); + return generic_permission(idmap, inode, mask); } /** @@ -471,10 +540,13 @@ static inline int do_inode_permission(struct user_namespace *mnt_userns, * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Separate out file-system wide checks from inode-specific permission checks. + * + * Note: lookup_inode_permission_may_exec() does not call here. If you add + * MAY_EXEC checks, adjust it. */ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) { - if (unlikely(mask & MAY_WRITE)) { + if (mask & MAY_WRITE) { umode_t mode = inode->i_mode; /* Nobody gets write access to a read-only fs. */ @@ -486,7 +558,7 @@ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) /** * inode_permission - Check for access rights to a given inode - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: Inode to check permission on * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * @@ -496,20 +568,20 @@ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) * * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. */ -int inode_permission(struct user_namespace *mnt_userns, +int inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int retval; retval = sb_permission(inode->i_sb, inode, mask); - if (retval) + if (unlikely(retval)) return retval; - if (unlikely(mask & MAY_WRITE)) { + if (mask & MAY_WRITE) { /* * Nobody gets write access to an immutable file. */ - if (IS_IMMUTABLE(inode)) + if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; /* @@ -517,22 +589,58 @@ int inode_permission(struct user_namespace *mnt_userns, * written back improperly if their true value is unknown * to the vfs. */ - if (HAS_UNMAPPED_ID(mnt_userns, inode)) + if (unlikely(HAS_UNMAPPED_ID(idmap, inode))) return -EACCES; } - retval = do_inode_permission(mnt_userns, inode, mask); - if (retval) + retval = do_inode_permission(idmap, inode, mask); + if (unlikely(retval)) return retval; retval = devcgroup_inode_permission(inode, mask); - if (retval) + if (unlikely(retval)) return retval; return security_inode_permission(inode, mask); } EXPORT_SYMBOL(inode_permission); +/* + * lookup_inode_permission_may_exec - Check traversal right for given inode + * + * This is a special case routine for may_lookup() making assumptions specific + * to path traversal. Use inode_permission() if you are doing something else. + * + * Work is shaved off compared to inode_permission() as follows: + * - we know for a fact there is no MAY_WRITE to worry about + * - it is an invariant the inode is a directory + * + * Since majority of real-world traversal happens on inodes which grant it for + * everyone, we check it upfront and only resort to more expensive work if it + * fails. + * + * Filesystems which have their own ->permission hook and consequently miss out + * on IOP_FASTPERM can still get the optimization if they set IOP_FASTPERM_MAY_EXEC + * on their directory inodes. + */ +static __always_inline int lookup_inode_permission_may_exec(struct mnt_idmap *idmap, + struct inode *inode, int mask) +{ + /* Lookup already checked this to return -ENOTDIR */ + VFS_BUG_ON_INODE(!S_ISDIR(inode->i_mode), inode); + VFS_BUG_ON((mask & ~MAY_NOT_BLOCK) != 0); + + mask |= MAY_EXEC; + + if (unlikely(!(inode->i_opflags & (IOP_FASTPERM | IOP_FASTPERM_MAY_EXEC)))) + return inode_permission(idmap, inode, mask); + + if (unlikely(((inode->i_mode & 0111) != 0111) || !no_acl_inode(inode))) + return inode_permission(idmap, inode, mask); + + return security_inode_permission(inode, mask); +} + /** * path_get - get a reference to a path * @path: path to get the reference to @@ -566,7 +674,7 @@ struct nameidata { struct path root; struct inode *inode; /* path.dentry.d_inode */ unsigned int flags, state; - unsigned seq, m_seq, r_seq; + unsigned seq, next_seq, m_seq, r_seq; int last_type; unsigned depth; int total_link_count; @@ -577,10 +685,11 @@ struct nameidata { unsigned seq; } *stack, internal[EMBEDDED_LEVELS]; struct filename *name; + const char *pathname; struct nameidata *saved; unsigned root_seq; int dfd; - kuid_t dir_uid; + vfsuid_t dir_vfsuid; umode_t dir_mode; } __randomize_layout; @@ -595,6 +704,7 @@ static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name) p->depth = 0; p->dfd = dfd; p->name = name; + p->pathname = likely(name) ? name->name : ""; p->path.mnt = NULL; p->path.dentry = NULL; p->total_link_count = old ? old->total_link_count : 0; @@ -639,6 +749,8 @@ static bool nd_alloc_stack(struct nameidata *nd) /** * path_connected - Verify that a dentry is below mnt.mnt_root + * @mnt: The mountpoint to check. + * @dentry: The dentry to check. * * Rename can sometimes move a file or directory outside of a bind * mount, path_connected allows those cases to be detected. @@ -664,9 +776,17 @@ static void drop_links(struct nameidata *nd) } } +static void leave_rcu(struct nameidata *nd) +{ + nd->flags &= ~LOOKUP_RCU; + nd->seq = nd->next_seq = 0; + rcu_read_unlock(); +} + static void terminate_walk(struct nameidata *nd) { - drop_links(nd); + if (unlikely(nd->depth)) + drop_links(nd); if (!(nd->flags & LOOKUP_RCU)) { int i; path_put(&nd->path); @@ -677,8 +797,7 @@ static void terminate_walk(struct nameidata *nd) nd->state &= ~ND_ROOT_GRABBED; } } else { - nd->flags &= ~LOOKUP_RCU; - rcu_read_unlock(); + leave_rcu(nd); } nd->depth = 0; nd->path.mnt = NULL; @@ -729,13 +848,6 @@ static bool legitimize_links(struct nameidata *nd) static bool legitimize_root(struct nameidata *nd) { - /* - * For scoped-lookups (where nd->root has been zeroed), we need to - * restart the whole lookup from scratch -- because set_root() is wrong - * for these lookups (nd->dfd is the root, not the filesystem root). - */ - if (!nd->root.mnt && (nd->flags & LOOKUP_IS_SCOPED)) - return false; /* Nothing to do if nd->root is zero or is managed by the VFS user. */ if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET)) return true; @@ -771,14 +883,13 @@ static bool try_to_unlazy(struct nameidata *nd) BUG_ON(!(nd->flags & LOOKUP_RCU)); - nd->flags &= ~LOOKUP_RCU; - if (unlikely(!legitimize_links(nd))) + if (unlikely(nd->depth && !legitimize_links(nd))) goto out1; if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) goto out; if (unlikely(!legitimize_root(nd))) goto out; - rcu_read_unlock(); + leave_rcu(nd); BUG_ON(nd->inode != parent->d_inode); return true; @@ -786,7 +897,7 @@ out1: nd->path.mnt = NULL; nd->path.dentry = NULL; out: - rcu_read_unlock(); + leave_rcu(nd); return false; } @@ -794,24 +905,27 @@ out: * try_to_unlazy_next - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * @dentry: next dentry to step into - * @seq: seq number to check @dentry against * Returns: true on success, false on failure * - * Similar to to try_to_unlazy(), but here we have the next dentry already + * Similar to try_to_unlazy(), but here we have the next dentry already * picked by rcu-walk and want to legitimize that in addition to the current * nd->path and nd->root for ref-walk mode. Must be called from rcu-walk context. * Nothing should touch nameidata between try_to_unlazy_next() failure and * terminate_walk(). */ -static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsigned seq) +static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry) { + int res; BUG_ON(!(nd->flags & LOOKUP_RCU)); - nd->flags &= ~LOOKUP_RCU; - if (unlikely(!legitimize_links(nd))) - goto out2; - if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq))) + if (unlikely(nd->depth && !legitimize_links(nd))) goto out2; + res = __legitimize_mnt(nd->path.mnt, nd->m_seq); + if (unlikely(res)) { + if (res > 0) + goto out2; + goto out1; + } if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref))) goto out1; @@ -824,7 +938,7 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsi */ if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) goto out; - if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) + if (read_seqcount_retry(&dentry->d_seq, nd->next_seq)) goto out_dput; /* * Sequence counts matched. Now make sure that the root is @@ -832,7 +946,7 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsi */ if (unlikely(!legitimize_root(nd))) goto out_dput; - rcu_read_unlock(); + leave_rcu(nd); return true; out2: @@ -840,18 +954,19 @@ out2: out1: nd->path.dentry = NULL; out: - rcu_read_unlock(); + leave_rcu(nd); return false; out_dput: - rcu_read_unlock(); + leave_rcu(nd); dput(dentry); return false; } -static inline int d_revalidate(struct dentry *dentry, unsigned int flags) +static inline int d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) - return dentry->d_op->d_revalidate(dentry, flags); + return dentry->d_op->d_revalidate(dir, name, dentry, flags); else return 1; } @@ -876,8 +991,8 @@ static int complete_walk(struct nameidata *nd) * We don't want to zero nd->root for scoped-lookups or * externally-managed nd->root. */ - if (!(nd->state & ND_ROOT_PRESET)) - if (!(nd->flags & LOOKUP_IS_SCOPED)) + if (likely(!(nd->state & ND_ROOT_PRESET))) + if (likely(!(nd->flags & LOOKUP_IS_SCOPED))) nd->root.mnt = NULL; nd->flags &= ~LOOKUP_CACHED; if (!try_to_unlazy(nd)) @@ -937,10 +1052,10 @@ static int set_root(struct nameidata *nd) unsigned seq; do { - seq = read_seqcount_begin(&fs->seq); + seq = read_seqbegin(&fs->seq); nd->root = fs->root; nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); + } while (read_seqretry(&fs->seq, seq)); } else { get_fs_root(fs, &nd->root); nd->state |= ND_ROOT_GRABBED; @@ -959,7 +1074,7 @@ static int nd_jump_root(struct nameidata *nd) } if (!nd->root.mnt) { int error = set_root(nd); - if (error) + if (unlikely(error)) return error; } if (nd->flags & LOOKUP_RCU) { @@ -968,7 +1083,7 @@ static int nd_jump_root(struct nameidata *nd) d = nd->path.dentry; nd->inode = d->d_inode; nd->seq = nd->root_seq; - if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq))) + if (read_seqcount_retry(&d->d_seq, nd->seq)) return -ECHILD; } else { path_put(&nd->path); @@ -984,7 +1099,7 @@ static int nd_jump_root(struct nameidata *nd) * Helper to directly jump to a known parsed path from ->get_link, * caller must have taken a reference to path beforehand. */ -int nd_jump_link(struct path *path) +int nd_jump_link(const struct path *path) { int error = -ELOOP; struct nameidata *nd = current->nameidata; @@ -1026,12 +1141,12 @@ static int sysctl_protected_fifos __read_mostly; static int sysctl_protected_regular __read_mostly; #ifdef CONFIG_SYSCTL -static struct ctl_table namei_sysctls[] = { +static const struct ctl_table namei_sysctls[] = { { .procname = "protected_symlinks", .data = &sysctl_protected_symlinks, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, @@ -1040,7 +1155,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_hardlinks", .data = &sysctl_protected_hardlinks, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, @@ -1049,7 +1164,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_fifos", .data = &sysctl_protected_fifos, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, @@ -1058,12 +1173,11 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_regular", .data = &sysctl_protected_regular, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, - { } }; static int __init init_fs_namei_sysctls(void) @@ -1078,6 +1192,7 @@ fs_initcall(init_fs_namei_sysctls); /** * may_follow_link - Check symlink following for unsafe situations * @nd: nameidata pathwalk data + * @inode: Used for idmapping. * * In the case of the sysctl_protected_symlinks sysctl being enabled, * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is @@ -1092,16 +1207,16 @@ fs_initcall(init_fs_namei_sysctls); */ static inline int may_follow_link(struct nameidata *nd, const struct inode *inode) { - struct user_namespace *mnt_userns; - kuid_t i_uid; + struct mnt_idmap *idmap; + vfsuid_t vfsuid; if (!sysctl_protected_symlinks) return 0; - mnt_userns = mnt_user_ns(nd->path.mnt); - i_uid = i_uid_into_mnt(mnt_userns, inode); + idmap = mnt_idmap(nd->path.mnt); + vfsuid = i_uid_into_vfsuid(idmap, inode); /* Allowed if owner and follower match. */ - if (uid_eq(current_cred()->fsuid, i_uid)) + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return 0; /* Allowed if parent directory not sticky and world-writable. */ @@ -1109,7 +1224,7 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod return 0; /* Allowed if parent directory and link owner match. */ - if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, i_uid)) + if (vfsuid_valid(nd->dir_vfsuid) && vfsuid_eq(nd->dir_vfsuid, vfsuid)) return 0; if (nd->flags & LOOKUP_RCU) @@ -1122,7 +1237,7 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod /** * safe_hardlink_source - Check for safe hardlink conditions - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: the source inode to hardlink from * * Return false if at least one of the following conditions: @@ -1133,7 +1248,7 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod * * Otherwise returns true. */ -static bool safe_hardlink_source(struct user_namespace *mnt_userns, +static bool safe_hardlink_source(struct mnt_idmap *idmap, struct inode *inode) { umode_t mode = inode->i_mode; @@ -1151,7 +1266,7 @@ static bool safe_hardlink_source(struct user_namespace *mnt_userns, return false; /* Hardlinking to unreadable or unwritable sources is dangerous. */ - if (inode_permission(mnt_userns, inode, MAY_READ | MAY_WRITE)) + if (inode_permission(idmap, inode, MAY_READ | MAY_WRITE)) return false; return true; @@ -1159,8 +1274,8 @@ static bool safe_hardlink_source(struct user_namespace *mnt_userns, /** * may_linkat - Check permissions for creating a hardlink - * @mnt_userns: user namespace of the mount the inode was found from - * @link: the source to hardlink from + * @idmap: idmap of the mount the inode was found from + * @link: the source to hardlink from * * Block hardlink when all of: * - sysctl_protected_hardlinks enabled @@ -1168,21 +1283,21 @@ static bool safe_hardlink_source(struct user_namespace *mnt_userns, * - hardlink source is unsafe (see safe_hardlink_source() above) * - not CAP_FOWNER in a namespace with the inode owner uid mapped * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. * * Returns 0 if successful, -ve on error. */ -int may_linkat(struct user_namespace *mnt_userns, struct path *link) +int may_linkat(struct mnt_idmap *idmap, const struct path *link) { struct inode *inode = link->dentry->d_inode; /* Inode writeback is not safe when the uid or gid are invalid. */ - if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) || - !gid_valid(i_gid_into_mnt(mnt_userns, inode))) + if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || + !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; if (!sysctl_protected_hardlinks) @@ -1191,8 +1306,8 @@ int may_linkat(struct user_namespace *mnt_userns, struct path *link) /* Source inode owner (or CAP_FOWNER) can hardlink all they like, * otherwise, it must be a safe source. */ - if (safe_hardlink_source(mnt_userns, inode) || - inode_owner_or_capable(mnt_userns, inode)) + if (safe_hardlink_source(idmap, inode) || + inode_owner_or_capable(idmap, inode)) return 0; audit_log_path_denied(AUDIT_ANOM_LINK, "linkat"); @@ -1203,7 +1318,7 @@ int may_linkat(struct user_namespace *mnt_userns, struct path *link) * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory * should be allowed, or not, on files that already * exist. - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @nd: nameidata pathwalk data * @inode: the inode of the file to open * @@ -1218,37 +1333,56 @@ int may_linkat(struct user_namespace *mnt_userns, struct path *link) * the directory doesn't have to be world writable: being group writable will * be enough. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. * * Returns 0 if the open is allowed, -ve on error. */ -static int may_create_in_sticky(struct user_namespace *mnt_userns, - struct nameidata *nd, struct inode *const inode) +static int may_create_in_sticky(struct mnt_idmap *idmap, struct nameidata *nd, + struct inode *const inode) { umode_t dir_mode = nd->dir_mode; - kuid_t dir_uid = nd->dir_uid; + vfsuid_t dir_vfsuid = nd->dir_vfsuid, i_vfsuid; - if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) || - (!sysctl_protected_regular && S_ISREG(inode->i_mode)) || - likely(!(dir_mode & S_ISVTX)) || - uid_eq(i_uid_into_mnt(mnt_userns, inode), dir_uid) || - uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) + if (likely(!(dir_mode & S_ISVTX))) return 0; - if (likely(dir_mode & 0002) || - (dir_mode & 0020 && - ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) || - (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) { - const char *operation = S_ISFIFO(inode->i_mode) ? - "sticky_create_fifo" : - "sticky_create_regular"; - audit_log_path_denied(AUDIT_ANOM_CREAT, operation); + if (S_ISREG(inode->i_mode) && !sysctl_protected_regular) + return 0; + + if (S_ISFIFO(inode->i_mode) && !sysctl_protected_fifos) + return 0; + + i_vfsuid = i_uid_into_vfsuid(idmap, inode); + + if (vfsuid_eq(i_vfsuid, dir_vfsuid)) + return 0; + + if (vfsuid_eq_kuid(i_vfsuid, current_fsuid())) + return 0; + + if (likely(dir_mode & 0002)) { + audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create"); return -EACCES; } + + if (dir_mode & 0020) { + if (sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) { + audit_log_path_denied(AUDIT_ANOM_CREAT, + "sticky_create_fifo"); + return -EACCES; + } + + if (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode)) { + audit_log_path_denied(AUDIT_ANOM_CREAT, + "sticky_create_regular"); + return -EACCES; + } + } + return 0; } @@ -1355,6 +1489,10 @@ static int follow_automount(struct path *path, int *count, unsigned lookup_flags dentry->d_inode) return -EISDIR; + /* No need to trigger automounts if mountpoint crossing is disabled. */ + if (lookup_flags & LOOKUP_NO_XDEV) + return -EXDEV; + if (count && (*count)++ >= MAXSYMLINKS) return -ELOOP; @@ -1375,9 +1513,13 @@ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, int ret = 0; while (flags & DCACHE_MANAGED_DENTRY) { - /* Allow the filesystem to manage the transit without i_mutex + /* Allow the filesystem to manage the transit without i_rwsem * being held. */ if (flags & DCACHE_MANAGE_TRANSIT) { + if (lookup_flags & LOOKUP_NO_XDEV) { + ret = -EXDEV; + break; + } ret = path->dentry->d_op->d_manage(path, false); flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) @@ -1395,6 +1537,10 @@ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, // here we know it's positive flags = path->dentry->d_flags; need_mntput = true; + if (unlikely(lookup_flags & LOOKUP_NO_XDEV)) { + ret = -EXDEV; + break; + } continue; } } @@ -1456,11 +1602,11 @@ EXPORT_SYMBOL(follow_down_one); * point, the filesystem owning that dentry may be queried as to whether the * caller is permitted to proceed or not. */ -int follow_down(struct path *path) +int follow_down(struct path *path, unsigned int flags) { struct vfsmount *mnt = path->mnt; bool jumped; - int ret = traverse_mounts(path, &jumped, NULL, 0); + int ret = traverse_mounts(path, &jumped, NULL, flags); if (path->mnt != mnt) mntput(mnt); @@ -1472,8 +1618,7 @@ EXPORT_SYMBOL(follow_down); * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if * we meet a managed dentry that would need blocking. */ -static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, - struct inode **inode, unsigned *seqp) +static bool __follow_mount_rcu(struct nameidata *nd, struct path *path) { struct dentry *dentry = path->dentry; unsigned int flags = dentry->d_flags; @@ -1502,15 +1647,12 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, path->mnt = &mounted->mnt; dentry = path->dentry = mounted->mnt.mnt_root; nd->state |= ND_JUMPED; - *seqp = read_seqcount_begin(&dentry->d_seq); - *inode = dentry->d_inode; - /* - * We don't need to re-check ->d_seq after this - * ->d_inode read - there will be an RCU delay - * between mount hash removal and ->mnt_root - * becoming unpinned. - */ + nd->next_seq = read_seqcount_begin(&dentry->d_seq); flags = dentry->d_flags; + // makes sure that non-RCU pathwalk could reach + // this state. + if (read_seqretry(&mount_lock, nd->m_seq)) + return false; continue; } if (read_seqretry(&mount_lock, nd->m_seq)) @@ -1521,8 +1663,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, } static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, - struct path *path, struct inode **inode, - unsigned int *seqp) + struct path *path) { bool jumped; int ret; @@ -1530,31 +1671,25 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, path->mnt = nd->path.mnt; path->dentry = dentry; if (nd->flags & LOOKUP_RCU) { - unsigned int seq = *seqp; - if (unlikely(!*inode)) - return -ENOENT; - if (likely(__follow_mount_rcu(nd, path, inode, seqp))) + unsigned int seq = nd->next_seq; + if (likely(!d_managed(dentry))) return 0; - if (!try_to_unlazy_next(nd, dentry, seq)) - return -ECHILD; - // *path might've been clobbered by __follow_mount_rcu() + if (likely(__follow_mount_rcu(nd, path))) + return 0; + // *path and nd->next_seq might've been clobbered path->mnt = nd->path.mnt; path->dentry = dentry; + nd->next_seq = seq; + if (unlikely(!try_to_unlazy_next(nd, dentry))) + return -ECHILD; } ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); - if (jumped) { - if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - ret = -EXDEV; - else - nd->state |= ND_JUMPED; - } + if (jumped) + nd->state |= ND_JUMPED; if (unlikely(ret)) { dput(path->dentry); if (path->mnt != nd->path.mnt) mntput(path->mnt); - } else { - *inode = d_backing_inode(path->dentry); - *seqp = 0; /* out of RCU mode, so the value doesn't matter */ } return ret; } @@ -1569,7 +1704,7 @@ static struct dentry *lookup_dcache(const struct qstr *name, { struct dentry *dentry = d_lookup(dir, name); if (dentry) { - int error = d_revalidate(dentry, flags); + int error = d_revalidate(dir->d_inode, name, dentry, flags); if (unlikely(error <= 0)) { if (!error) d_invalidate(dentry); @@ -1586,18 +1721,22 @@ static struct dentry *lookup_dcache(const struct qstr *name, * dentries - as the matter of fact, this only gets called * when directory is guaranteed to have no in-lookup children * at all. + * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed. + * Will return -EEXIST if name is found and LOOKUP_EXCL was passed. */ -static struct dentry *__lookup_hash(const struct qstr *name, - struct dentry *base, unsigned int flags) +struct dentry *lookup_one_qstr_excl(const struct qstr *name, + struct dentry *base, unsigned int flags) { - struct dentry *dentry = lookup_dcache(name, base, flags); + struct dentry *dentry; struct dentry *old; - struct inode *dir = base->d_inode; + struct inode *dir; + dentry = lookup_dcache(name, base, flags); if (dentry) - return dentry; + goto found; /* Don't create child dentry for a dead directory. */ + dir = base->d_inode; if (unlikely(IS_DEADDIR(dir))) return ERR_PTR(-ENOENT); @@ -1610,12 +1749,36 @@ static struct dentry *__lookup_hash(const struct qstr *name, dput(dentry); dentry = old; } +found: + if (IS_ERR(dentry)) + return dentry; + if (d_is_negative(dentry) && !(flags & LOOKUP_CREATE)) { + dput(dentry); + return ERR_PTR(-ENOENT); + } + if (d_is_positive(dentry) && (flags & LOOKUP_EXCL)) { + dput(dentry); + return ERR_PTR(-EEXIST); + } return dentry; } +EXPORT_SYMBOL(lookup_one_qstr_excl); -static struct dentry *lookup_fast(struct nameidata *nd, - struct inode **inode, - unsigned *seqp) +/** + * lookup_fast - do fast lockless (but racy) lookup of a dentry + * @nd: current nameidata + * + * Do a fast, but racy lookup in the dcache for the given dentry, and + * revalidate it. Returns a valid dentry pointer or NULL if one wasn't + * found. On error, an ERR_PTR will be returned. + * + * If this function returns a valid dentry and the walk is no longer + * lazy, the dentry will carry a reference that must later be put. If + * RCU mode is still in force, then this is not the case and the dentry + * must be legitimized before use. If this returns NULL, then the walk + * will no longer be in RCU mode. + */ +static struct dentry *lookup_fast(struct nameidata *nd) { struct dentry *dentry, *parent = nd->path.dentry; int status = 1; @@ -1626,8 +1789,7 @@ static struct dentry *lookup_fast(struct nameidata *nd, * going to fall back to non-racy lookup. */ if (nd->flags & LOOKUP_RCU) { - unsigned seq; - dentry = __d_lookup_rcu(parent, &nd->last, &seq); + dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq); if (unlikely(!dentry)) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); @@ -1635,37 +1797,26 @@ static struct dentry *lookup_fast(struct nameidata *nd, } /* - * This sequence count validates that the inode matches - * the dentry name information from lookup. - */ - *inode = d_backing_inode(dentry); - if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) - return ERR_PTR(-ECHILD); - - /* * This sequence count validates that the parent had no * changes while we did the lookup of the dentry above. - * - * The memory barrier in read_seqcount_begin of child is - * enough, we can use __read_seqcount_retry here. */ - if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq))) + if (read_seqcount_retry(&parent->d_seq, nd->seq)) return ERR_PTR(-ECHILD); - *seqp = seq; - status = d_revalidate(dentry, nd->flags); + status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags); if (likely(status > 0)) return dentry; - if (!try_to_unlazy_next(nd, dentry, seq)) + if (!try_to_unlazy_next(nd, dentry)) return ERR_PTR(-ECHILD); if (status == -ECHILD) /* we'd been told to redo it in non-rcu mode */ - status = d_revalidate(dentry, nd->flags); + status = d_revalidate(nd->inode, &nd->last, + dentry, nd->flags); } else { dentry = __d_lookup(parent, &nd->last); if (unlikely(!dentry)) return NULL; - status = d_revalidate(dentry, nd->flags); + status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags); } if (unlikely(status <= 0)) { if (!status) @@ -1693,7 +1844,7 @@ again: if (IS_ERR(dentry)) return dentry; if (unlikely(!d_in_lookup(dentry))) { - int error = d_revalidate(dentry, flags); + int error = d_revalidate(inode, name, dentry, flags); if (unlikely(error <= 0)) { if (!error) { d_invalidate(dentry); @@ -1714,7 +1865,7 @@ again: return dentry; } -static struct dentry *lookup_slow(const struct qstr *name, +static noinline struct dentry *lookup_slow(const struct qstr *name, struct dentry *dir, unsigned int flags) { @@ -1726,18 +1877,45 @@ static struct dentry *lookup_slow(const struct qstr *name, return res; } -static inline int may_lookup(struct user_namespace *mnt_userns, - struct nameidata *nd) +static struct dentry *lookup_slow_killable(const struct qstr *name, + struct dentry *dir, + unsigned int flags) { - if (nd->flags & LOOKUP_RCU) { - int err = inode_permission(mnt_userns, nd->inode, MAY_EXEC|MAY_NOT_BLOCK); - if (err != -ECHILD || !try_to_unlazy(nd)) - return err; - } - return inode_permission(mnt_userns, nd->inode, MAY_EXEC); + struct inode *inode = dir->d_inode; + struct dentry *res; + + if (inode_lock_shared_killable(inode)) + return ERR_PTR(-EINTR); + res = __lookup_slow(name, dir, flags); + inode_unlock_shared(inode); + return res; } -static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) +static inline int may_lookup(struct mnt_idmap *idmap, + struct nameidata *restrict nd) +{ + int err, mask; + + mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0; + err = lookup_inode_permission_may_exec(idmap, nd->inode, mask); + if (likely(!err)) + return 0; + + // If we failed, and we weren't in LOOKUP_RCU, it's final + if (!(nd->flags & LOOKUP_RCU)) + return err; + + // Drop out of RCU mode to make sure it wasn't transient + if (!try_to_unlazy(nd)) + return -ECHILD; // redo it all non-lazy + + if (err != -ECHILD) // hard error + return err; + + return lookup_inode_permission_may_exec(idmap, nd->inode, 0); +} + +static int reserve_stack(struct nameidata *nd, struct path *link) { if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) return -ELOOP; @@ -1752,9 +1930,9 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) if (nd->flags & LOOKUP_RCU) { // we need to grab link before we do unlazy. And we can't skip // unlazy even if we fail to grab the link - cleanup needs it - bool grabbed_link = legitimize_path(nd, link, seq); + bool grabbed_link = legitimize_path(nd, link, nd->next_seq); - if (!try_to_unlazy(nd) != 0 || !grabbed_link) + if (!try_to_unlazy(nd) || !grabbed_link) return -ECHILD; if (nd_alloc_stack(nd)) @@ -1765,13 +1943,23 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; -static const char *pick_link(struct nameidata *nd, struct path *link, - struct inode *inode, unsigned seq, int flags) +static noinline const char *pick_link(struct nameidata *nd, struct path *link, + struct inode *inode, int flags) { struct saved *last; const char *res; - int error = reserve_stack(nd, link, seq); + int error; + if (nd->flags & LOOKUP_RCU) { + /* make sure that d_is_symlink from step_into_slowpath() matches the inode */ + if (read_seqcount_retry(&link->dentry->d_seq, nd->next_seq)) + return ERR_PTR(-ECHILD); + } else { + if (link->mnt == nd->path.mnt) + mntget(link->mnt); + } + + error = reserve_stack(nd, link); if (unlikely(error)) { if (!(nd->flags & LOOKUP_RCU)) path_put(link); @@ -1780,7 +1968,7 @@ static const char *pick_link(struct nameidata *nd, struct path *link, last = nd->stack + nd->depth++; last->link = *link; clear_delayed_call(&last->done); - last->seq = seq; + last->seq = nd->next_seq; if (flags & WALK_TRAILING) { error = may_follow_link(nd, inode); @@ -1792,13 +1980,13 @@ static const char *pick_link(struct nameidata *nd, struct path *link, unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW)) return ERR_PTR(-ELOOP); - if (!(nd->flags & LOOKUP_RCU)) { + if (unlikely(atime_needs_update(&last->link, inode))) { + if (nd->flags & LOOKUP_RCU) { + if (!try_to_unlazy(nd)) + return ERR_PTR(-ECHILD); + } touch_atime(&last->link); cond_resched(); - } else if (atime_needs_update(&last->link, inode)) { - if (!try_to_unlazy(nd)) - return ERR_PTR(-ECHILD); - touch_atime(&last->link); } error = security_inode_follow_link(link->dentry, inode, @@ -1842,43 +2030,68 @@ all_done: // pure jump * to do this check without having to look at inode->i_op, * so we keep a cache of "no, this doesn't need follow_link" * for the common case. + * + * NOTE: dentry must be what nd->next_seq had been sampled from. */ -static const char *step_into(struct nameidata *nd, int flags, - struct dentry *dentry, struct inode *inode, unsigned seq) +static noinline const char *step_into_slowpath(struct nameidata *nd, int flags, + struct dentry *dentry) { struct path path; - int err = handle_mounts(nd, dentry, &path, &inode, &seq); + struct inode *inode; + int err; - if (err < 0) + err = handle_mounts(nd, dentry, &path); + if (unlikely(err < 0)) return ERR_PTR(err); + inode = path.dentry->d_inode; if (likely(!d_is_symlink(path.dentry)) || ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) || (flags & WALK_NOFOLLOW)) { /* not a symlink or should not follow */ - if (!(nd->flags & LOOKUP_RCU)) { + if (nd->flags & LOOKUP_RCU) { + if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) + return ERR_PTR(-ECHILD); + if (unlikely(!inode)) + return ERR_PTR(-ENOENT); + } else { dput(nd->path.dentry); if (nd->path.mnt != path.mnt) mntput(nd->path.mnt); } nd->path = path; nd->inode = inode; - nd->seq = seq; + nd->seq = nd->next_seq; return NULL; } - if (nd->flags & LOOKUP_RCU) { - /* make sure that d_is_symlink above matches inode */ - if (read_seqcount_retry(&path.dentry->d_seq, seq)) + return pick_link(nd, &path, inode, flags); +} + +static __always_inline const char *step_into(struct nameidata *nd, int flags, + struct dentry *dentry) +{ + /* + * In the common case we are in rcu-walk and traversing over a non-mounted on + * directory (as opposed to e.g., a symlink). + * + * We can handle that and negative entries with the checks below. + */ + if (likely((nd->flags & LOOKUP_RCU) && + !d_managed(dentry) && !d_is_symlink(dentry))) { + struct inode *inode = dentry->d_inode; + if (read_seqcount_retry(&dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); - } else { - if (path.mnt == nd->path.mnt) - mntget(path.mnt); + if (unlikely(!inode)) + return ERR_PTR(-ENOENT); + nd->path.dentry = dentry; + /* nd->path.mnt is retained on purpose */ + nd->inode = inode; + nd->seq = nd->next_seq; + return NULL; } - return pick_link(nd, &path, inode, seq, flags); + return step_into_slowpath(nd, flags, dentry); } -static struct dentry *follow_dotdot_rcu(struct nameidata *nd, - struct inode **inodep, - unsigned *seqp) +static struct dentry *follow_dotdot_rcu(struct nameidata *nd) { struct dentry *parent, *old; @@ -1895,30 +2108,30 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd, nd->path = path; nd->inode = path.dentry->d_inode; nd->seq = seq; - if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) + // makes sure that non-RCU pathwalk could reach this state + if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); /* we know that mountpoint was pinned */ } old = nd->path.dentry; parent = old->d_parent; - *inodep = parent->d_inode; - *seqp = read_seqcount_begin(&parent->d_seq); - if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq))) + nd->next_seq = read_seqcount_begin(&parent->d_seq); + // makes sure that non-RCU pathwalk could reach this state + if (read_seqcount_retry(&old->d_seq, nd->seq)) return ERR_PTR(-ECHILD); if (unlikely(!path_connected(nd->path.mnt, parent))) return ERR_PTR(-ECHILD); return parent; in_root: - if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) + if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-ECHILD); - return NULL; + nd->next_seq = nd->seq; + return nd->path.dentry; } -static struct dentry *follow_dotdot(struct nameidata *nd, - struct inode **inodep, - unsigned *seqp) +static struct dentry *follow_dotdot(struct nameidata *nd) { struct dentry *parent; @@ -1942,15 +2155,12 @@ static struct dentry *follow_dotdot(struct nameidata *nd, dput(parent); return ERR_PTR(-ENOENT); } - *seqp = 0; - *inodep = parent->d_inode; return parent; in_root: if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-EXDEV); - dget(nd->path.dentry); - return NULL; + return dget(nd->path.dentry); } static const char *handle_dots(struct nameidata *nd, int type) @@ -1958,26 +2168,19 @@ static const char *handle_dots(struct nameidata *nd, int type) if (type == LAST_DOTDOT) { const char *error = NULL; struct dentry *parent; - struct inode *inode; - unsigned seq; if (!nd->root.mnt) { error = ERR_PTR(set_root(nd)); - if (error) + if (unlikely(error)) return error; } if (nd->flags & LOOKUP_RCU) - parent = follow_dotdot_rcu(nd, &inode, &seq); + parent = follow_dotdot_rcu(nd); else - parent = follow_dotdot(nd, &inode, &seq); + parent = follow_dotdot(nd); if (IS_ERR(parent)) return ERR_CAST(parent); - if (unlikely(!parent)) - error = step_into(nd, WALK_NOFOLLOW, - nd->path.dentry, nd->inode, nd->seq); - else - error = step_into(nd, WALK_NOFOLLOW, - parent, inode, seq); + error = step_into(nd, WALK_NOFOLLOW, parent); if (unlikely(error)) return error; @@ -1989,31 +2192,29 @@ static const char *handle_dots(struct nameidata *nd, int type) * some fallback). */ smp_rmb(); - if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))) + if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)) return ERR_PTR(-EAGAIN); - if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))) + if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)) return ERR_PTR(-EAGAIN); } } return NULL; } -static const char *walk_component(struct nameidata *nd, int flags) +static __always_inline const char *walk_component(struct nameidata *nd, int flags) { struct dentry *dentry; - struct inode *inode; - unsigned seq; /* * "." and ".." are special - ".." especially so because it has * to be able to know about the current root directory and * parent relationships. */ if (unlikely(nd->last_type != LAST_NORM)) { - if (!(flags & WALK_MORE) && nd->depth) + if (unlikely(nd->depth) && !(flags & WALK_MORE)) put_link(nd); return handle_dots(nd, nd->last_type); } - dentry = lookup_fast(nd, &inode, &seq); + dentry = lookup_fast(nd); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (unlikely(!dentry)) { @@ -2021,9 +2222,9 @@ static const char *walk_component(struct nameidata *nd, int flags) if (IS_ERR(dentry)) return ERR_CAST(dentry); } - if (!(flags & WALK_MORE) && nd->depth) + if (unlikely(nd->depth) && !(flags & WALK_MORE)) put_link(nd); - return step_into(nd, flags, dentry, inode, seq); + return step_into(nd, flags, dentry); } /* @@ -2179,21 +2380,39 @@ EXPORT_SYMBOL(hashlen_string); /* * Calculate the length and hash of the path component, and - * return the "hash_len" as the result. + * return the length as the result. */ -static inline u64 hash_name(const void *salt, const char *name) +static inline const char *hash_name(struct nameidata *nd, + const char *name, + unsigned long *lastword) { - unsigned long a = 0, b, x = 0, y = (unsigned long)salt; + unsigned long a, b, x, y = (unsigned long)nd->path.dentry; unsigned long adata, bdata, mask, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; - len = 0; - goto inside; + /* + * The first iteration is special, because it can result in + * '.' and '..' and has no mixing other than the final fold. + */ + a = load_unaligned_zeropad(name); + b = a ^ REPEAT_BYTE('/'); + if (has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)) { + adata = prep_zero_mask(a, adata, &constants); + bdata = prep_zero_mask(b, bdata, &constants); + mask = create_zero_mask(adata | bdata); + a &= zero_bytemask(mask); + *lastword = a; + len = find_zero(mask); + nd->last.hash = fold_hash(a, y); + nd->last.len = len; + return name + len; + } + len = 0; + x = 0; do { HASH_MIX(x, y, a); len += sizeof(unsigned long); -inside: a = load_unaligned_zeropad(name+len); b = a ^ REPEAT_BYTE('/'); } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants))); @@ -2201,11 +2420,25 @@ inside: adata = prep_zero_mask(a, adata, &constants); bdata = prep_zero_mask(b, bdata, &constants); mask = create_zero_mask(adata | bdata); - x ^= a & zero_bytemask(mask); + a &= zero_bytemask(mask); + x ^= a; + len += find_zero(mask); + *lastword = 0; // Multi-word components cannot be DOT or DOTDOT - return hashlen_create(fold_hash(x, y), len + find_zero(mask)); + nd->last.hash = fold_hash(x, y); + nd->last.len = len; + return name + len; } +/* + * Note that the 'last' word is always zero-masked, but + * was loaded as a possibly big-endian word. + */ +#ifdef __BIG_ENDIAN + #define LAST_WORD_IS_DOT (0x2eul << (BITS_PER_LONG-8)) + #define LAST_WORD_IS_DOTDOT (0x2e2eul << (BITS_PER_LONG-16)) +#endif + #else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */ /* Return the hash of a string of known length */ @@ -2238,22 +2471,35 @@ EXPORT_SYMBOL(hashlen_string); * We know there's a real path component here of at least * one character. */ -static inline u64 hash_name(const void *salt, const char *name) +static inline const char *hash_name(struct nameidata *nd, const char *name, unsigned long *lastword) { - unsigned long hash = init_name_hash(salt); - unsigned long len = 0, c; + unsigned long hash = init_name_hash(nd->path.dentry); + unsigned long len = 0, c, last = 0; c = (unsigned char)*name; do { + last = (last << 8) + c; len++; hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } while (c && c != '/'); - return hashlen_create(end_name_hash(hash), len); + + // This is reliable for DOT or DOTDOT, since the component + // cannot contain NUL characters - top bits being zero means + // we cannot have had any other pathnames. + *lastword = last; + nd->last.hash = end_name_hash(hash); + nd->last.len = len; + return name + len; } #endif +#ifndef LAST_WORD_IS_DOT + #define LAST_WORD_IS_DOT 0x2e + #define LAST_WORD_IS_DOTDOT 0x2e2e +#endif + /* * Name resolution. * This is the basic name resolution function, turning a pathname into @@ -2271,56 +2517,52 @@ static int link_path_walk(const char *name, struct nameidata *nd) nd->flags |= LOOKUP_PARENT; if (IS_ERR(name)) return PTR_ERR(name); - while (*name=='/') - name++; - if (!*name) { + if (*name == '/') { + do { + name++; + } while (unlikely(*name == '/')); + } + if (unlikely(!*name)) { nd->dir_mode = 0; // short-circuit the 'hardening' idiocy return 0; } /* At this point we know we have a real path component. */ for(;;) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; const char *link; - u64 hash_len; - int type; + unsigned long lastword; - mnt_userns = mnt_user_ns(nd->path.mnt); - err = may_lookup(mnt_userns, nd); - if (err) + idmap = mnt_idmap(nd->path.mnt); + err = may_lookup(idmap, nd); + if (unlikely(err)) return err; - hash_len = hash_name(nd->path.dentry, name); + nd->last.name = name; + name = hash_name(nd, name, &lastword); - type = LAST_NORM; - if (name[0] == '.') switch (hashlen_len(hash_len)) { - case 2: - if (name[1] == '.') { - type = LAST_DOTDOT; - nd->state |= ND_JUMPED; - } - break; - case 1: - type = LAST_DOT; - } - if (likely(type == LAST_NORM)) { - struct dentry *parent = nd->path.dentry; + switch(lastword) { + case LAST_WORD_IS_DOTDOT: + nd->last_type = LAST_DOTDOT; + nd->state |= ND_JUMPED; + break; + + case LAST_WORD_IS_DOT: + nd->last_type = LAST_DOT; + break; + + default: + nd->last_type = LAST_NORM; nd->state &= ~ND_JUMPED; + + struct dentry *parent = nd->path.dentry; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { - struct qstr this = { { .hash_len = hash_len }, .name = name }; - err = parent->d_op->d_hash(parent, &this); + err = parent->d_op->d_hash(parent, &nd->last); if (err < 0) return err; - hash_len = this.hash_len; - name = this.name; } } - nd->last.hash_len = hash_len; - nd->last.name = name; - nd->last_type = type; - - name += hashlen_len(hash_len); if (!*name) goto OK; /* @@ -2333,8 +2575,8 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (unlikely(!*name)) { OK: /* pathname or trailing symlink, done */ - if (!depth) { - nd->dir_uid = i_uid_into_mnt(mnt_userns, nd->inode); + if (likely(!depth)) { + nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode); nd->dir_mode = nd->inode->i_mode; nd->flags &= ~LOOKUP_PARENT; return 0; @@ -2368,16 +2610,18 @@ OK: static const char *path_init(struct nameidata *nd, unsigned flags) { int error; - const char *s = nd->name->name; + const char *s = nd->pathname; /* LOOKUP_CACHED requires RCU, ask caller to retry */ - if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED) + if (unlikely((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)) return ERR_PTR(-EAGAIN); - if (!*s) + if (unlikely(!*s)) flags &= ~LOOKUP_RCU; if (flags & LOOKUP_RCU) rcu_read_lock(); + else + nd->seq = nd->next_seq = 0; nd->flags = flags; nd->state |= ND_JUMPED; @@ -2386,7 +2630,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount); smp_rmb(); - if (nd->state & ND_ROOT_PRESET) { + if (unlikely(nd->state & ND_ROOT_PRESET)) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; if (*s && unlikely(!d_can_lookup(root))) @@ -2405,7 +2649,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->root.mnt = NULL; /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */ - if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) { + if (*s == '/' && likely(!(flags & LOOKUP_IN_ROOT))) { error = nd_jump_root(nd); if (unlikely(error)) return ERR_PTR(error); @@ -2419,31 +2663,35 @@ static const char *path_init(struct nameidata *nd, unsigned flags) unsigned seq; do { - seq = read_seqcount_begin(&fs->seq); + seq = read_seqbegin(&fs->seq); nd->path = fs->pwd; nd->inode = nd->path.dentry->d_inode; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); + } while (read_seqretry(&fs->seq, seq)); } else { get_fs_pwd(current->fs, &nd->path); nd->inode = nd->path.dentry->d_inode; } } else { /* Caller must check execute permissions on the starting path component */ - struct fd f = fdget_raw(nd->dfd); + CLASS(fd_raw, f)(nd->dfd); struct dentry *dentry; - if (!f.file) + if (fd_empty(f)) return ERR_PTR(-EBADF); - dentry = f.file->f_path.dentry; + if (flags & LOOKUP_LINKAT_EMPTY) { + if (fd_file(f)->f_cred != current_cred() && + !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH)) + return ERR_PTR(-ENOENT); + } + + dentry = fd_file(f)->f_path.dentry; - if (*s && unlikely(!d_can_lookup(dentry))) { - fdput(f); + if (*s && unlikely(!d_can_lookup(dentry))) return ERR_PTR(-ENOTDIR); - } - nd->path = f.file->f_path; + nd->path = fd_file(f)->f_path; if (flags & LOOKUP_RCU) { nd->inode = nd->path.dentry->d_inode; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); @@ -2451,11 +2699,10 @@ static const char *path_init(struct nameidata *nd, unsigned flags) path_get(&nd->path); nd->inode = nd->path.dentry->d_inode; } - fdput(f); } /* For scoped-lookups we need to set the root to the dirfd as well. */ - if (flags & LOOKUP_IS_SCOPED) { + if (unlikely(flags & LOOKUP_IS_SCOPED)) { nd->root = nd->path; if (flags & LOOKUP_RCU) { nd->root_seq = nd->seq; @@ -2479,11 +2726,11 @@ static int handle_lookup_down(struct nameidata *nd) { if (!(nd->flags & LOOKUP_RCU)) dget(nd->path.dentry); - return PTR_ERR(step_into(nd, WALK_NOFOLLOW, - nd->path.dentry, nd->inode, nd->seq)); + nd->next_seq = nd->seq; + return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry)); } -/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ +/* Returns 0 and nd will be valid on success; Returns error, otherwise. */ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path) { const char *s = path_init(nd, flags); @@ -2518,7 +2765,7 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path } int filename_lookup(int dfd, struct filename *name, unsigned flags, - struct path *path, struct path *root) + struct path *path, const struct path *root) { int retval; struct nameidata nd; @@ -2538,7 +2785,7 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags, return retval; } -/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ +/* Returns 0 and nd will be valid on success; Returns error, otherwise. */ static int path_parentat(struct nameidata *nd, unsigned flags, struct path *parent) { @@ -2556,16 +2803,17 @@ static int path_parentat(struct nameidata *nd, unsigned flags, } /* Note: this does not consume "name" */ -static int filename_parentat(int dfd, struct filename *name, - unsigned int flags, struct path *parent, - struct qstr *last, int *type) +static int __filename_parentat(int dfd, struct filename *name, + unsigned int flags, struct path *parent, + struct qstr *last, int *type, + const struct path *root) { int retval; struct nameidata nd; if (IS_ERR(name)) return PTR_ERR(name); - set_nameidata(&nd, dfd, name, NULL); + set_nameidata(&nd, dfd, name, root); retval = path_parentat(&nd, flags | LOOKUP_RCU, parent); if (unlikely(retval == -ECHILD)) retval = path_parentat(&nd, flags, parent); @@ -2580,37 +2828,158 @@ static int filename_parentat(int dfd, struct filename *name, return retval; } +static int filename_parentat(int dfd, struct filename *name, + unsigned int flags, struct path *parent, + struct qstr *last, int *type) +{ + return __filename_parentat(dfd, name, flags, parent, last, type, NULL); +} + +/** + * start_dirop - begin a create or remove dirop, performing locking and lookup + * @parent: the dentry of the parent in which the operation will occur + * @name: a qstr holding the name within that parent + * @lookup_flags: intent and other lookup flags. + * + * The lookup is performed and necessary locks are taken so that, on success, + * the returned dentry can be operated on safely. + * The qstr must already have the hash value calculated. + * + * Returns: a locked dentry, or an error. + * + */ +static struct dentry *__start_dirop(struct dentry *parent, struct qstr *name, + unsigned int lookup_flags, + unsigned int state) +{ + struct dentry *dentry; + struct inode *dir = d_inode(parent); + + if (state == TASK_KILLABLE) { + int ret = down_write_killable_nested(&dir->i_rwsem, + I_MUTEX_PARENT); + if (ret) + return ERR_PTR(ret); + } else { + inode_lock_nested(dir, I_MUTEX_PARENT); + } + dentry = lookup_one_qstr_excl(name, parent, lookup_flags); + if (IS_ERR(dentry)) + inode_unlock(dir); + return dentry; +} + +struct dentry *start_dirop(struct dentry *parent, struct qstr *name, + unsigned int lookup_flags) +{ + return __start_dirop(parent, name, lookup_flags, TASK_NORMAL); +} + +/** + * end_dirop - signal completion of a dirop + * @de: the dentry which was returned by start_dirop or similar. + * + * If the de is an error, nothing happens. Otherwise any lock taken to + * protect the dentry is dropped and the dentry itself is release (dput()). + */ +void end_dirop(struct dentry *de) +{ + if (!IS_ERR(de)) { + inode_unlock(de->d_parent->d_inode); + dput(de); + } +} +EXPORT_SYMBOL(end_dirop); + /* does lookup, returns the object with parent locked */ -static struct dentry *__kern_path_locked(struct filename *name, struct path *path) +static struct dentry *__start_removing_path(int dfd, struct filename *name, + struct path *path) { + struct path parent_path __free(path_put) = {}; struct dentry *d; struct qstr last; int type, error; - error = filename_parentat(AT_FDCWD, name, 0, path, &last, &type); + error = filename_parentat(dfd, name, 0, &parent_path, &last, &type); if (error) return ERR_PTR(error); - if (unlikely(type != LAST_NORM)) { - path_put(path); + if (unlikely(type != LAST_NORM)) return ERR_PTR(-EINVAL); - } - inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); - d = __lookup_hash(&last, path->dentry, 0); - if (IS_ERR(d)) { - inode_unlock(path->dentry->d_inode); - path_put(path); - } + /* don't fail immediately if it's r/o, at least try to report other errors */ + error = mnt_want_write(parent_path.mnt); + d = start_dirop(parent_path.dentry, &last, 0); + if (IS_ERR(d)) + goto drop; + if (error) + goto fail; + path->dentry = no_free_ptr(parent_path.dentry); + path->mnt = no_free_ptr(parent_path.mnt); + return d; + +fail: + end_dirop(d); + d = ERR_PTR(error); +drop: + if (!error) + mnt_drop_write(parent_path.mnt); return d; } -struct dentry *kern_path_locked(const char *name, struct path *path) +/** + * kern_path_parent: lookup path returning parent and target + * @name: path name + * @path: path to store parent in + * + * The path @name should end with a normal component, not "." or ".." or "/". + * A lookup is performed and if successful the parent information + * is store in @parent and the dentry is returned. + * + * The dentry maybe negative, the parent will be positive. + * + * Returns: dentry or error. + */ +struct dentry *kern_path_parent(const char *name, struct path *path) +{ + struct path parent_path __free(path_put) = {}; + struct filename *filename __free(putname) = getname_kernel(name); + struct dentry *d; + struct qstr last; + int type, error; + + error = filename_parentat(AT_FDCWD, filename, 0, &parent_path, &last, &type); + if (error) + return ERR_PTR(error); + if (unlikely(type != LAST_NORM)) + return ERR_PTR(-EINVAL); + + d = lookup_noperm_unlocked(&last, parent_path.dentry); + if (IS_ERR(d)) + return d; + path->dentry = no_free_ptr(parent_path.dentry); + path->mnt = no_free_ptr(parent_path.mnt); + return d; +} + +struct dentry *start_removing_path(const char *name, struct path *path) { struct filename *filename = getname_kernel(name); - struct dentry *res = __kern_path_locked(filename, path); + struct dentry *res = __start_removing_path(AT_FDCWD, filename, path); + + putname(filename); + return res; +} + +struct dentry *start_removing_user_path_at(int dfd, + const char __user *name, + struct path *path) +{ + struct filename *filename = getname(name); + struct dentry *res = __start_removing_path(dfd, filename, path); putname(filename); return res; } +EXPORT_SYMBOL(start_removing_user_path_at); int kern_path(const char *name, unsigned int flags, struct path *path) { @@ -2624,6 +2993,24 @@ int kern_path(const char *name, unsigned int flags, struct path *path) EXPORT_SYMBOL(kern_path); /** + * vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair + * @filename: filename structure + * @flags: lookup flags + * @parent: pointer to struct path to fill + * @last: last component + * @type: type of the last component + * @root: pointer to struct path of the base directory + */ +int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, + struct path *parent, struct qstr *last, int *type, + const struct path *root) +{ + return __filename_parentat(AT_FDCWD, filename, flags, parent, last, + type, root); +} +EXPORT_SYMBOL(vfs_path_parent_lookup); + +/** * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair * @dentry: pointer to dentry of the base directory * @mnt: pointer to vfs mount of the base directory @@ -2647,20 +3034,17 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, } EXPORT_SYMBOL(vfs_path_lookup); -static int lookup_one_common(struct user_namespace *mnt_userns, - const char *name, struct dentry *base, int len, - struct qstr *this) +int lookup_noperm_common(struct qstr *qname, struct dentry *base) { - this->name = name; - this->len = len; - this->hash = full_name_hash(base, name, len); + const char *name = qname->name; + u32 len = qname->len; + + qname->hash = full_name_hash(base, name, len); if (!len) return -EACCES; - if (unlikely(name[0] == '.')) { - if (len < 2 || (len == 2 && name[1] == '.')) - return -EACCES; - } + if (is_dot_dotdot(name, len)) + return -EACCES; while (len--) { unsigned int c = *(const unsigned char *)name++; @@ -2672,150 +3056,482 @@ static int lookup_one_common(struct user_namespace *mnt_userns, * to use its own hash.. */ if (base->d_flags & DCACHE_OP_HASH) { - int err = base->d_op->d_hash(base, this); + int err = base->d_op->d_hash(base, qname); if (err < 0) return err; } + return 0; +} - return inode_permission(mnt_userns, base->d_inode, MAY_EXEC); +static int lookup_one_common(struct mnt_idmap *idmap, + struct qstr *qname, struct dentry *base) +{ + int err; + err = lookup_noperm_common(qname, base); + if (err < 0) + return err; + return inode_permission(idmap, base->d_inode, MAY_EXEC); } /** - * try_lookup_one_len - filesystem helper to lookup single pathname component - * @name: pathname component to lookup + * try_lookup_noperm - filesystem helper to lookup single pathname component + * @name: qstr storing pathname component to lookup * @base: base directory to lookup from - * @len: maximum length @len should be interpreted to * * Look up a dentry by name in the dcache, returning NULL if it does not - * currently exist. The function does not try to create a dentry. + * currently exist. The function does not try to create a dentry and if one + * is found it doesn't try to revalidate it. * * Note that this routine is purely a helper for filesystem usage and should - * not be called by generic code. + * not be called by generic code. It does no permission checking. + * + * No locks need be held - only a counted reference to @base is needed. * - * The caller must hold base->i_mutex. */ -struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len) +struct dentry *try_lookup_noperm(struct qstr *name, struct dentry *base) { - struct qstr this; int err; - WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - - err = lookup_one_common(&init_user_ns, name, base, len, &this); + err = lookup_noperm_common(name, base); if (err) return ERR_PTR(err); - return lookup_dcache(&this, base, 0); + return d_lookup(base, name); } -EXPORT_SYMBOL(try_lookup_one_len); +EXPORT_SYMBOL(try_lookup_noperm); /** - * lookup_one_len - filesystem helper to lookup single pathname component - * @name: pathname component to lookup + * lookup_noperm - filesystem helper to lookup single pathname component + * @name: qstr storing pathname component to lookup * @base: base directory to lookup from - * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should - * not be called by generic code. + * not be called by generic code. It does no permission checking. * - * The caller must hold base->i_mutex. + * The caller must hold base->i_rwsem. */ -struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) +struct dentry *lookup_noperm(struct qstr *name, struct dentry *base) { struct dentry *dentry; - struct qstr this; int err; WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - err = lookup_one_common(&init_user_ns, name, base, len, &this); + err = lookup_noperm_common(name, base); if (err) return ERR_PTR(err); - dentry = lookup_dcache(&this, base, 0); - return dentry ? dentry : __lookup_slow(&this, base, 0); + dentry = lookup_dcache(name, base, 0); + return dentry ? dentry : __lookup_slow(name, base, 0); } -EXPORT_SYMBOL(lookup_one_len); +EXPORT_SYMBOL(lookup_noperm); /** - * lookup_one - filesystem helper to lookup single pathname component - * @mnt_userns: user namespace of the mount the lookup is performed from - * @name: pathname component to lookup + * lookup_one - lookup single pathname component + * @idmap: idmap of the mount the lookup is performed from + * @name: qstr holding pathname component to lookup * @base: base directory to lookup from - * @len: maximum length @len should be interpreted to * - * Note that this routine is purely a helper for filesystem usage and should - * not be called by generic code. + * This can be used for in-kernel filesystem clients such as file servers. * - * The caller must hold base->i_mutex. + * The caller must hold base->i_rwsem. */ -struct dentry *lookup_one(struct user_namespace *mnt_userns, const char *name, - struct dentry *base, int len) +struct dentry *lookup_one(struct mnt_idmap *idmap, struct qstr *name, + struct dentry *base) { struct dentry *dentry; - struct qstr this; int err; WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - err = lookup_one_common(mnt_userns, name, base, len, &this); + err = lookup_one_common(idmap, name, base); if (err) return ERR_PTR(err); - dentry = lookup_dcache(&this, base, 0); - return dentry ? dentry : __lookup_slow(&this, base, 0); + dentry = lookup_dcache(name, base, 0); + return dentry ? dentry : __lookup_slow(name, base, 0); } EXPORT_SYMBOL(lookup_one); /** - * lookup_one_len_unlocked - filesystem helper to lookup single pathname component + * lookup_one_unlocked - lookup single pathname component + * @idmap: idmap of the mount the lookup is performed from + * @name: qstr olding pathname component to lookup + * @base: base directory to lookup from + * + * This can be used for in-kernel filesystem clients such as file servers. + * + * Unlike lookup_one, it should be called without the parent + * i_rwsem held, and will take the i_rwsem itself if necessary. + */ +struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct qstr *name, + struct dentry *base) +{ + int err; + struct dentry *ret; + + err = lookup_one_common(idmap, name, base); + if (err) + return ERR_PTR(err); + + ret = lookup_dcache(name, base, 0); + if (!ret) + ret = lookup_slow(name, base, 0); + return ret; +} +EXPORT_SYMBOL(lookup_one_unlocked); + +/** + * lookup_one_positive_killable - lookup single pathname component + * @idmap: idmap of the mount the lookup is performed from + * @name: qstr olding pathname component to lookup + * @base: base directory to lookup from + * + * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns + * known positive or ERR_PTR(). This is what most of the users want. + * + * Note that pinned negative with unlocked parent _can_ become positive at any + * time, so callers of lookup_one_unlocked() need to be very careful; pinned + * positives have >d_inode stable, so this one avoids such problems. + * + * This can be used for in-kernel filesystem clients such as file servers. + * + * It should be called without the parent i_rwsem held, and will take + * the i_rwsem itself if necessary. If a fatal signal is pending or + * delivered, it will return %-EINTR if the lock is needed. + */ +struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap, + struct qstr *name, + struct dentry *base) +{ + int err; + struct dentry *ret; + + err = lookup_one_common(idmap, name, base); + if (err) + return ERR_PTR(err); + + ret = lookup_dcache(name, base, 0); + if (!ret) + ret = lookup_slow_killable(name, base, 0); + if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { + dput(ret); + ret = ERR_PTR(-ENOENT); + } + return ret; +} +EXPORT_SYMBOL(lookup_one_positive_killable); + +/** + * lookup_one_positive_unlocked - lookup single pathname component + * @idmap: idmap of the mount the lookup is performed from + * @name: qstr holding pathname component to lookup + * @base: base directory to lookup from + * + * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns + * known positive or ERR_PTR(). This is what most of the users want. + * + * Note that pinned negative with unlocked parent _can_ become positive at any + * time, so callers of lookup_one_unlocked() need to be very careful; pinned + * positives have >d_inode stable, so this one avoids such problems. + * + * This can be used for in-kernel filesystem clients such as file servers. + * + * The helper should be called without i_rwsem held. + */ +struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, + struct qstr *name, + struct dentry *base) +{ + struct dentry *ret = lookup_one_unlocked(idmap, name, base); + + if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { + dput(ret); + ret = ERR_PTR(-ENOENT); + } + return ret; +} +EXPORT_SYMBOL(lookup_one_positive_unlocked); + +/** + * lookup_noperm_unlocked - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from - * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should - * not be called by generic code. + * not be called by generic code. It does no permission checking. + * + * Unlike lookup_noperm(), it should be called without the parent + * i_rwsem held, and will take the i_rwsem itself if necessary. * - * Unlike lookup_one_len, it should be called without the parent - * i_mutex held, and will take the i_mutex itself if necessary. + * Unlike try_lookup_noperm() it *does* revalidate the dentry if it already + * existed. */ -struct dentry *lookup_one_len_unlocked(const char *name, - struct dentry *base, int len) +struct dentry *lookup_noperm_unlocked(struct qstr *name, struct dentry *base) { - struct qstr this; - int err; struct dentry *ret; + int err; - err = lookup_one_common(&init_user_ns, name, base, len, &this); + err = lookup_noperm_common(name, base); if (err) return ERR_PTR(err); - ret = lookup_dcache(&this, base, 0); + ret = lookup_dcache(name, base, 0); if (!ret) - ret = lookup_slow(&this, base, 0); + ret = lookup_slow(name, base, 0); return ret; } -EXPORT_SYMBOL(lookup_one_len_unlocked); +EXPORT_SYMBOL(lookup_noperm_unlocked); /* - * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT) + * Like lookup_noperm_unlocked(), except that it yields ERR_PTR(-ENOENT) * on negatives. Returns known positive or ERR_PTR(); that's what * most of the users want. Note that pinned negative with unlocked parent - * _can_ become positive at any time, so callers of lookup_one_len_unlocked() + * _can_ become positive at any time, so callers of lookup_noperm_unlocked() * need to be very careful; pinned positives have ->d_inode stable, so * this one avoids such problems. */ -struct dentry *lookup_positive_unlocked(const char *name, - struct dentry *base, int len) +struct dentry *lookup_noperm_positive_unlocked(struct qstr *name, + struct dentry *base) { - struct dentry *ret = lookup_one_len_unlocked(name, base, len); + struct dentry *ret; + + ret = lookup_noperm_unlocked(name, base); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { dput(ret); ret = ERR_PTR(-ENOENT); } return ret; } -EXPORT_SYMBOL(lookup_positive_unlocked); +EXPORT_SYMBOL(lookup_noperm_positive_unlocked); + +/** + * start_creating - prepare to create a given name with permission checking + * @idmap: idmap of the mount + * @parent: directory in which to prepare to create the name + * @name: the name to be created + * + * Locks are taken and a lookup is performed prior to creating + * an object in a directory. Permission checking (MAY_EXEC) is performed + * against @idmap. + * + * If the name already exists, a positive dentry is returned, so + * behaviour is similar to O_CREAT without O_EXCL, which doesn't fail + * with -EEXIST. + * + * Returns: a negative or positive dentry, or an error. + */ +struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent, + struct qstr *name) +{ + int err = lookup_one_common(idmap, name, parent); + + if (err) + return ERR_PTR(err); + return start_dirop(parent, name, LOOKUP_CREATE); +} +EXPORT_SYMBOL(start_creating); + +/** + * start_removing - prepare to remove a given name with permission checking + * @idmap: idmap of the mount + * @parent: directory in which to find the name + * @name: the name to be removed + * + * Locks are taken and a lookup in performed prior to removing + * an object from a directory. Permission checking (MAY_EXEC) is performed + * against @idmap. + * + * If the name doesn't exist, an error is returned. + * + * end_removing() should be called when removal is complete, or aborted. + * + * Returns: a positive dentry, or an error. + */ +struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent, + struct qstr *name) +{ + int err = lookup_one_common(idmap, name, parent); + + if (err) + return ERR_PTR(err); + return start_dirop(parent, name, 0); +} +EXPORT_SYMBOL(start_removing); + +/** + * start_creating_killable - prepare to create a given name with permission checking + * @idmap: idmap of the mount + * @parent: directory in which to prepare to create the name + * @name: the name to be created + * + * Locks are taken and a lookup in performed prior to creating + * an object in a directory. Permission checking (MAY_EXEC) is performed + * against @idmap. + * + * If the name already exists, a positive dentry is returned. + * + * If a signal is received or was already pending, the function aborts + * with -EINTR; + * + * Returns: a negative or positive dentry, or an error. + */ +struct dentry *start_creating_killable(struct mnt_idmap *idmap, + struct dentry *parent, + struct qstr *name) +{ + int err = lookup_one_common(idmap, name, parent); + + if (err) + return ERR_PTR(err); + return __start_dirop(parent, name, LOOKUP_CREATE, TASK_KILLABLE); +} +EXPORT_SYMBOL(start_creating_killable); + +/** + * start_removing_killable - prepare to remove a given name with permission checking + * @idmap: idmap of the mount + * @parent: directory in which to find the name + * @name: the name to be removed + * + * Locks are taken and a lookup in performed prior to removing + * an object from a directory. Permission checking (MAY_EXEC) is performed + * against @idmap. + * + * If the name doesn't exist, an error is returned. + * + * end_removing() should be called when removal is complete, or aborted. + * + * If a signal is received or was already pending, the function aborts + * with -EINTR; + * + * Returns: a positive dentry, or an error. + */ +struct dentry *start_removing_killable(struct mnt_idmap *idmap, + struct dentry *parent, + struct qstr *name) +{ + int err = lookup_one_common(idmap, name, parent); + + if (err) + return ERR_PTR(err); + return __start_dirop(parent, name, 0, TASK_KILLABLE); +} +EXPORT_SYMBOL(start_removing_killable); + +/** + * start_creating_noperm - prepare to create a given name without permission checking + * @parent: directory in which to prepare to create the name + * @name: the name to be created + * + * Locks are taken and a lookup in performed prior to creating + * an object in a directory. + * + * If the name already exists, a positive dentry is returned. + * + * Returns: a negative or positive dentry, or an error. + */ +struct dentry *start_creating_noperm(struct dentry *parent, + struct qstr *name) +{ + int err = lookup_noperm_common(name, parent); + + if (err) + return ERR_PTR(err); + return start_dirop(parent, name, LOOKUP_CREATE); +} +EXPORT_SYMBOL(start_creating_noperm); + +/** + * start_removing_noperm - prepare to remove a given name without permission checking + * @parent: directory in which to find the name + * @name: the name to be removed + * + * Locks are taken and a lookup in performed prior to removing + * an object from a directory. + * + * If the name doesn't exist, an error is returned. + * + * end_removing() should be called when removal is complete, or aborted. + * + * Returns: a positive dentry, or an error. + */ +struct dentry *start_removing_noperm(struct dentry *parent, + struct qstr *name) +{ + int err = lookup_noperm_common(name, parent); + + if (err) + return ERR_PTR(err); + return start_dirop(parent, name, 0); +} +EXPORT_SYMBOL(start_removing_noperm); + +/** + * start_creating_dentry - prepare to create a given dentry + * @parent: directory from which dentry should be removed + * @child: the dentry to be removed + * + * A lock is taken to protect the dentry again other dirops and + * the validity of the dentry is checked: correct parent and still hashed. + * + * If the dentry is valid and negative a reference is taken and + * returned. If not an error is returned. + * + * end_creating() should be called when creation is complete, or aborted. + * + * Returns: the valid dentry, or an error. + */ +struct dentry *start_creating_dentry(struct dentry *parent, + struct dentry *child) +{ + inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); + if (unlikely(IS_DEADDIR(parent->d_inode) || + child->d_parent != parent || + d_unhashed(child))) { + inode_unlock(parent->d_inode); + return ERR_PTR(-EINVAL); + } + if (d_is_positive(child)) { + inode_unlock(parent->d_inode); + return ERR_PTR(-EEXIST); + } + return dget(child); +} +EXPORT_SYMBOL(start_creating_dentry); + +/** + * start_removing_dentry - prepare to remove a given dentry + * @parent: directory from which dentry should be removed + * @child: the dentry to be removed + * + * A lock is taken to protect the dentry again other dirops and + * the validity of the dentry is checked: correct parent and still hashed. + * + * If the dentry is valid and positive, a reference is taken and + * returned. If not an error is returned. + * + * end_removing() should be called when removal is complete, or aborted. + * + * Returns: the valid dentry, or an error. + */ +struct dentry *start_removing_dentry(struct dentry *parent, + struct dentry *child) +{ + inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); + if (unlikely(IS_DEADDIR(parent->d_inode) || + child->d_parent != parent || + d_unhashed(child))) { + inode_unlock(parent->d_inode); + return ERR_PTR(-EINVAL); + } + if (d_is_negative(child)) { + inode_unlock(parent->d_inode); + return ERR_PTR(-ENOENT); + } + return dget(child); +} +EXPORT_SYMBOL(start_removing_dentry); #ifdef CONFIG_UNIX98_PTYS int path_pts(struct path *path) @@ -2834,37 +3550,37 @@ int path_pts(struct path *path) dput(path->dentry); path->dentry = parent; child = d_hash_and_lookup(parent, &this); - if (!child) + if (IS_ERR_OR_NULL(child)) return -ENOENT; path->dentry = child; dput(parent); - follow_down(path); + follow_down(path, 0); return 0; } #endif -int user_path_at_empty(int dfd, const char __user *name, unsigned flags, - struct path *path, int *empty) +int user_path_at(int dfd, const char __user *name, unsigned flags, + struct path *path) { - struct filename *filename = getname_flags(name, flags, empty); + struct filename *filename = getname_flags(name, flags); int ret = filename_lookup(dfd, filename, flags, path, NULL); putname(filename); return ret; } -EXPORT_SYMBOL(user_path_at_empty); +EXPORT_SYMBOL(user_path_at); -int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir, +int __check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode) { kuid_t fsuid = current_fsuid(); - if (uid_eq(i_uid_into_mnt(mnt_userns, inode), fsuid)) + if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), fsuid)) return 0; - if (uid_eq(i_uid_into_mnt(mnt_userns, dir), fsuid)) + if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, dir), fsuid)) return 0; - return !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FOWNER); + return !capable_wrt_inode_uidgid(idmap, inode, CAP_FOWNER); } EXPORT_SYMBOL(__check_sticky); @@ -2888,7 +3604,7 @@ EXPORT_SYMBOL(__check_sticky); * 11. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ -static int may_delete(struct user_namespace *mnt_userns, struct inode *dir, +static int may_delete(struct mnt_idmap *idmap, struct inode *dir, struct dentry *victim, bool isdir) { struct inode *inode = d_backing_inode(victim); @@ -2901,21 +3617,21 @@ static int may_delete(struct user_namespace *mnt_userns, struct inode *dir, BUG_ON(victim->d_parent->d_inode != dir); /* Inode writeback is not safe when the uid or gid are invalid. */ - if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) || - !gid_valid(i_gid_into_mnt(mnt_userns, inode))) + if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || + !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); - error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); + error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) return -EPERM; - if (check_sticky(mnt_userns, dir, inode) || IS_APPEND(inode) || + if (check_sticky(idmap, dir, inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || - HAS_UNMAPPED_ID(mnt_userns, inode)) + HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; if (isdir) { if (!d_is_dir(victim)) @@ -2940,7 +3656,7 @@ static int may_delete(struct user_namespace *mnt_userns, struct inode *dir, * 4. We should have write and exec permissions on dir * 5. We can't do it if dir is immutable (done in permission()) */ -static inline int may_create(struct user_namespace *mnt_userns, +static inline int may_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *child) { audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); @@ -2948,10 +3664,43 @@ static inline int may_create(struct user_namespace *mnt_userns, return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns)) + if (!fsuidgid_has_mapping(dir->i_sb, idmap)) return -EOVERFLOW; - return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); + return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); +} + +// p1 != p2, both are on the same filesystem, ->s_vfs_rename_mutex is held +static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2) +{ + struct dentry *p = p1, *q = p2, *r; + + while ((r = p->d_parent) != p2 && r != p) + p = r; + if (r == p2) { + // p is a child of p2 and an ancestor of p1 or p1 itself + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT2); + return p; + } + // p is the root of connected component that contains p1 + // p2 does not occur on the path from p to p1 + while ((r = q->d_parent) != p1 && r != p && r != q) + q = r; + if (r == p1) { + // q is a child of p1 and an ancestor of p2 or p2 itself + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); + return q; + } else if (likely(r == p)) { + // both p2 and p1 are descendents of p + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); + return NULL; + } else { // no common ancestor at the time we'd been called + mutex_unlock(&p1->d_sb->s_vfs_rename_mutex); + return ERR_PTR(-EXDEV); + } } /* @@ -2959,34 +3708,57 @@ static inline int may_create(struct user_namespace *mnt_userns, */ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) { - struct dentry *p; - if (p1 == p2) { inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); return NULL; } mutex_lock(&p1->d_sb->s_vfs_rename_mutex); + return lock_two_directories(p1, p2); +} +EXPORT_SYMBOL(lock_rename); - p = d_ancestor(p2, p1); - if (p) { +/* + * c1 and p2 should be on the same fs. + */ +struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2) +{ + if (READ_ONCE(c1->d_parent) == p2) { + /* + * hopefully won't need to touch ->s_vfs_rename_mutex at all. + */ inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); - inode_lock_nested(p1->d_inode, I_MUTEX_CHILD); - return p; - } + /* + * now that p2 is locked, nobody can move in or out of it, + * so the test below is safe. + */ + if (likely(c1->d_parent == p2)) + return NULL; - p = d_ancestor(p1, p2); - if (p) { - inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); - inode_lock_nested(p2->d_inode, I_MUTEX_CHILD); - return p; + /* + * c1 got moved out of p2 while we'd been taking locks; + * unlock and fall back to slow case. + */ + inode_unlock(p2->d_inode); } - inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); - inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); + mutex_lock(&c1->d_sb->s_vfs_rename_mutex); + /* + * nobody can move out of any directories on this fs. + */ + if (likely(c1->d_parent != p2)) + return lock_two_directories(c1->d_parent, p2); + + /* + * c1 got moved into p2 while we were taking locks; + * we need p2 locked and ->s_vfs_rename_mutex unlocked, + * for consistency with lock_rename(). + */ + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); + mutex_unlock(&c1->d_sb->s_vfs_rename_mutex); return NULL; } -EXPORT_SYMBOL(lock_rename); +EXPORT_SYMBOL(lock_rename_child); void unlock_rename(struct dentry *p1, struct dentry *p2) { @@ -2999,36 +3771,365 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) EXPORT_SYMBOL(unlock_rename); /** + * __start_renaming - lookup and lock names for rename + * @rd: rename data containing parents and flags, and + * for receiving found dentries + * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL, + * LOOKUP_NO_SYMLINKS etc). + * @old_last: name of object in @rd.old_parent + * @new_last: name of object in @rd.new_parent + * + * Look up two names and ensure locks are in place for + * rename. + * + * On success the found dentries are stored in @rd.old_dentry, + * @rd.new_dentry and an extra ref is taken on @rd.old_parent. + * These references and the lock are dropped by end_renaming(). + * + * The passed in qstrs must have the hash calculated, and no permission + * checking is performed. + * + * Returns: zero or an error. + */ +static int +__start_renaming(struct renamedata *rd, int lookup_flags, + struct qstr *old_last, struct qstr *new_last) +{ + struct dentry *trap; + struct dentry *d1, *d2; + int target_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE; + int err; + + if (rd->flags & RENAME_EXCHANGE) + target_flags = 0; + if (rd->flags & RENAME_NOREPLACE) + target_flags |= LOOKUP_EXCL; + + trap = lock_rename(rd->old_parent, rd->new_parent); + if (IS_ERR(trap)) + return PTR_ERR(trap); + + d1 = lookup_one_qstr_excl(old_last, rd->old_parent, + lookup_flags); + err = PTR_ERR(d1); + if (IS_ERR(d1)) + goto out_unlock; + + d2 = lookup_one_qstr_excl(new_last, rd->new_parent, + lookup_flags | target_flags); + err = PTR_ERR(d2); + if (IS_ERR(d2)) + goto out_dput_d1; + + if (d1 == trap) { + /* source is an ancestor of target */ + err = -EINVAL; + goto out_dput_d2; + } + + if (d2 == trap) { + /* target is an ancestor of source */ + if (rd->flags & RENAME_EXCHANGE) + err = -EINVAL; + else + err = -ENOTEMPTY; + goto out_dput_d2; + } + + rd->old_dentry = d1; + rd->new_dentry = d2; + dget(rd->old_parent); + return 0; + +out_dput_d2: + dput(d2); +out_dput_d1: + dput(d1); +out_unlock: + unlock_rename(rd->old_parent, rd->new_parent); + return err; +} + +/** + * start_renaming - lookup and lock names for rename with permission checking + * @rd: rename data containing parents and flags, and + * for receiving found dentries + * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL, + * LOOKUP_NO_SYMLINKS etc). + * @old_last: name of object in @rd.old_parent + * @new_last: name of object in @rd.new_parent + * + * Look up two names and ensure locks are in place for + * rename. + * + * On success the found dentries are stored in @rd.old_dentry, + * @rd.new_dentry. Also the refcount on @rd->old_parent is increased. + * These references and the lock are dropped by end_renaming(). + * + * The passed in qstrs need not have the hash calculated, and basic + * eXecute permission checking is performed against @rd.mnt_idmap. + * + * Returns: zero or an error. + */ +int start_renaming(struct renamedata *rd, int lookup_flags, + struct qstr *old_last, struct qstr *new_last) +{ + int err; + + err = lookup_one_common(rd->mnt_idmap, old_last, rd->old_parent); + if (err) + return err; + err = lookup_one_common(rd->mnt_idmap, new_last, rd->new_parent); + if (err) + return err; + return __start_renaming(rd, lookup_flags, old_last, new_last); +} +EXPORT_SYMBOL(start_renaming); + +static int +__start_renaming_dentry(struct renamedata *rd, int lookup_flags, + struct dentry *old_dentry, struct qstr *new_last) +{ + struct dentry *trap; + struct dentry *d2; + int target_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE; + int err; + + if (rd->flags & RENAME_EXCHANGE) + target_flags = 0; + if (rd->flags & RENAME_NOREPLACE) + target_flags |= LOOKUP_EXCL; + + /* Already have the dentry - need to be sure to lock the correct parent */ + trap = lock_rename_child(old_dentry, rd->new_parent); + if (IS_ERR(trap)) + return PTR_ERR(trap); + if (d_unhashed(old_dentry) || + (rd->old_parent && rd->old_parent != old_dentry->d_parent)) { + /* dentry was removed, or moved and explicit parent requested */ + err = -EINVAL; + goto out_unlock; + } + + d2 = lookup_one_qstr_excl(new_last, rd->new_parent, + lookup_flags | target_flags); + err = PTR_ERR(d2); + if (IS_ERR(d2)) + goto out_unlock; + + if (old_dentry == trap) { + /* source is an ancestor of target */ + err = -EINVAL; + goto out_dput_d2; + } + + if (d2 == trap) { + /* target is an ancestor of source */ + if (rd->flags & RENAME_EXCHANGE) + err = -EINVAL; + else + err = -ENOTEMPTY; + goto out_dput_d2; + } + + rd->old_dentry = dget(old_dentry); + rd->new_dentry = d2; + rd->old_parent = dget(old_dentry->d_parent); + return 0; + +out_dput_d2: + dput(d2); +out_unlock: + unlock_rename(old_dentry->d_parent, rd->new_parent); + return err; +} + +/** + * start_renaming_dentry - lookup and lock name for rename with permission checking + * @rd: rename data containing parents and flags, and + * for receiving found dentries + * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL, + * LOOKUP_NO_SYMLINKS etc). + * @old_dentry: dentry of name to move + * @new_last: name of target in @rd.new_parent + * + * Look up target name and ensure locks are in place for + * rename. + * + * On success the found dentry is stored in @rd.new_dentry and + * @rd.old_parent is confirmed to be the parent of @old_dentry. If it + * was originally %NULL, it is set. In either case a reference is taken + * so that end_renaming() can have a stable reference to unlock. + * + * References and the lock can be dropped with end_renaming() + * + * The passed in qstr need not have the hash calculated, and basic + * eXecute permission checking is performed against @rd.mnt_idmap. + * + * Returns: zero or an error. + */ +int start_renaming_dentry(struct renamedata *rd, int lookup_flags, + struct dentry *old_dentry, struct qstr *new_last) +{ + int err; + + err = lookup_one_common(rd->mnt_idmap, new_last, rd->new_parent); + if (err) + return err; + return __start_renaming_dentry(rd, lookup_flags, old_dentry, new_last); +} +EXPORT_SYMBOL(start_renaming_dentry); + +/** + * start_renaming_two_dentries - Lock to dentries in given parents for rename + * @rd: rename data containing parent + * @old_dentry: dentry of name to move + * @new_dentry: dentry to move to + * + * Ensure locks are in place for rename and check parentage is still correct. + * + * On success the two dentries are stored in @rd.old_dentry and + * @rd.new_dentry and @rd.old_parent and @rd.new_parent are confirmed to + * be the parents of the dentries. + * + * References and the lock can be dropped with end_renaming() + * + * Returns: zero or an error. + */ +int +start_renaming_two_dentries(struct renamedata *rd, + struct dentry *old_dentry, struct dentry *new_dentry) +{ + struct dentry *trap; + int err; + + /* Already have the dentry - need to be sure to lock the correct parent */ + trap = lock_rename_child(old_dentry, rd->new_parent); + if (IS_ERR(trap)) + return PTR_ERR(trap); + err = -EINVAL; + if (d_unhashed(old_dentry) || + (rd->old_parent && rd->old_parent != old_dentry->d_parent)) + /* old_dentry was removed, or moved and explicit parent requested */ + goto out_unlock; + if (d_unhashed(new_dentry) || + rd->new_parent != new_dentry->d_parent) + /* new_dentry was removed or moved */ + goto out_unlock; + + if (old_dentry == trap) + /* source is an ancestor of target */ + goto out_unlock; + + if (new_dentry == trap) { + /* target is an ancestor of source */ + if (rd->flags & RENAME_EXCHANGE) + err = -EINVAL; + else + err = -ENOTEMPTY; + goto out_unlock; + } + + err = -EEXIST; + if (d_is_positive(new_dentry) && (rd->flags & RENAME_NOREPLACE)) + goto out_unlock; + + rd->old_dentry = dget(old_dentry); + rd->new_dentry = dget(new_dentry); + rd->old_parent = dget(old_dentry->d_parent); + return 0; + +out_unlock: + unlock_rename(old_dentry->d_parent, rd->new_parent); + return err; +} +EXPORT_SYMBOL(start_renaming_two_dentries); + +void end_renaming(struct renamedata *rd) +{ + unlock_rename(rd->old_parent, rd->new_parent); + dput(rd->old_dentry); + dput(rd->new_dentry); + dput(rd->old_parent); +} +EXPORT_SYMBOL(end_renaming); + +/** + * vfs_prepare_mode - prepare the mode to be used for a new inode + * @idmap: idmap of the mount the inode was found from + * @dir: parent directory of the new inode + * @mode: mode of the new inode + * @mask_perms: allowed permission by the vfs + * @type: type of file to be created + * + * This helper consolidates and enforces vfs restrictions on the @mode of a new + * object to be created. + * + * Umask stripping depends on whether the filesystem supports POSIX ACLs (see + * the kernel documentation for mode_strip_umask()). Moving umask stripping + * after setgid stripping allows the same ordering for both non-POSIX ACL and + * POSIX ACL supporting filesystems. + * + * Note that it's currently valid for @type to be 0 if a directory is created. + * Filesystems raise that flag individually and we need to check whether each + * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a + * non-zero type. + * + * Returns: mode to be passed to the filesystem + */ +static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, + const struct inode *dir, umode_t mode, + umode_t mask_perms, umode_t type) +{ + mode = mode_strip_sgid(idmap, dir, mode); + mode = mode_strip_umask(dir, mode); + + /* + * Apply the vfs mandated allowed permission mask and set the type of + * file to be created before we call into the filesystem. + */ + mode &= (mask_perms & ~S_IFMT); + mode |= (type & S_IFMT); + + return mode; +} + +/** * vfs_create - create new file - * @mnt_userns: user namespace of the mount the inode was found from - * @dir: inode of @dentry - * @dentry: pointer to dentry of the base directory - * @mode: mode of the new file - * @want_excl: whether the file must not yet exist + * @idmap: idmap of the mount the inode was found from + * @dentry: dentry of the child file + * @mode: mode of the child file + * @di: returns parent inode, if the inode is delegated. * * Create a new file. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. */ -int vfs_create(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode, bool want_excl) +int vfs_create(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode, + struct delegated_inode *di) { - int error = may_create(mnt_userns, dir, dentry); + struct inode *dir = d_inode(dentry->d_parent); + int error; + + error = may_create(idmap, dir, dentry); if (error) return error; if (!dir->i_op->create) return -EACCES; /* shouldn't it be ENOSYS? */ - mode &= S_IALLUGO; - mode |= S_IFREG; + + mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG); error = security_inode_create(dir, dentry, mode); if (error) return error; - error = dir->i_op->create(mnt_userns, dir, dentry, mode, want_excl); + error = try_break_deleg(dir, di); + if (error) + return error; + error = dir->i_op->create(idmap, dir, dentry, mode, true); if (!error) fsnotify_create(dir, dentry); return error; @@ -3040,7 +4141,7 @@ int vfs_mkobj(struct dentry *dentry, umode_t mode, void *arg) { struct inode *dir = dentry->d_parent->d_inode; - int error = may_create(&init_user_ns, dir, dentry); + int error = may_create(&nop_mnt_idmap, dir, dentry); if (error) return error; @@ -3062,7 +4163,7 @@ bool may_open_dev(const struct path *path) !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); } -static int may_open(struct user_namespace *mnt_userns, const struct path *path, +static int may_open(struct mnt_idmap *idmap, const struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; @@ -3096,9 +4197,11 @@ static int may_open(struct user_namespace *mnt_userns, const struct path *path, if ((acc_mode & MAY_EXEC) && path_noexec(path)) return -EACCES; break; + default: + VFS_BUG_ON_INODE(!IS_ANON_FILE(inode), inode); } - error = inode_permission(mnt_userns, inode, MAY_OPEN | acc_mode); + error = inode_permission(idmap, inode, MAY_OPEN | acc_mode); if (error) return error; @@ -3113,13 +4216,13 @@ static int may_open(struct user_namespace *mnt_userns, const struct path *path, } /* O_NOATIME can only be set by the owner or superuser */ - if (flag & O_NOATIME && !inode_owner_or_capable(mnt_userns, inode)) + if (flag & O_NOATIME && !inode_owner_or_capable(idmap, inode)) return -EPERM; return 0; } -static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp) +static int handle_truncate(struct mnt_idmap *idmap, struct file *filp) { const struct path *path = &filp->f_path; struct inode *inode = path->dentry->d_inode; @@ -3127,9 +4230,9 @@ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp) if (error) return error; - error = security_path_truncate(path); + error = security_file_truncate(filp); if (!error) { - error = do_truncate(mnt_userns, path->dentry, 0, + error = do_truncate(idmap, path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, filp); } @@ -3144,7 +4247,7 @@ static inline int open_to_namei_flags(int flag) return flag; } -static int may_o_create(struct user_namespace *mnt_userns, +static int may_o_create(struct mnt_idmap *idmap, const struct path *dir, struct dentry *dentry, umode_t mode) { @@ -3152,10 +4255,10 @@ static int may_o_create(struct user_namespace *mnt_userns, if (error) return error; - if (!fsuidgid_has_mapping(dir->dentry->d_sb, mnt_userns)) + if (!fsuidgid_has_mapping(dir->dentry->d_sb, idmap)) return -EOVERFLOW; - error = inode_permission(mnt_userns, dir->dentry->d_inode, + error = inode_permission(idmap, dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); if (error) return error; @@ -3187,8 +4290,8 @@ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, if (nd->flags & LOOKUP_DIRECTORY) open_flag |= O_DIRECTORY; - file->f_path.dentry = DENTRY_NOT_SET; - file->f_path.mnt = nd->path.mnt; + file->__f_path.dentry = DENTRY_NOT_SET; + file->__f_path.mnt = nd->path.mnt; error = dir->i_op->atomic_open(dir, dentry, file, open_to_namei_flags(open_flag), mode); d_lookup_done(dentry); @@ -3233,9 +4336,9 @@ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, */ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, const struct open_flags *op, - bool got_write) + bool got_write, struct delegated_inode *delegated_inode) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; struct dentry *dir = nd->path.dentry; struct inode *dir_inode = dir->d_inode; int open_flag = op->open_flag; @@ -3258,7 +4361,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, if (d_in_lookup(dentry)) break; - error = d_revalidate(dentry, nd->flags); + error = d_revalidate(dir_inode, &nd->last, dentry, nd->flags); if (likely(error > 0)) break; if (error) @@ -3272,6 +4375,9 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, return dentry; } + if (open_flag & O_CREAT) + audit_inode(nd->name, dir, AUDIT_INODE_PARENT); + /* * Checking write permission is tricky, bacuse we don't know if we are * going to actually need it: O_CREAT opens should work as long as the @@ -3283,14 +4389,13 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, */ if (unlikely(!got_write)) open_flag &= ~O_TRUNC; - mnt_userns = mnt_user_ns(nd->path.mnt); + idmap = mnt_idmap(nd->path.mnt); if (open_flag & O_CREAT) { if (open_flag & O_EXCL) open_flag &= ~O_TRUNC; - if (!IS_POSIXACL(dir->d_inode)) - mode &= ~current_umask(); + mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode); if (likely(got_write)) - create_error = may_o_create(mnt_userns, &nd->path, + create_error = may_o_create(idmap, &nd->path, dentry, mode); else create_error = -EROFS; @@ -3320,6 +4425,11 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, /* Negative dentry, just create the file */ if (!dentry->d_inode && (open_flag & O_CREAT)) { + /* but break the directory lease first! */ + error = try_break_deleg(dir_inode, delegated_inode); + if (error) + goto out_dput; + file->f_mode |= FMODE_CREATED; audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE); if (!dir_inode->i_op->create) { @@ -3327,7 +4437,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, goto out_dput; } - error = dir_inode->i_op->create(mnt_userns, dir_inode, dentry, + error = dir_inode->i_op->create(idmap, dir_inode, dentry, mode, open_flag & O_EXCL); if (error) goto out_dput; @@ -3343,14 +4453,49 @@ out_dput: return ERR_PTR(error); } +static inline bool trailing_slashes(struct nameidata *nd) +{ + return (bool)nd->last.name[nd->last.len]; +} + +static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag) +{ + struct dentry *dentry; + + if (open_flag & O_CREAT) { + if (trailing_slashes(nd)) + return ERR_PTR(-EISDIR); + + /* Don't bother on an O_EXCL create */ + if (open_flag & O_EXCL) + return NULL; + } + + if (trailing_slashes(nd)) + nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; + + dentry = lookup_fast(nd); + if (IS_ERR_OR_NULL(dentry)) + return dentry; + + if (open_flag & O_CREAT) { + /* Discard negative dentries. Need inode_lock to do the create */ + if (!dentry->d_inode) { + if (!(nd->flags & LOOKUP_RCU)) + dput(dentry); + dentry = NULL; + } + } + return dentry; +} + static const char *open_last_lookups(struct nameidata *nd, struct file *file, const struct open_flags *op) { + struct delegated_inode delegated_inode = { }; struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; bool got_write = false; - unsigned seq; - struct inode *inode; struct dentry *dentry; const char *res; @@ -3362,29 +4507,24 @@ static const char *open_last_lookups(struct nameidata *nd, return handle_dots(nd, nd->last_type); } - if (!(open_flag & O_CREAT)) { - if (nd->last.name[nd->last.len]) - nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; - /* we _can_ be in RCU mode here */ - dentry = lookup_fast(nd, &inode, &seq); - if (IS_ERR(dentry)) - return ERR_CAST(dentry); - if (likely(dentry)) - goto finish_lookup; + /* We _can_ be in RCU mode here */ + dentry = lookup_fast_for_open(nd, open_flag); + if (IS_ERR(dentry)) + return ERR_CAST(dentry); + + if (likely(dentry)) + goto finish_lookup; - BUG_ON(nd->flags & LOOKUP_RCU); + if (!(open_flag & O_CREAT)) { + if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU)) + return ERR_PTR(-ECHILD); } else { - /* create side of things */ if (nd->flags & LOOKUP_RCU) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); } - audit_inode(nd->name, dir, AUDIT_INODE_PARENT); - /* trailing slashes? */ - if (unlikely(nd->last.name[nd->last.len])) - return ERR_PTR(-EISDIR); } - +retry: if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { got_write = !mnt_want_write(nd->path.mnt); /* @@ -3397,9 +4537,13 @@ static const char *open_last_lookups(struct nameidata *nd, inode_lock(dir->d_inode); else inode_lock_shared(dir->d_inode); - dentry = lookup_open(nd, file, op, got_write); - if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED)) - fsnotify_create(dir->d_inode, dentry); + dentry = lookup_open(nd, file, op, got_write, &delegated_inode); + if (!IS_ERR(dentry)) { + if (file->f_mode & FMODE_CREATED) + fsnotify_create(dir->d_inode, dentry); + if (file->f_mode & FMODE_OPENED) + fsnotify_open(file); + } if (open_flag & O_CREAT) inode_unlock(dir->d_inode); else @@ -3408,8 +4552,16 @@ static const char *open_last_lookups(struct nameidata *nd, if (got_write) mnt_drop_write(nd->path.mnt); - if (IS_ERR(dentry)) + if (IS_ERR(dentry)) { + if (is_delegated(&delegated_inode)) { + int error = break_deleg_wait(&delegated_inode); + + if (!error) + goto retry; + return ERR_PTR(error); + } return ERR_CAST(dentry); + } if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) { dput(nd->path.dentry); @@ -3420,7 +4572,7 @@ static const char *open_last_lookups(struct nameidata *nd, finish_lookup: if (nd->depth) put_link(nd); - res = step_into(nd, WALK_TRAILING, dentry, inode, seq); + res = step_into(nd, WALK_TRAILING, dentry); if (unlikely(res)) nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); return res; @@ -3432,7 +4584,7 @@ finish_lookup: static int do_open(struct nameidata *nd, struct file *file, const struct open_flags *op) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; int open_flag = op->open_flag; bool do_truncate; int acc_mode; @@ -3445,13 +4597,13 @@ static int do_open(struct nameidata *nd, } if (!(file->f_mode & FMODE_CREATED)) audit_inode(nd->name, nd->path.dentry, 0); - mnt_userns = mnt_user_ns(nd->path.mnt); + idmap = mnt_idmap(nd->path.mnt); if (open_flag & O_CREAT) { if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED)) return -EEXIST; if (d_is_dir(nd->path.dentry)) return -EISDIR; - error = may_create_in_sticky(mnt_userns, nd, + error = may_create_in_sticky(idmap, nd, d_backing_inode(nd->path.dentry)); if (unlikely(error)) return error; @@ -3471,13 +4623,13 @@ static int do_open(struct nameidata *nd, return error; do_truncate = true; } - error = may_open(mnt_userns, &nd->path, acc_mode, open_flag); + error = may_open(idmap, &nd->path, acc_mode, open_flag); if (!error && !(file->f_mode & FMODE_OPENED)) error = vfs_open(&nd->path, file); if (!error) - error = ima_file_check(file, op->acc_mode); + error = security_file_post_open(file, op->acc_mode); if (!error && do_truncate) - error = handle_truncate(mnt_userns, file); + error = handle_truncate(idmap, file); if (unlikely(error > 0)) { WARN_ON(1); error = -EINVAL; @@ -3489,84 +4641,110 @@ static int do_open(struct nameidata *nd, /** * vfs_tmpfile - create tmpfile - * @mnt_userns: user namespace of the mount the inode was found from - * @dentry: pointer to dentry of the base directory + * @idmap: idmap of the mount the inode was found from + * @parentpath: pointer to the path of the base directory + * @file: file descriptor of the new tmpfile * @mode: mode of the new tmpfile - * @open_flag: flags * * Create a temporary file. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. */ -struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns, - struct dentry *dentry, umode_t mode, int open_flag) +int vfs_tmpfile(struct mnt_idmap *idmap, + const struct path *parentpath, + struct file *file, umode_t mode) { - struct dentry *child = NULL; - struct inode *dir = dentry->d_inode; + struct dentry *child; + struct inode *dir = d_inode(parentpath->dentry); struct inode *inode; int error; + int open_flag = file->f_flags; /* we want directory to be writable */ - error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); + error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) - goto out_err; - error = -EOPNOTSUPP; + return error; if (!dir->i_op->tmpfile) - goto out_err; - error = -ENOMEM; - child = d_alloc(dentry, &slash_name); + return -EOPNOTSUPP; + child = d_alloc(parentpath->dentry, &slash_name); if (unlikely(!child)) - goto out_err; - error = dir->i_op->tmpfile(mnt_userns, dir, child, mode); + return -ENOMEM; + file->__f_path.mnt = parentpath->mnt; + file->__f_path.dentry = child; + mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); + error = dir->i_op->tmpfile(idmap, dir, file, mode); + dput(child); + if (file->f_mode & FMODE_OPENED) + fsnotify_open(file); if (error) - goto out_err; - error = -ENOENT; - inode = child->d_inode; - if (unlikely(!inode)) - goto out_err; + return error; + /* Don't check for other permissions, the inode was just created */ + error = may_open(idmap, &file->f_path, 0, file->f_flags); + if (error) + return error; + inode = file_inode(file); if (!(open_flag & O_EXCL)) { spin_lock(&inode->i_lock); - inode->i_state |= I_LINKABLE; + inode_state_set(inode, I_LINKABLE); spin_unlock(&inode->i_lock); } - ima_post_create_tmpfile(mnt_userns, inode); - return child; + security_inode_post_create_tmpfile(idmap, inode); + return 0; +} -out_err: - dput(child); - return ERR_PTR(error); +/** + * kernel_tmpfile_open - open a tmpfile for kernel internal use + * @idmap: idmap of the mount the inode was found from + * @parentpath: path of the base directory + * @mode: mode of the new tmpfile + * @open_flag: flags + * @cred: credentials for open + * + * Create and open a temporary file. The file is not accounted in nr_files, + * hence this is only for kernel internal use, and must not be installed into + * file tables or such. + */ +struct file *kernel_tmpfile_open(struct mnt_idmap *idmap, + const struct path *parentpath, + umode_t mode, int open_flag, + const struct cred *cred) +{ + struct file *file; + int error; + + file = alloc_empty_file_noaccount(open_flag, cred); + if (IS_ERR(file)) + return file; + + error = vfs_tmpfile(idmap, parentpath, file, mode); + if (error) { + fput(file); + file = ERR_PTR(error); + } + return file; } -EXPORT_SYMBOL(vfs_tmpfile); +EXPORT_SYMBOL(kernel_tmpfile_open); static int do_tmpfile(struct nameidata *nd, unsigned flags, const struct open_flags *op, struct file *file) { - struct user_namespace *mnt_userns; - struct dentry *child; struct path path; int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path); + if (unlikely(error)) return error; error = mnt_want_write(path.mnt); if (unlikely(error)) goto out; - mnt_userns = mnt_user_ns(path.mnt); - child = vfs_tmpfile(mnt_userns, path.dentry, op->mode, op->open_flag); - error = PTR_ERR(child); - if (IS_ERR(child)) + error = vfs_tmpfile(mnt_idmap(path.mnt), &path, file, op->mode); + if (error) goto out2; - dput(path.dentry); - path.dentry = child; - audit_inode(nd->name, child, 0); - /* Don't check for other permissions, the inode was just created */ - error = may_open(mnt_userns, &path, 0, op->open_flag); - if (!error) - error = vfs_open(&path, file); + audit_inode(nd->name, file->f_path.dentry, 0); out2: mnt_drop_write(path.mnt); out: @@ -3615,7 +4793,7 @@ static struct file *path_openat(struct nameidata *nd, WARN_ON(1); error = -EINVAL; } - fput(file); + fput_close(file); if (error == -EOPENSTALE) { if (flags & LOOKUP_RCU) error = -ECHILD; @@ -3673,18 +4851,13 @@ static struct dentry *filename_create(int dfd, struct filename *name, { struct dentry *dentry = ERR_PTR(-EEXIST); struct qstr last; + bool want_dir = lookup_flags & LOOKUP_DIRECTORY; + unsigned int reval_flag = lookup_flags & LOOKUP_REVAL; + unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL; int type; - int err2; int error; - bool is_dir = (lookup_flags & LOOKUP_DIRECTORY); - - /* - * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any - * other flags passed in are ignored! - */ - lookup_flags &= LOOKUP_REVAL; - error = filename_parentat(dfd, name, lookup_flags, path, &last, &type); + error = filename_parentat(dfd, name, reval_flag, path, &last, &type); if (error) return ERR_PTR(error); @@ -3696,49 +4869,34 @@ static struct dentry *filename_create(int dfd, struct filename *name, goto out; /* don't fail immediately if it's r/o, at least try to report other errors */ - err2 = mnt_want_write(path->mnt); + error = mnt_want_write(path->mnt); /* - * Do the final lookup. + * Do the final lookup. Suppress 'create' if there is a trailing + * '/', and a directory wasn't requested. */ - lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL; - inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); - dentry = __lookup_hash(&last, path->dentry, lookup_flags); + if (last.name[last.len] && !want_dir) + create_flags &= ~LOOKUP_CREATE; + dentry = start_dirop(path->dentry, &last, reval_flag | create_flags); if (IS_ERR(dentry)) - goto unlock; + goto out_drop_write; - error = -EEXIST; - if (d_is_positive(dentry)) + if (unlikely(error)) goto fail; - /* - * Special case - lookup gave negative, but... we had foo/bar/ - * From the vfs_mknod() POV we just have a negative dentry - - * all is fine. Let's be bastards - you had / on the end, you've - * been asking for (non-existent) directory. -ENOENT for you. - */ - if (unlikely(!is_dir && last.name[last.len])) { - error = -ENOENT; - goto fail; - } - if (unlikely(err2)) { - error = err2; - goto fail; - } return dentry; fail: - dput(dentry); + end_dirop(dentry); dentry = ERR_PTR(error); -unlock: - inode_unlock(path->dentry->d_inode); - if (!err2) +out_drop_write: + if (!error) mnt_drop_write(path->mnt); out: path_put(path); return dentry; } -struct dentry *kern_path_create(int dfd, const char *pathname, - struct path *path, unsigned int lookup_flags) +struct dentry *start_creating_path(int dfd, const char *pathname, + struct path *path, unsigned int lookup_flags) { struct filename *filename = getname_kernel(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); @@ -3746,19 +4904,30 @@ struct dentry *kern_path_create(int dfd, const char *pathname, putname(filename); return res; } -EXPORT_SYMBOL(kern_path_create); +EXPORT_SYMBOL(start_creating_path); -void done_path_create(struct path *path, struct dentry *dentry) +/** + * end_creating_path - finish a code section started by start_creating_path() + * @path: the path instantiated by start_creating_path() + * @dentry: the dentry returned by start_creating_path() + * + * end_creating_path() will unlock and locks taken by start_creating_path() + * and drop an references that were taken. It should only be called + * if start_creating_path() returned a non-error. + * If vfs_mkdir() was called and it returned an error, that error *should* + * be passed to end_creating_path() together with the path. + */ +void end_creating_path(const struct path *path, struct dentry *dentry) { - dput(dentry); - inode_unlock(path->dentry->d_inode); + end_creating(dentry); mnt_drop_write(path->mnt); path_put(path); } -EXPORT_SYMBOL(done_path_create); +EXPORT_SYMBOL(end_creating_path); -inline struct dentry *user_path_create(int dfd, const char __user *pathname, - struct path *path, unsigned int lookup_flags) +inline struct dentry *start_creating_user_path( + int dfd, const char __user *pathname, + struct path *path, unsigned int lookup_flags) { struct filename *filename = getname(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); @@ -3766,29 +4935,32 @@ inline struct dentry *user_path_create(int dfd, const char __user *pathname, putname(filename); return res; } -EXPORT_SYMBOL(user_path_create); +EXPORT_SYMBOL(start_creating_user_path); + /** * vfs_mknod - create device node or file - * @mnt_userns: user namespace of the mount the inode was found from - * @dir: inode of @dentry - * @dentry: pointer to dentry of the base directory - * @mode: mode of the new device node or file - * @dev: device number of device to create + * @idmap: idmap of the mount the inode was found from + * @dir: inode of the parent directory + * @dentry: dentry of the child device node + * @mode: mode of the child device node + * @dev: device number of device to create + * @delegated_inode: returns parent inode, if the inode is delegated. * * Create a device node or file. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. */ -int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode, dev_t dev) +int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev, + struct delegated_inode *delegated_inode) { bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV; - int error = may_create(mnt_userns, dir, dentry); + int error = may_create(idmap, dir, dentry); if (error) return error; @@ -3800,6 +4972,7 @@ int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (!dir->i_op->mknod) return -EPERM; + mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = devcgroup_inode_mknod(mode, dev); if (error) return error; @@ -3808,7 +4981,11 @@ int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (error) return error; - error = dir->i_op->mknod(mnt_userns, dir, dentry, mode, dev); + error = try_break_deleg(dir, delegated_inode); + if (error) + return error; + + error = dir->i_op->mknod(idmap, dir, dentry, mode, dev); if (!error) fsnotify_create(dir, dentry); return error; @@ -3835,7 +5012,8 @@ static int may_mknod(umode_t mode) static int do_mknodat(int dfd, struct filename *name, umode_t mode, unsigned int dev) { - struct user_namespace *mnt_userns; + struct delegated_inode di = { }; + struct mnt_idmap *idmap; struct dentry *dentry; struct path path; int error; @@ -3850,31 +5028,34 @@ retry: if (IS_ERR(dentry)) goto out1; - if (!IS_POSIXACL(path.dentry->d_inode)) - mode &= ~current_umask(); - error = security_path_mknod(&path, dentry, mode, dev); + error = security_path_mknod(&path, dentry, + mode_strip_umask(path.dentry->d_inode, mode), dev); if (error) goto out2; - mnt_userns = mnt_user_ns(path.mnt); + idmap = mnt_idmap(path.mnt); switch (mode & S_IFMT) { case 0: case S_IFREG: - error = vfs_create(mnt_userns, path.dentry->d_inode, - dentry, mode, true); + error = vfs_create(idmap, dentry, mode, &di); if (!error) - ima_post_path_mknod(mnt_userns, dentry); + security_path_post_mknod(idmap, dentry); break; case S_IFCHR: case S_IFBLK: - error = vfs_mknod(mnt_userns, path.dentry->d_inode, - dentry, mode, new_decode_dev(dev)); + error = vfs_mknod(idmap, path.dentry->d_inode, + dentry, mode, new_decode_dev(dev), &di); break; case S_IFIFO: case S_IFSOCK: - error = vfs_mknod(mnt_userns, path.dentry->d_inode, - dentry, mode, 0); + error = vfs_mknod(idmap, path.dentry->d_inode, + dentry, mode, 0, &di); break; } out2: - done_path_create(&path, dentry); + end_creating_path(&path, dentry); + if (is_delegated(&di)) { + error = break_deleg_wait(&di); + if (!error) + goto retry; + } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -3896,44 +5077,70 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d } /** - * vfs_mkdir - create directory - * @mnt_userns: user namespace of the mount the inode was found from - * @dir: inode of @dentry - * @dentry: pointer to dentry of the base directory - * @mode: mode of the new directory + * vfs_mkdir - create directory returning correct dentry if possible + * @idmap: idmap of the mount the inode was found from + * @dir: inode of the parent directory + * @dentry: dentry of the child directory + * @mode: mode of the child directory + * @delegated_inode: returns parent inode, if the inode is delegated. * * Create a directory. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. + * + * In the event that the filesystem does not use the *@dentry but leaves it + * negative or unhashes it and possibly splices a different one returning it, + * the original dentry is dput() and the alternate is returned. + * + * In case of an error the dentry is dput() and an ERR_PTR() is returned. */ -int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) +struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode, + struct delegated_inode *delegated_inode) { - int error = may_create(mnt_userns, dir, dentry); + int error; unsigned max_links = dir->i_sb->s_max_links; + struct dentry *de; + error = may_create(idmap, dir, dentry); if (error) - return error; + goto err; + error = -EPERM; if (!dir->i_op->mkdir) - return -EPERM; + goto err; - mode &= (S_IRWXUGO|S_ISVTX); + mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0); error = security_inode_mkdir(dir, dentry, mode); if (error) - return error; + goto err; + error = -EMLINK; if (max_links && dir->i_nlink >= max_links) - return -EMLINK; + goto err; - error = dir->i_op->mkdir(mnt_userns, dir, dentry, mode); - if (!error) - fsnotify_mkdir(dir, dentry); - return error; + error = try_break_deleg(dir, delegated_inode); + if (error) + goto err; + + de = dir->i_op->mkdir(idmap, dir, dentry, mode); + error = PTR_ERR(de); + if (IS_ERR(de)) + goto err; + if (de) { + dput(dentry); + dentry = de; + } + fsnotify_mkdir(dir, dentry); + return dentry; + +err: + end_creating(dentry); + return ERR_PTR(error); } EXPORT_SYMBOL(vfs_mkdir); @@ -3943,6 +5150,7 @@ int do_mkdirat(int dfd, struct filename *name, umode_t mode) struct path path; int error; unsigned int lookup_flags = LOOKUP_DIRECTORY; + struct delegated_inode delegated_inode = { }; retry: dentry = filename_create(dfd, name, &path, lookup_flags); @@ -3950,16 +5158,20 @@ retry: if (IS_ERR(dentry)) goto out_putname; - if (!IS_POSIXACL(path.dentry->d_inode)) - mode &= ~current_umask(); - error = security_path_mkdir(&path, dentry, mode); + error = security_path_mkdir(&path, dentry, + mode_strip_umask(path.dentry->d_inode, mode)); if (!error) { - struct user_namespace *mnt_userns; - mnt_userns = mnt_user_ns(path.mnt); - error = vfs_mkdir(mnt_userns, path.dentry->d_inode, dentry, - mode); + dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, + dentry, mode, &delegated_inode); + if (IS_ERR(dentry)) + error = PTR_ERR(dentry); + } + end_creating_path(&path, dentry); + if (is_delegated(&delegated_inode)) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry; } - done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -3981,22 +5193,23 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) /** * vfs_rmdir - remove directory - * @mnt_userns: user namespace of the mount the inode was found from - * @dir: inode of @dentry - * @dentry: pointer to dentry of the base directory + * @idmap: idmap of the mount the inode was found from + * @dir: inode of the parent directory + * @dentry: dentry of the child directory + * @delegated_inode: returns parent inode, if it's delegated. * * Remove a directory. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. */ -int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry) +int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, struct delegated_inode *delegated_inode) { - int error = may_delete(mnt_userns, dir, dentry, 1); + int error = may_delete(idmap, dir, dentry, 1); if (error) return error; @@ -4016,6 +5229,10 @@ int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir, if (error) goto out; + error = try_break_deleg(dir, delegated_inode); + if (error) + goto out; + error = dir->i_op->rmdir(dir, dentry); if (error) goto out; @@ -4024,26 +5241,25 @@ int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir, dentry->d_inode->i_flags |= S_DEAD; dont_mount(dentry); detach_mounts(dentry); - fsnotify_rmdir(dir, dentry); out: inode_unlock(dentry->d_inode); dput(dentry); if (!error) - d_delete(dentry); + d_delete_notify(dir, dentry); return error; } EXPORT_SYMBOL(vfs_rmdir); int do_rmdir(int dfd, struct filename *name) { - struct user_namespace *mnt_userns; int error; struct dentry *dentry; struct path path; struct qstr last; int type; unsigned int lookup_flags = 0; + struct delegated_inode delegated_inode = { }; retry: error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); if (error) @@ -4065,27 +5281,26 @@ retry: if (error) goto exit2; - inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); - dentry = __lookup_hash(&last, path.dentry, lookup_flags); + dentry = start_dirop(path.dentry, &last, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit3; - if (!dentry->d_inode) { - error = -ENOENT; - goto exit4; - } error = security_path_rmdir(&path, dentry); if (error) goto exit4; - mnt_userns = mnt_user_ns(path.mnt); - error = vfs_rmdir(mnt_userns, path.dentry->d_inode, dentry); + error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, + dentry, &delegated_inode); exit4: - dput(dentry); + end_dirop(dentry); exit3: - inode_unlock(path.dentry->d_inode); mnt_drop_write(path.mnt); exit2: path_put(&path); + if (is_delegated(&delegated_inode)) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry; + } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4102,34 +5317,34 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) /** * vfs_unlink - unlink a filesystem object - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: parent directory * @dentry: victim * @delegated_inode: returns victim inode, if the inode is delegated. * - * The caller must hold dir->i_mutex. + * The caller must hold dir->i_rwsem exclusively. * * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and * return a reference to the inode in delegated_inode. The caller * should then break the delegation on that inode and retry. Because * breaking a delegation may take a long time, the caller should drop - * dir->i_mutex before doing so. + * dir->i_rwsem before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. */ -int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, struct inode **delegated_inode) +int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, struct delegated_inode *delegated_inode) { struct inode *target = dentry->d_inode; - int error = may_delete(mnt_userns, dir, dentry, 0); + int error = may_delete(idmap, dir, dentry, 0); if (error) return error; @@ -4145,6 +5360,9 @@ int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir, else { error = security_inode_unlink(dir, dentry); if (!error) { + error = try_break_deleg(dir, delegated_inode); + if (error) + goto out; error = try_break_deleg(target, delegated_inode); if (error) goto out; @@ -4152,7 +5370,6 @@ int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir, if (!error) { dont_mount(dentry); detach_mounts(dentry); - fsnotify_unlink(dir, dentry); } } } @@ -4160,9 +5377,11 @@ out: inode_unlock(target); /* We don't d_delete() NFS sillyrenamed files--they still exist. */ - if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) { + if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) { + fsnotify_unlink(dir, dentry); + } else if (!error) { fsnotify_link_count(target); - d_delete(dentry); + d_delete_notify(dir, dentry); } return error; @@ -4171,7 +5390,7 @@ EXPORT_SYMBOL(vfs_unlink); /* * Make sure that the actual truncation of the file will occur outside its - * directory's i_mutex. Truncate can take a long time if there is a lot of + * directory's i_rwsem. Truncate can take a long time if there is a lot of * writeout happening, and we don't want to prevent access to the directory * while waiting on the I/O. */ @@ -4182,73 +5401,62 @@ int do_unlinkat(int dfd, struct filename *name) struct path path; struct qstr last; int type; - struct inode *inode = NULL; - struct inode *delegated_inode = NULL; + struct inode *inode; + struct delegated_inode delegated_inode = { }; unsigned int lookup_flags = 0; retry: error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); if (error) - goto exit1; + goto exit_putname; error = -EISDIR; if (type != LAST_NORM) - goto exit2; + goto exit_path_put; error = mnt_want_write(path.mnt); if (error) - goto exit2; + goto exit_path_put; retry_deleg: - inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); - dentry = __lookup_hash(&last, path.dentry, lookup_flags); + dentry = start_dirop(path.dentry, &last, lookup_flags); error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - struct user_namespace *mnt_userns; - - /* Why not before? Because we want correct error value */ - if (last.name[last.len]) - goto slashes; - inode = dentry->d_inode; - if (d_is_negative(dentry)) - goto slashes; - ihold(inode); - error = security_path_unlink(&path, dentry); - if (error) - goto exit3; - mnt_userns = mnt_user_ns(path.mnt); - error = vfs_unlink(mnt_userns, path.dentry->d_inode, dentry, - &delegated_inode); -exit3: - dput(dentry); + if (IS_ERR(dentry)) + goto exit_drop_write; + + /* Why not before? Because we want correct error value */ + if (unlikely(last.name[last.len])) { + if (d_is_dir(dentry)) + error = -EISDIR; + else + error = -ENOTDIR; + end_dirop(dentry); + goto exit_drop_write; } - inode_unlock(path.dentry->d_inode); - if (inode) - iput(inode); /* truncate the inode here */ - inode = NULL; - if (delegated_inode) { + inode = dentry->d_inode; + ihold(inode); + error = security_path_unlink(&path, dentry); + if (error) + goto exit_end_dirop; + error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode, + dentry, &delegated_inode); +exit_end_dirop: + end_dirop(dentry); + iput(inode); /* truncate the inode here */ + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } +exit_drop_write: mnt_drop_write(path.mnt); -exit2: +exit_path_put: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; - inode = NULL; goto retry; } -exit1: +exit_putname: putname(name); return error; - -slashes: - if (d_is_negative(dentry)) - error = -ENOENT; - else if (d_is_dir(dentry)) - error = -EISDIR; - else - error = -ENOTDIR; - goto exit3; } SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag) @@ -4268,24 +5476,27 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname) /** * vfs_symlink - create symlink - * @mnt_userns: user namespace of the mount the inode was found from - * @dir: inode of @dentry - * @dentry: pointer to dentry of the base directory + * @idmap: idmap of the mount the inode was found from + * @dir: inode of the parent directory + * @dentry: dentry of the child symlink file * @oldname: name of the file to link to + * @delegated_inode: returns victim inode, if the inode is delegated. * * Create a symlink. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. */ -int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, const char *oldname) +int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, const char *oldname, + struct delegated_inode *delegated_inode) { - int error = may_create(mnt_userns, dir, dentry); + int error; + error = may_create(idmap, dir, dentry); if (error) return error; @@ -4296,7 +5507,11 @@ int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (error) return error; - error = dir->i_op->symlink(mnt_userns, dir, dentry, oldname); + error = try_break_deleg(dir, delegated_inode); + if (error) + return error; + + error = dir->i_op->symlink(idmap, dir, dentry, oldname); if (!error) fsnotify_create(dir, dentry); return error; @@ -4309,6 +5524,7 @@ int do_symlinkat(struct filename *from, int newdfd, struct filename *to) struct dentry *dentry; struct path path; unsigned int lookup_flags = 0; + struct delegated_inode delegated_inode = { }; if (IS_ERR(from)) { error = PTR_ERR(from); @@ -4321,14 +5537,15 @@ retry: goto out_putnames; error = security_path_symlink(&path, dentry, from->name); - if (!error) { - struct user_namespace *mnt_userns; - - mnt_userns = mnt_user_ns(path.mnt); - error = vfs_symlink(mnt_userns, path.dentry->d_inode, dentry, - from->name); + if (!error) + error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, + dentry, from->name, &delegated_inode); + end_creating_path(&path, dentry); + if (is_delegated(&delegated_inode)) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry; } - done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4353,32 +5570,32 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn /** * vfs_link - create a new link * @old_dentry: object to be linked - * @mnt_userns: the user namespace of the mount + * @idmap: idmap of the mount * @dir: new parent * @new_dentry: where to create the new link * @delegated_inode: returns inode needing a delegation break * - * The caller must hold dir->i_mutex + * The caller must hold dir->i_rwsem exclusively. * * If vfs_link discovers a delegation on the to-be-linked file in need * of breaking, it will return -EWOULDBLOCK and return a reference to the * inode in delegated_inode. The caller should then break the delegation * and retry. Because breaking a delegation may take a long time, the - * caller should drop the i_mutex before doing so. + * caller should drop the i_rwsem before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. */ -int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns, +int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, struct inode *dir, struct dentry *new_dentry, - struct inode **delegated_inode) + struct delegated_inode *delegated_inode) { struct inode *inode = old_dentry->d_inode; unsigned max_links = dir->i_sb->s_max_links; @@ -4387,7 +5604,7 @@ int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns, if (!inode) return -ENOENT; - error = may_create(mnt_userns, dir, new_dentry); + error = may_create(idmap, dir, new_dentry); if (error) return error; @@ -4401,10 +5618,10 @@ int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns, return -EPERM; /* * Updating the link count will likely cause i_uid and i_gid to - * be writen back improperly if their true value is unknown to + * be written back improperly if their true value is unknown to * the vfs. */ - if (HAS_UNMAPPED_ID(mnt_userns, inode)) + if (HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; if (!dir->i_op->link) return -EPERM; @@ -4417,19 +5634,21 @@ int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns, inode_lock(inode); /* Make sure we don't allow creating hardlink to an unlinked file */ - if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) + if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE)) error = -ENOENT; else if (max_links && inode->i_nlink >= max_links) error = -EMLINK; else { - error = try_break_deleg(inode, delegated_inode); + error = try_break_deleg(dir, delegated_inode); + if (!error) + error = try_break_deleg(inode, delegated_inode); if (!error) error = dir->i_op->link(old_dentry, dir, new_dentry); } - if (!error && (inode->i_state & I_LINKABLE)) { + if (!error && (inode_state_read_once(inode) & I_LINKABLE)) { spin_lock(&inode->i_lock); - inode->i_state &= ~I_LINKABLE; + inode_state_clear(inode, I_LINKABLE); spin_unlock(&inode->i_lock); } inode_unlock(inode); @@ -4451,10 +5670,10 @@ EXPORT_SYMBOL(vfs_link); int do_linkat(int olddfd, struct filename *old, int newdfd, struct filename *new, int flags) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; struct dentry *new_dentry; struct path old_path, new_path; - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; int how = 0; int error; @@ -4463,14 +5682,13 @@ int do_linkat(int olddfd, struct filename *old, int newdfd, goto out_putnames; } /* - * To use null names we require CAP_DAC_READ_SEARCH + * To use null names we require CAP_DAC_READ_SEARCH or + * that the open-time creds of the dfd matches current. * This ensures that not everyone will be able to create - * handlink using the passed filedescriptor. + * a hardlink using the passed file descriptor. */ - if (flags & AT_EMPTY_PATH && !capable(CAP_DAC_READ_SEARCH)) { - error = -ENOENT; - goto out_putnames; - } + if (flags & AT_EMPTY_PATH) + how |= LOOKUP_LINKAT_EMPTY; if (flags & AT_SYMLINK_FOLLOW) how |= LOOKUP_FOLLOW; @@ -4488,18 +5706,18 @@ retry: error = -EXDEV; if (old_path.mnt != new_path.mnt) goto out_dput; - mnt_userns = mnt_user_ns(new_path.mnt); - error = may_linkat(mnt_userns, &old_path); + idmap = mnt_idmap(new_path.mnt); + error = may_linkat(idmap, &old_path); if (unlikely(error)) goto out_dput; error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; - error = vfs_link(old_path.dentry, mnt_userns, new_path.dentry->d_inode, + error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, &delegated_inode); out_dput: - done_path_create(&new_path, new_dentry); - if (delegated_inode) { + end_creating_path(&new_path, new_dentry); + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) { path_put(&old_path); @@ -4555,12 +5773,13 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * * a) we can get into loop creation. * b) race potential - two innocent renames can create a loop together. - * That's where 4.4 screws up. Current fix: serialization on + * That's where 4.4BSD screws up. Current fix: serialization on * sb->s_vfs_rename_mutex. We might be more accurate, but that's another * story. - * c) we have to lock _four_ objects - parents and victim (if it exists), - * and source (if it is not a directory). - * And that - after we got ->i_mutex on parents (until then we don't know + * c) we may have to lock up to _four_ objects - parents and victim (if it exists), + * and source (if it's a non-directory or a subdirectory that moves to + * different parent). + * And that - after we got ->i_rwsem on parents (until then we don't know * whether the target exists). Solution: try to be smart with locking * order for inodes. We rely on the fact that tree topology may change * only under ->s_vfs_rename_mutex _and_ that parent of the object we @@ -4572,18 +5791,19 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * has no more than 1 dentry. If "hybrid" objects will ever appear, * we'd better make sure that there's no link(2) for them. * d) conversion from fhandle to dentry may come in the wrong moment - when - * we are removing the target. Solution: we will have to grab ->i_mutex + * we are removing the target. Solution: we will have to grab ->i_rwsem * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on - * ->i_mutex on parents, which works but leads to some truly excessive + * ->i_rwsem on parents, which works but leads to some truly excessive * locking]. */ int vfs_rename(struct renamedata *rd) { int error; - struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir; + struct inode *old_dir = d_inode(rd->old_parent); + struct inode *new_dir = d_inode(rd->new_parent); struct dentry *old_dentry = rd->old_dentry; struct dentry *new_dentry = rd->new_dentry; - struct inode **delegated_inode = rd->delegated_inode; + struct delegated_inode *delegated_inode = rd->delegated_inode; unsigned int flags = rd->flags; bool is_dir = d_is_dir(old_dentry); struct inode *source = old_dentry->d_inode; @@ -4591,24 +5811,25 @@ int vfs_rename(struct renamedata *rd) bool new_is_dir = false; unsigned max_links = new_dir->i_sb->s_max_links; struct name_snapshot old_name; + bool lock_old_subdir, lock_new_subdir; if (source == target) return 0; - error = may_delete(rd->old_mnt_userns, old_dir, old_dentry, is_dir); + error = may_delete(rd->mnt_idmap, old_dir, old_dentry, is_dir); if (error) return error; if (!target) { - error = may_create(rd->new_mnt_userns, new_dir, new_dentry); + error = may_create(rd->mnt_idmap, new_dir, new_dentry); } else { new_is_dir = d_is_dir(new_dentry); if (!(flags & RENAME_EXCHANGE)) - error = may_delete(rd->new_mnt_userns, new_dir, + error = may_delete(rd->mnt_idmap, new_dir, new_dentry, is_dir); else - error = may_delete(rd->new_mnt_userns, new_dir, + error = may_delete(rd->mnt_idmap, new_dir, new_dentry, new_is_dir); } if (error) @@ -4623,13 +5844,13 @@ int vfs_rename(struct renamedata *rd) */ if (new_dir != old_dir) { if (is_dir) { - error = inode_permission(rd->old_mnt_userns, source, + error = inode_permission(rd->mnt_idmap, source, MAY_WRITE); if (error) return error; } if ((flags & RENAME_EXCHANGE) && new_is_dir) { - error = inode_permission(rd->new_mnt_userns, target, + error = inode_permission(rd->mnt_idmap, target, MAY_WRITE); if (error) return error; @@ -4643,10 +5864,33 @@ int vfs_rename(struct renamedata *rd) take_dentry_name_snapshot(&old_name, old_dentry); dget(new_dentry); - if (!is_dir || (flags & RENAME_EXCHANGE)) + /* + * Lock children. + * The source subdirectory needs to be locked on cross-directory + * rename or cross-directory exchange since its parent changes. + * The target subdirectory needs to be locked on cross-directory + * exchange due to parent change and on any rename due to becoming + * a victim. + * Non-directories need locking in all cases (for NFS reasons); + * they get locked after any subdirectories (in inode address order). + * + * NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE. + * NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex. + */ + lock_old_subdir = new_dir != old_dir; + lock_new_subdir = new_dir != old_dir || !(flags & RENAME_EXCHANGE); + if (is_dir) { + if (lock_old_subdir) + inode_lock_nested(source, I_MUTEX_CHILD); + if (target && (!new_is_dir || lock_new_subdir)) + inode_lock(target); + } else if (new_is_dir) { + if (lock_new_subdir) + inode_lock_nested(target, I_MUTEX_CHILD); + inode_lock(source); + } else { lock_two_nondirectories(source, target); - else if (target) - inode_lock(target); + } error = -EPERM; if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target))) @@ -4664,6 +5908,14 @@ int vfs_rename(struct renamedata *rd) old_dir->i_nlink >= max_links) goto out; } + error = try_break_deleg(old_dir, delegated_inode); + if (error) + goto out; + if (new_dir != old_dir) { + error = try_break_deleg(new_dir, delegated_inode); + if (error) + goto out; + } if (!is_dir) { error = try_break_deleg(source, delegated_inode); if (error) @@ -4674,7 +5926,7 @@ int vfs_rename(struct renamedata *rd) if (error) goto out; } - error = old_dir->i_op->rename(rd->new_mnt_userns, old_dir, old_dentry, + error = old_dir->i_op->rename(rd->mnt_idmap, old_dir, old_dentry, new_dir, new_dentry, flags); if (error) goto out; @@ -4694,9 +5946,9 @@ int vfs_rename(struct renamedata *rd) d_exchange(old_dentry, new_dentry); } out: - if (!is_dir || (flags & RENAME_EXCHANGE)) - unlock_two_nondirectories(source, target); - else if (target) + if (!is_dir || lock_old_subdir) + inode_unlock(source); + if (target && (!new_is_dir || lock_new_subdir)) inode_unlock(target); dput(new_dentry); if (!error) { @@ -4717,13 +5969,11 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd, struct filename *to, unsigned int flags) { struct renamedata rd; - struct dentry *old_dentry, *new_dentry; - struct dentry *trap; struct path old_path, new_path; struct qstr old_last, new_last; int old_type, new_type; - struct inode *delegated_inode = NULL; - unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET; + struct delegated_inode delegated_inode = { }; + unsigned int lookup_flags = 0; bool should_retry = false; int error = -EINVAL; @@ -4734,9 +5984,6 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd, (flags & RENAME_EXCHANGE)) goto put_names; - if (flags & RENAME_EXCHANGE) - target_flags = 0; - retry: error = filename_parentat(olddfd, from, lookup_flags, &old_path, &old_last, &old_type); @@ -4766,73 +6013,42 @@ retry: goto exit2; retry_deleg: - trap = lock_rename(new_path.dentry, old_path.dentry); + rd.old_parent = old_path.dentry; + rd.mnt_idmap = mnt_idmap(old_path.mnt); + rd.new_parent = new_path.dentry; + rd.delegated_inode = &delegated_inode; + rd.flags = flags; - old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags); - error = PTR_ERR(old_dentry); - if (IS_ERR(old_dentry)) - goto exit3; - /* source must exist */ - error = -ENOENT; - if (d_is_negative(old_dentry)) - goto exit4; - new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags); - error = PTR_ERR(new_dentry); - if (IS_ERR(new_dentry)) - goto exit4; - error = -EEXIST; - if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) - goto exit5; - if (flags & RENAME_EXCHANGE) { - error = -ENOENT; - if (d_is_negative(new_dentry)) - goto exit5; + error = __start_renaming(&rd, lookup_flags, &old_last, &new_last); + if (error) + goto exit_lock_rename; - if (!d_is_dir(new_dentry)) { + if (flags & RENAME_EXCHANGE) { + if (!d_is_dir(rd.new_dentry)) { error = -ENOTDIR; if (new_last.name[new_last.len]) - goto exit5; + goto exit_unlock; } } /* unless the source is a directory trailing slashes give -ENOTDIR */ - if (!d_is_dir(old_dentry)) { + if (!d_is_dir(rd.old_dentry)) { error = -ENOTDIR; if (old_last.name[old_last.len]) - goto exit5; + goto exit_unlock; if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len]) - goto exit5; + goto exit_unlock; } - /* source should not be ancestor of target */ - error = -EINVAL; - if (old_dentry == trap) - goto exit5; - /* target should not be an ancestor of source */ - if (!(flags & RENAME_EXCHANGE)) - error = -ENOTEMPTY; - if (new_dentry == trap) - goto exit5; - error = security_path_rename(&old_path, old_dentry, - &new_path, new_dentry, flags); + error = security_path_rename(&old_path, rd.old_dentry, + &new_path, rd.new_dentry, flags); if (error) - goto exit5; - - rd.old_dir = old_path.dentry->d_inode; - rd.old_dentry = old_dentry; - rd.old_mnt_userns = mnt_user_ns(old_path.mnt); - rd.new_dir = new_path.dentry->d_inode; - rd.new_dentry = new_dentry; - rd.new_mnt_userns = mnt_user_ns(new_path.mnt); - rd.delegated_inode = &delegated_inode; - rd.flags = flags; + goto exit_unlock; + error = vfs_rename(&rd); -exit5: - dput(new_dentry); -exit4: - dput(old_dentry); -exit3: - unlock_rename(new_path.dentry, old_path.dentry); - if (delegated_inode) { +exit_unlock: + end_renaming(&rd); +exit_lock_rename: + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; @@ -4875,19 +6091,16 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna getname(newname), 0); } -int readlink_copy(char __user *buffer, int buflen, const char *link) +int readlink_copy(char __user *buffer, int buflen, const char *link, int linklen) { - int len = PTR_ERR(link); - if (IS_ERR(link)) - goto out; + int copylen; - len = strlen(link); - if (len > (unsigned) buflen) - len = buflen; - if (copy_to_user(buffer, link, len)) - len = -EFAULT; -out: - return len; + copylen = linklen; + if (unlikely(copylen > (unsigned) buflen)) + copylen = buflen; + if (copy_to_user(buffer, link, copylen)) + copylen = -EFAULT; + return copylen; } /** @@ -4907,6 +6120,9 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) const char *link; int res; + if (inode->i_opflags & IOP_CACHED_LINK) + return readlink_copy(buffer, buflen, inode->i_link, inode->i_linklen); + if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) { if (unlikely(inode->i_op->readlink)) return inode->i_op->readlink(dentry, buffer, buflen); @@ -4925,7 +6141,7 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) if (IS_ERR(link)) return PTR_ERR(link); } - res = readlink_copy(buffer, buflen, link); + res = readlink_copy(buffer, buflen, link, strlen(link)); do_delayed_call(&done); return res; } @@ -4957,75 +6173,117 @@ const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done) EXPORT_SYMBOL(vfs_get_link); /* get the link contents into pagecache */ -const char *page_get_link(struct dentry *dentry, struct inode *inode, - struct delayed_call *callback) +static char *__page_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) { - char *kaddr; - struct page *page; + struct folio *folio; struct address_space *mapping = inode->i_mapping; if (!dentry) { - page = find_get_page(mapping, 0); - if (!page) + folio = filemap_get_folio(mapping, 0); + if (IS_ERR(folio)) return ERR_PTR(-ECHILD); - if (!PageUptodate(page)) { - put_page(page); + if (!folio_test_uptodate(folio)) { + folio_put(folio); return ERR_PTR(-ECHILD); } } else { - page = read_mapping_page(mapping, 0, NULL); - if (IS_ERR(page)) - return (char*)page; + folio = read_mapping_folio(mapping, 0, NULL); + if (IS_ERR(folio)) + return ERR_CAST(folio); } - set_delayed_call(callback, page_put_link, page); + set_delayed_call(callback, page_put_link, folio); BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM); - kaddr = page_address(page); - nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); - return kaddr; + return folio_address(folio); +} + +const char *page_get_link_raw(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) +{ + return __page_get_link(dentry, inode, callback); } +EXPORT_SYMBOL_GPL(page_get_link_raw); + +/** + * page_get_link() - An implementation of the get_link inode_operation. + * @dentry: The directory entry which is the symlink. + * @inode: The inode for the symlink. + * @callback: Used to drop the reference to the symlink. + * + * Filesystems which store their symlinks in the page cache should use + * this to implement the get_link() member of their inode_operations. + * + * Return: A pointer to the NUL-terminated symlink. + */ +const char *page_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) +{ + char *kaddr = __page_get_link(dentry, inode, callback); + if (!IS_ERR(kaddr)) + nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); + return kaddr; +} EXPORT_SYMBOL(page_get_link); +/** + * page_put_link() - Drop the reference to the symlink. + * @arg: The folio which contains the symlink. + * + * This is used internally by page_get_link(). It is exported for use + * by filesystems which need to implement a variant of page_get_link() + * themselves. Despite the apparent symmetry, filesystems which use + * page_get_link() do not need to call page_put_link(). + * + * The argument, while it has a void pointer type, must be a pointer to + * the folio which was retrieved from the page cache. The delayed_call + * infrastructure is used to drop the reference count once the caller + * is done with the symlink. + */ void page_put_link(void *arg) { - put_page(arg); + folio_put(arg); } EXPORT_SYMBOL(page_put_link); int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) { + const char *link; + int res; + DEFINE_DELAYED_CALL(done); - int res = readlink_copy(buffer, buflen, - page_get_link(dentry, d_inode(dentry), - &done)); + link = page_get_link(dentry, d_inode(dentry), &done); + res = PTR_ERR(link); + if (!IS_ERR(link)) + res = readlink_copy(buffer, buflen, link, strlen(link)); do_delayed_call(&done); return res; } EXPORT_SYMBOL(page_readlink); -/* - * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS - */ -int __page_symlink(struct inode *inode, const char *symname, int len, int nofs) +int page_symlink(struct inode *inode, const char *symname, int len) { struct address_space *mapping = inode->i_mapping; - struct page *page; - void *fsdata; + const struct address_space_operations *aops = mapping->a_ops; + bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS); + struct folio *folio; + void *fsdata = NULL; int err; - unsigned int flags = 0; - if (nofs) - flags |= AOP_FLAG_NOFS; + unsigned int flags; retry: - err = pagecache_write_begin(NULL, mapping, 0, len-1, - flags, &page, &fsdata); + if (nofs) + flags = memalloc_nofs_save(); + err = aops->write_begin(NULL, mapping, 0, len-1, &folio, &fsdata); + if (nofs) + memalloc_nofs_restore(flags); if (err) goto fail; - memcpy(page_address(page), symname, len-1); + memcpy(folio_address(folio), symname, len - 1); - err = pagecache_write_end(NULL, mapping, 0, len-1, len-1, - page, fsdata); + err = aops->write_end(NULL, mapping, 0, len - 1, len - 1, + folio, fsdata); if (err < 0) goto fail; if (err < len-1) @@ -5036,13 +6294,6 @@ retry: fail: return err; } -EXPORT_SYMBOL(__page_symlink); - -int page_symlink(struct inode *inode, const char *symname, int len) -{ - return __page_symlink(inode, symname, len, - !mapping_gfp_constraint(inode->i_mapping, __GFP_FS)); -} EXPORT_SYMBOL(page_symlink); const struct inode_operations page_symlink_inode_operations = { |
