diff options
Diffstat (limited to 'fs/namei.c')
| -rw-r--r-- | fs/namei.c | 6758 |
1 files changed, 4453 insertions, 2305 deletions
diff --git a/fs/namei.c b/fs/namei.c index 8b61d103a8a7..bf0f66f0e9b9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/namei.c * @@ -16,15 +17,16 @@ #include <linux/init.h> #include <linux/export.h> -#include <linux/kernel.h> #include <linux/slab.h> +#include <linux/wordpart.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/namei.h> #include <linux/pagemap.h> +#include <linux/sched/mm.h> #include <linux/fsnotify.h> #include <linux/personality.h> #include <linux/security.h> -#include <linux/ima.h> #include <linux/syscalls.h> #include <linux/mount.h> #include <linux/audit.h> @@ -34,7 +36,10 @@ #include <linux/device_cgroup.h> #include <linux/fs_struct.h> #include <linux/posix_acl.h> -#include <asm/uaccess.h> +#include <linux/hash.h> +#include <linux/bitops.h> +#include <linux/init_task.h> +#include <linux/uaccess.h> #include "internal.h" #include "mount.h" @@ -117,25 +122,22 @@ * POSIX.1 2.4: an empty pathname is invalid (ENOENT). * PATH_MAX includes the nul terminator --RR. */ -void final_putname(struct filename *name) + +#define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname)) + +static inline void initname(struct filename *name, const char __user *uptr) { - if (name->separate) { - __putname(name->name); - kfree(name); - } else { - __putname(name); - } + name->uptr = uptr; + name->aname = NULL; + atomic_set(&name->refcnt, 1); } -#define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename)) - -static struct filename * -getname_flags(const char __user *filename, int flags, int *empty) +struct filename * +getname_flags(const char __user *filename, int flags) { - struct filename *result, *err; - int len; - long max; + struct filename *result; char *kname; + int len; result = audit_reusename(filename); if (result) @@ -149,16 +151,24 @@ getname_flags(const char __user *filename, int flags, int *empty) * First, try to embed the struct filename inside the names_cache * allocation */ - kname = (char *)result + sizeof(*result); + kname = (char *)result->iname; result->name = kname; - result->separate = false; - max = EMBEDDED_NAME_MAX; -recopy: - len = strncpy_from_user(kname, filename, max); - if (unlikely(len < 0)) { - err = ERR_PTR(len); - goto error; + len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX); + /* + * Handle both empty path and copy failure in one go. + */ + if (unlikely(len <= 0)) { + if (unlikely(len < 0)) { + __putname(result); + return ERR_PTR(len); + } + + /* The empty path is special. */ + if (!(flags & LOOKUP_EMPTY)) { + __putname(result); + return ERR_PTR(-ENOENT); + } } /* @@ -167,60 +177,145 @@ recopy: * names_cache allocation for the pathname, and re-do the copy from * userland. */ - if (len == EMBEDDED_NAME_MAX && max == EMBEDDED_NAME_MAX) { + if (unlikely(len == EMBEDDED_NAME_MAX)) { + const size_t size = offsetof(struct filename, iname[1]); kname = (char *)result; - result = kzalloc(sizeof(*result), GFP_KERNEL); - if (!result) { - err = ERR_PTR(-ENOMEM); - result = (struct filename *)kname; - goto error; + /* + * size is chosen that way we to guarantee that + * result->iname[0] is within the same object and that + * kname can't be equal to result->iname, no matter what. + */ + result = kzalloc(size, GFP_KERNEL); + if (unlikely(!result)) { + __putname(kname); + return ERR_PTR(-ENOMEM); } result->name = kname; - result->separate = true; - max = PATH_MAX; - goto recopy; + len = strncpy_from_user(kname, filename, PATH_MAX); + if (unlikely(len < 0)) { + __putname(kname); + kfree(result); + return ERR_PTR(len); + } + /* The empty path is special. */ + if (unlikely(!len) && !(flags & LOOKUP_EMPTY)) { + __putname(kname); + kfree(result); + return ERR_PTR(-ENOENT); + } + if (unlikely(len == PATH_MAX)) { + __putname(kname); + kfree(result); + return ERR_PTR(-ENAMETOOLONG); + } } + initname(result, filename); + audit_getname(result); + return result; +} - /* The empty path is special. */ - if (unlikely(!len)) { - if (empty) - *empty = 1; - err = ERR_PTR(-ENOENT); - if (!(flags & LOOKUP_EMPTY)) - goto error; - } +struct filename *getname_uflags(const char __user *filename, int uflags) +{ + int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; - err = ERR_PTR(-ENAMETOOLONG); - if (unlikely(len >= PATH_MAX)) - goto error; + return getname_flags(filename, flags); +} - result->uptr = filename; - audit_getname(result); - return result; +struct filename *__getname_maybe_null(const char __user *pathname) +{ + struct filename *name; + char c; -error: - final_putname(result); - return err; + /* try to save on allocations; loss on um, though */ + if (get_user(c, pathname)) + return ERR_PTR(-EFAULT); + if (!c) + return NULL; + + name = getname_flags(pathname, LOOKUP_EMPTY); + if (!IS_ERR(name) && !(name->name[0])) { + putname(name); + name = NULL; + } + return name; } -struct filename * -getname(const char __user * filename) +struct filename *getname_kernel(const char * filename) { - return getname_flags(filename, 0, NULL); + struct filename *result; + int len = strlen(filename) + 1; + + result = __getname(); + if (unlikely(!result)) + return ERR_PTR(-ENOMEM); + + if (len <= EMBEDDED_NAME_MAX) { + result->name = (char *)result->iname; + } else if (len <= PATH_MAX) { + const size_t size = offsetof(struct filename, iname[1]); + struct filename *tmp; + + tmp = kmalloc(size, GFP_KERNEL); + if (unlikely(!tmp)) { + __putname(result); + return ERR_PTR(-ENOMEM); + } + tmp->name = (char *)result; + result = tmp; + } else { + __putname(result); + return ERR_PTR(-ENAMETOOLONG); + } + memcpy((char *)result->name, filename, len); + initname(result, NULL); + audit_getname(result); + return result; } -EXPORT_SYMBOL(getname); +EXPORT_SYMBOL(getname_kernel); -#ifdef CONFIG_AUDITSYSCALL void putname(struct filename *name) { - if (unlikely(!audit_dummy_context())) - return audit_putname(name); - final_putname(name); + int refcnt; + + if (IS_ERR_OR_NULL(name)) + return; + + refcnt = atomic_read(&name->refcnt); + if (unlikely(refcnt != 1)) { + if (WARN_ON_ONCE(!refcnt)) + return; + + if (!atomic_dec_and_test(&name->refcnt)) + return; + } + + if (unlikely(name->name != name->iname)) { + __putname(name->name); + kfree(name); + } else + __putname(name); } -#endif +EXPORT_SYMBOL(putname); -static int check_acl(struct inode *inode, int mask) +/** + * check_acl - perform ACL permission checking + * @idmap: idmap of the mount the inode was found from + * @inode: inode to check permissions on + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) + * + * This function performs the ACL permission checking. Since this function + * retrieve POSIX acls it needs to know whether it is called from a blocking or + * non-blocking context and thus cares about the MAY_NOT_BLOCK bit. + * + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply pass @nop_mnt_idmap. + */ +static int check_acl(struct mnt_idmap *idmap, + struct inode *inode, int mask) { #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *acl; @@ -229,35 +324,17 @@ static int check_acl(struct inode *inode, int mask) acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS); if (!acl) return -EAGAIN; - /* no ->get_acl() calls in RCU mode... */ - if (acl == ACL_NOT_CACHED) + /* no ->get_inode_acl() calls in RCU mode... */ + if (is_uncached_acl(acl)) return -ECHILD; - return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK); - } - - acl = get_cached_acl(inode, ACL_TYPE_ACCESS); - - /* - * A filesystem can force a ACL callback by just never filling the - * ACL cache. But normally you'd fill the cache either at inode - * instantiation time, or on the first ->get_acl call. - * - * If the filesystem doesn't have a get_acl() function at all, we'll - * just create the negative cache entry. - */ - if (acl == ACL_NOT_CACHED) { - if (inode->i_op->get_acl) { - acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } else { - set_cached_acl(inode, ACL_TYPE_ACCESS, NULL); - return -EAGAIN; - } + return posix_acl_permission(idmap, inode, acl, mask); } + acl = get_inode_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); if (acl) { - int error = posix_acl_permission(inode, acl, mask); + int error = posix_acl_permission(idmap, inode, acl, mask); posix_acl_release(acl); return error; } @@ -267,37 +344,107 @@ static int check_acl(struct inode *inode, int mask) } /* - * This does the basic permission checking + * Very quick optimistic "we know we have no ACL's" check. + * + * Note that this is purely for ACL_TYPE_ACCESS, and purely + * for the "we have cached that there are no ACLs" case. + * + * If this returns true, we know there are no ACLs. But if + * it returns false, we might still not have ACLs (it could + * be the is_uncached_acl() case). + */ +static inline bool no_acl_inode(struct inode *inode) +{ +#ifdef CONFIG_FS_POSIX_ACL + return likely(!READ_ONCE(inode->i_acl)); +#else + return true; +#endif +} + +/** + * acl_permission_check - perform basic UNIX permission checking + * @idmap: idmap of the mount the inode was found from + * @inode: inode to check permissions on + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) + * + * This function performs the basic UNIX permission checking. Since this + * function may retrieve POSIX acls it needs to know whether it is called from a + * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit. + * + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply pass @nop_mnt_idmap. */ -static int acl_permission_check(struct inode *inode, int mask) +static int acl_permission_check(struct mnt_idmap *idmap, + struct inode *inode, int mask) { unsigned int mode = inode->i_mode; + vfsuid_t vfsuid; - if (likely(uid_eq(current_fsuid(), inode->i_uid))) + /* + * Common cheap case: everybody has the requested + * rights, and there are no ACLs to check. No need + * to do any owner/group checks in that case. + * + * - 'mask&7' is the requested permission bit set + * - multiplying by 0111 spreads them out to all of ugo + * - '& ~mode' looks for missing inode permission bits + * - the '!' is for "no missing permissions" + * + * After that, we just need to check that there are no + * ACL's on the inode - do the 'IS_POSIXACL()' check last + * because it will dereference the ->i_sb pointer and we + * want to avoid that if at all possible. + */ + if (!((mask & 7) * 0111 & ~mode)) { + if (no_acl_inode(inode)) + return 0; + if (!IS_POSIXACL(inode)) + return 0; + } + + /* Are we the owner? If so, ACL's don't matter */ + vfsuid = i_uid_into_vfsuid(idmap, inode); + if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) { + mask &= 7; mode >>= 6; - else { - if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { - int error = check_acl(inode, mask); - if (error != -EAGAIN) - return error; - } + return (mask & ~mode) ? -EACCES : 0; + } - if (in_group_p(inode->i_gid)) - mode >>= 3; + /* Do we have ACL's? */ + if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { + int error = check_acl(idmap, inode, mask); + if (error != -EAGAIN) + return error; } + /* Only RWX matters for group/other mode bits */ + mask &= 7; + /* - * If the DACs are ok we don't need any capability check. + * Are the group permissions different from + * the other permissions in the bits we care + * about? Need to check group ownership if so. */ - if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) - return 0; - return -EACCES; + if (mask & (mode ^ (mode >> 3))) { + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); + if (vfsgid_in_group_p(vfsgid)) + mode >>= 3; + } + + /* Bits in 'mode' clear that we require? */ + return (mask & ~mode) ? -EACCES : 0; } /** * generic_permission - check for access rights on a Posix-like filesystem + * @idmap: idmap of the mount the inode was found from * @inode: inode to check access rights for - * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...) + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, + * %MAY_NOT_BLOCK ...) * * Used to check for read/write/execute permissions on a file. * We use "fsuid" for this, letting us set arbitrary permissions @@ -307,100 +454,83 @@ static int acl_permission_check(struct inode *inode, int mask) * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk * request cannot be satisfied (eg. requires blocking or too much complexity). * It would then be called again in ref-walk mode. + * + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply pass @nop_mnt_idmap. */ -int generic_permission(struct inode *inode, int mask) +int generic_permission(struct mnt_idmap *idmap, struct inode *inode, + int mask) { int ret; /* * Do the basic permission checks. */ - ret = acl_permission_check(inode, mask); + ret = acl_permission_check(idmap, inode, mask); if (ret != -EACCES) return ret; if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ - if (inode_capable(inode, CAP_DAC_OVERRIDE)) - return 0; if (!(mask & MAY_WRITE)) - if (inode_capable(inode, CAP_DAC_READ_SEARCH)) + if (capable_wrt_inode_uidgid(idmap, inode, + CAP_DAC_READ_SEARCH)) return 0; + if (capable_wrt_inode_uidgid(idmap, inode, + CAP_DAC_OVERRIDE)) + return 0; return -EACCES; } - /* - * Read/write DACs are always overridable. - * Executable DACs are overridable when there is - * at least one exec bit set. - */ - if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) - if (inode_capable(inode, CAP_DAC_OVERRIDE)) - return 0; /* * Searching includes executable on directories, else just read. */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ) - if (inode_capable(inode, CAP_DAC_READ_SEARCH)) + if (capable_wrt_inode_uidgid(idmap, inode, + CAP_DAC_READ_SEARCH)) + return 0; + /* + * Read/write DACs are always overridable. + * Executable DACs are overridable when there is + * at least one exec bit set. + */ + if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) + if (capable_wrt_inode_uidgid(idmap, inode, + CAP_DAC_OVERRIDE)) return 0; return -EACCES; } +EXPORT_SYMBOL(generic_permission); -/* +/** + * do_inode_permission - UNIX permission checking + * @idmap: idmap of the mount the inode was found from + * @inode: inode to check permissions on + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) + * * We _really_ want to just do "generic_permission()" without * even looking at the inode->i_op values. So we keep a cache * flag in inode->i_opflags, that says "this has not special * permission function, use the fast case". */ -static inline int do_inode_permission(struct inode *inode, int mask) +static inline int do_inode_permission(struct mnt_idmap *idmap, + struct inode *inode, int mask) { if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { if (likely(inode->i_op->permission)) - return inode->i_op->permission(inode, mask); + return inode->i_op->permission(idmap, inode, mask); /* This gets set once for the inode lifetime */ spin_lock(&inode->i_lock); inode->i_opflags |= IOP_FASTPERM; spin_unlock(&inode->i_lock); } - return generic_permission(inode, mask); -} - -/** - * __inode_permission - Check for access rights to a given inode - * @inode: Inode to check permission on - * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) - * - * Check for read/write/execute permissions on an inode. - * - * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. - * - * This does not check for a read-only file system. You probably want - * inode_permission(). - */ -int __inode_permission(struct inode *inode, int mask) -{ - int retval; - - if (unlikely(mask & MAY_WRITE)) { - /* - * Nobody gets write access to an immutable file. - */ - if (IS_IMMUTABLE(inode)) - return -EACCES; - } - - retval = do_inode_permission(inode, mask); - if (retval) - return retval; - - retval = devcgroup_inode_permission(inode, mask); - if (retval) - return retval; - - return security_inode_permission(inode, mask); + return generic_permission(idmap, inode, mask); } /** @@ -410,15 +540,17 @@ int __inode_permission(struct inode *inode, int mask) * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Separate out file-system wide checks from inode-specific permission checks. + * + * Note: lookup_inode_permission_may_exec() does not call here. If you add + * MAY_EXEC checks, adjust it. */ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) { - if (unlikely(mask & MAY_WRITE)) { + if (mask & MAY_WRITE) { umode_t mode = inode->i_mode; /* Nobody gets write access to a read-only fs. */ - if ((sb->s_flags & MS_RDONLY) && - (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) + if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) return -EROFS; } return 0; @@ -426,8 +558,9 @@ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) /** * inode_permission - Check for access rights to a given inode - * @inode: Inode to check permission on - * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * @idmap: idmap of the mount the inode was found from + * @inode: Inode to check permission on + * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Check for read/write/execute permissions on an inode. We use fs[ug]id for * this, letting us set arbitrary permissions for filesystem access without @@ -435,14 +568,77 @@ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) * * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. */ -int inode_permission(struct inode *inode, int mask) +int inode_permission(struct mnt_idmap *idmap, + struct inode *inode, int mask) { int retval; retval = sb_permission(inode->i_sb, inode, mask); - if (retval) + if (unlikely(retval)) return retval; - return __inode_permission(inode, mask); + + if (mask & MAY_WRITE) { + /* + * Nobody gets write access to an immutable file. + */ + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + + /* + * Updating mtime will likely cause i_uid and i_gid to be + * written back improperly if their true value is unknown + * to the vfs. + */ + if (unlikely(HAS_UNMAPPED_ID(idmap, inode))) + return -EACCES; + } + + retval = do_inode_permission(idmap, inode, mask); + if (unlikely(retval)) + return retval; + + retval = devcgroup_inode_permission(inode, mask); + if (unlikely(retval)) + return retval; + + return security_inode_permission(inode, mask); +} +EXPORT_SYMBOL(inode_permission); + +/* + * lookup_inode_permission_may_exec - Check traversal right for given inode + * + * This is a special case routine for may_lookup() making assumptions specific + * to path traversal. Use inode_permission() if you are doing something else. + * + * Work is shaved off compared to inode_permission() as follows: + * - we know for a fact there is no MAY_WRITE to worry about + * - it is an invariant the inode is a directory + * + * Since majority of real-world traversal happens on inodes which grant it for + * everyone, we check it upfront and only resort to more expensive work if it + * fails. + * + * Filesystems which have their own ->permission hook and consequently miss out + * on IOP_FASTPERM can still get the optimization if they set IOP_FASTPERM_MAY_EXEC + * on their directory inodes. + */ +static __always_inline int lookup_inode_permission_may_exec(struct mnt_idmap *idmap, + struct inode *inode, int mask) +{ + /* Lookup already checked this to return -ENOTDIR */ + VFS_BUG_ON_INODE(!S_ISDIR(inode->i_mode), inode); + VFS_BUG_ON((mask & ~MAY_NOT_BLOCK) != 0); + + mask |= MAY_EXEC; + + if (unlikely(!(inode->i_opflags & (IOP_FASTPERM | IOP_FASTPERM_MAY_EXEC)))) + return inode_permission(idmap, inode, mask); + + if (unlikely(((inode->i_mode & 0111) != 0111) || !no_acl_inode(inode))) + return inode_permission(idmap, inode, mask); + + return security_inode_permission(inode, mask); } /** @@ -471,99 +667,308 @@ void path_put(const struct path *path) } EXPORT_SYMBOL(path_put); +#define EMBEDDED_LEVELS 2 +struct nameidata { + struct path path; + struct qstr last; + struct path root; + struct inode *inode; /* path.dentry.d_inode */ + unsigned int flags, state; + unsigned seq, next_seq, m_seq, r_seq; + int last_type; + unsigned depth; + int total_link_count; + struct saved { + struct path link; + struct delayed_call done; + const char *name; + unsigned seq; + } *stack, internal[EMBEDDED_LEVELS]; + struct filename *name; + const char *pathname; + struct nameidata *saved; + unsigned root_seq; + int dfd; + vfsuid_t dir_vfsuid; + umode_t dir_mode; +} __randomize_layout; + +#define ND_ROOT_PRESET 1 +#define ND_ROOT_GRABBED 2 +#define ND_JUMPED 4 + +static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name) +{ + struct nameidata *old = current->nameidata; + p->stack = p->internal; + p->depth = 0; + p->dfd = dfd; + p->name = name; + p->pathname = likely(name) ? name->name : ""; + p->path.mnt = NULL; + p->path.dentry = NULL; + p->total_link_count = old ? old->total_link_count : 0; + p->saved = old; + current->nameidata = p; +} + +static inline void set_nameidata(struct nameidata *p, int dfd, struct filename *name, + const struct path *root) +{ + __set_nameidata(p, dfd, name); + p->state = 0; + if (unlikely(root)) { + p->state = ND_ROOT_PRESET; + p->root = *root; + } +} + +static void restore_nameidata(void) +{ + struct nameidata *now = current->nameidata, *old = now->saved; + + current->nameidata = old; + if (old) + old->total_link_count = now->total_link_count; + if (now->stack != now->internal) + kfree(now->stack); +} + +static bool nd_alloc_stack(struct nameidata *nd) +{ + struct saved *p; + + p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved), + nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL); + if (unlikely(!p)) + return false; + memcpy(p, nd->internal, sizeof(nd->internal)); + nd->stack = p; + return true; +} + +/** + * path_connected - Verify that a dentry is below mnt.mnt_root + * @mnt: The mountpoint to check. + * @dentry: The dentry to check. + * + * Rename can sometimes move a file or directory outside of a bind + * mount, path_connected allows those cases to be detected. + */ +static bool path_connected(struct vfsmount *mnt, struct dentry *dentry) +{ + struct super_block *sb = mnt->mnt_sb; + + /* Bind mounts can have disconnected paths */ + if (mnt->mnt_root == sb->s_root) + return true; + + return is_subdir(dentry, mnt->mnt_root); +} + +static void drop_links(struct nameidata *nd) +{ + int i = nd->depth; + while (i--) { + struct saved *last = nd->stack + i; + do_delayed_call(&last->done); + clear_delayed_call(&last->done); + } +} + +static void leave_rcu(struct nameidata *nd) +{ + nd->flags &= ~LOOKUP_RCU; + nd->seq = nd->next_seq = 0; + rcu_read_unlock(); +} + +static void terminate_walk(struct nameidata *nd) +{ + if (unlikely(nd->depth)) + drop_links(nd); + if (!(nd->flags & LOOKUP_RCU)) { + int i; + path_put(&nd->path); + for (i = 0; i < nd->depth; i++) + path_put(&nd->stack[i].link); + if (nd->state & ND_ROOT_GRABBED) { + path_put(&nd->root); + nd->state &= ~ND_ROOT_GRABBED; + } + } else { + leave_rcu(nd); + } + nd->depth = 0; + nd->path.mnt = NULL; + nd->path.dentry = NULL; +} + +/* path_put is needed afterwards regardless of success or failure */ +static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq) +{ + int res = __legitimize_mnt(path->mnt, mseq); + if (unlikely(res)) { + if (res > 0) + path->mnt = NULL; + path->dentry = NULL; + return false; + } + if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) { + path->dentry = NULL; + return false; + } + return !read_seqcount_retry(&path->dentry->d_seq, seq); +} + +static inline bool legitimize_path(struct nameidata *nd, + struct path *path, unsigned seq) +{ + return __legitimize_path(path, seq, nd->m_seq); +} + +static bool legitimize_links(struct nameidata *nd) +{ + int i; + if (unlikely(nd->flags & LOOKUP_CACHED)) { + drop_links(nd); + nd->depth = 0; + return false; + } + for (i = 0; i < nd->depth; i++) { + struct saved *last = nd->stack + i; + if (unlikely(!legitimize_path(nd, &last->link, last->seq))) { + drop_links(nd); + nd->depth = i + 1; + return false; + } + } + return true; +} + +static bool legitimize_root(struct nameidata *nd) +{ + /* Nothing to do if nd->root is zero or is managed by the VFS user. */ + if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET)) + return true; + nd->state |= ND_ROOT_GRABBED; + return legitimize_path(nd, &nd->root, nd->root_seq); +} + /* * Path walking has 2 modes, rcu-walk and ref-walk (see * Documentation/filesystems/path-lookup.txt). In situations when we can't * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab - * normal reference counts on dentries and vfsmounts to transition to rcu-walk + * normal reference counts on dentries and vfsmounts to transition to ref-walk * mode. Refcounts are grabbed at the last known good point before rcu-walk * got stuck, so ref-walk may continue from there. If this is not successful * (eg. a seqcount has changed), then failure is returned and it's up to caller * to restart the path walk from the beginning in ref-walk mode. */ -static inline void lock_rcu_walk(void) +/** + * try_to_unlazy - try to switch to ref-walk mode. + * @nd: nameidata pathwalk data + * Returns: true on success, false on failure + * + * try_to_unlazy attempts to legitimize the current nd->path and nd->root + * for ref-walk mode. + * Must be called from rcu-walk context. + * Nothing should touch nameidata between try_to_unlazy() failure and + * terminate_walk(). + */ +static bool try_to_unlazy(struct nameidata *nd) { - br_read_lock(&vfsmount_lock); - rcu_read_lock(); -} + struct dentry *parent = nd->path.dentry; -static inline void unlock_rcu_walk(void) -{ - rcu_read_unlock(); - br_read_unlock(&vfsmount_lock); + BUG_ON(!(nd->flags & LOOKUP_RCU)); + + if (unlikely(nd->depth && !legitimize_links(nd))) + goto out1; + if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) + goto out; + if (unlikely(!legitimize_root(nd))) + goto out; + leave_rcu(nd); + BUG_ON(nd->inode != parent->d_inode); + return true; + +out1: + nd->path.mnt = NULL; + nd->path.dentry = NULL; +out: + leave_rcu(nd); + return false; } /** - * unlazy_walk - try to switch to ref-walk mode. + * try_to_unlazy_next - try to switch to ref-walk mode. * @nd: nameidata pathwalk data - * @dentry: child of nd->path.dentry or NULL - * Returns: 0 on success, -ECHILD on failure + * @dentry: next dentry to step into + * Returns: true on success, false on failure * - * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry - * for ref-walk mode. @dentry must be a path found by a do_lookup call on - * @nd or NULL. Must be called from rcu-walk context. + * Similar to try_to_unlazy(), but here we have the next dentry already + * picked by rcu-walk and want to legitimize that in addition to the current + * nd->path and nd->root for ref-walk mode. Must be called from rcu-walk context. + * Nothing should touch nameidata between try_to_unlazy_next() failure and + * terminate_walk(). */ -static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) +static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry) { - struct fs_struct *fs = current->fs; - struct dentry *parent = nd->path.dentry; - int want_root = 0; - + int res; BUG_ON(!(nd->flags & LOOKUP_RCU)); - if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { - want_root = 1; - spin_lock(&fs->lock); - if (nd->root.mnt != fs->root.mnt || - nd->root.dentry != fs->root.dentry) - goto err_root; - } - spin_lock(&parent->d_lock); - if (!dentry) { - if (!__d_rcu_to_refcount(parent, nd->seq)) - goto err_parent; - BUG_ON(nd->inode != parent->d_inode); - } else { - if (dentry->d_parent != parent) - goto err_parent; - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - if (!__d_rcu_to_refcount(dentry, nd->seq)) - goto err_child; - /* - * If the sequence check on the child dentry passed, then - * the child has not been removed from its parent. This - * means the parent dentry must be valid and able to take - * a reference at this point. - */ - BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent); - BUG_ON(!parent->d_count); - parent->d_count++; - spin_unlock(&dentry->d_lock); - } - spin_unlock(&parent->d_lock); - if (want_root) { - path_get(&nd->root); - spin_unlock(&fs->lock); + + if (unlikely(nd->depth && !legitimize_links(nd))) + goto out2; + res = __legitimize_mnt(nd->path.mnt, nd->m_seq); + if (unlikely(res)) { + if (res > 0) + goto out2; + goto out1; } - mntget(nd->path.mnt); + if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref))) + goto out1; - unlock_rcu_walk(); - nd->flags &= ~LOOKUP_RCU; - return 0; + /* + * We need to move both the parent and the dentry from the RCU domain + * to be properly refcounted. And the sequence number in the dentry + * validates *both* dentry counters, since we checked the sequence + * number of the parent after we got the child sequence number. So we + * know the parent must still be valid if the child sequence number is + */ + if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) + goto out; + if (read_seqcount_retry(&dentry->d_seq, nd->next_seq)) + goto out_dput; + /* + * Sequence counts matched. Now make sure that the root is + * still valid and get it if required. + */ + if (unlikely(!legitimize_root(nd))) + goto out_dput; + leave_rcu(nd); + return true; -err_child: - spin_unlock(&dentry->d_lock); -err_parent: - spin_unlock(&parent->d_lock); -err_root: - if (want_root) - spin_unlock(&fs->lock); - return -ECHILD; +out2: + nd->path.mnt = NULL; +out1: + nd->path.dentry = NULL; +out: + leave_rcu(nd); + return false; +out_dput: + leave_rcu(nd); + dput(dentry); + return false; } -static inline int d_revalidate(struct dentry *dentry, unsigned int flags) +static inline int d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { - return dentry->d_op->d_revalidate(dentry, flags); + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) + return dentry->d_op->d_revalidate(dir, name, dentry, flags); + else + return 1; } /** @@ -582,22 +987,40 @@ static int complete_walk(struct nameidata *nd) int status; if (nd->flags & LOOKUP_RCU) { - nd->flags &= ~LOOKUP_RCU; - if (!(nd->flags & LOOKUP_ROOT)) - nd->root.mnt = NULL; - spin_lock(&dentry->d_lock); - if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) { - spin_unlock(&dentry->d_lock); - unlock_rcu_walk(); + /* + * We don't want to zero nd->root for scoped-lookups or + * externally-managed nd->root. + */ + if (likely(!(nd->state & ND_ROOT_PRESET))) + if (likely(!(nd->flags & LOOKUP_IS_SCOPED))) + nd->root.mnt = NULL; + nd->flags &= ~LOOKUP_CACHED; + if (!try_to_unlazy(nd)) return -ECHILD; - } - BUG_ON(nd->inode != dentry->d_inode); - spin_unlock(&dentry->d_lock); - mntget(nd->path.mnt); - unlock_rcu_walk(); } - if (likely(!(nd->flags & LOOKUP_JUMPED))) + if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { + /* + * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't + * ever step outside the root during lookup" and should already + * be guaranteed by the rest of namei, we want to avoid a namei + * BUG resulting in userspace being given a path that was not + * scoped within the root at some point during the lookup. + * + * So, do a final sanity-check to make sure that in the + * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED) + * we won't silently return an fd completely outside of the + * requested root to userspace. + * + * Userspace could move the path outside the root after this + * check, but as discussed elsewhere this is not a concern (the + * resolved file was inside the root at some point). + */ + if (!path_is_under(&nd->path, &nd->root)) + return -EXDEV; + } + + if (likely(!(nd->state & ND_JUMPED))) return 0; if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE))) @@ -610,102 +1033,166 @@ static int complete_walk(struct nameidata *nd) if (!status) status = -ESTALE; - path_put(&nd->path); return status; } -static __always_inline void set_root(struct nameidata *nd) +static int set_root(struct nameidata *nd) { - if (!nd->root.mnt) - get_fs_root(current->fs, &nd->root); -} + struct fs_struct *fs = current->fs; -static int link_path_walk(const char *, struct nameidata *); + /* + * Jumping to the real root in a scoped-lookup is a BUG in namei, but we + * still have to ensure it doesn't happen because it will cause a breakout + * from the dirfd. + */ + if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED)) + return -ENOTRECOVERABLE; -static __always_inline void set_root_rcu(struct nameidata *nd) -{ - if (!nd->root.mnt) { - struct fs_struct *fs = current->fs; + if (nd->flags & LOOKUP_RCU) { unsigned seq; do { - seq = read_seqcount_begin(&fs->seq); + seq = read_seqbegin(&fs->seq); nd->root = fs->root; - nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); + nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq); + } while (read_seqretry(&fs->seq, seq)); + } else { + get_fs_root(fs, &nd->root); + nd->state |= ND_ROOT_GRABBED; } + return 0; } -static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) +static int nd_jump_root(struct nameidata *nd) { - int ret; - - if (IS_ERR(link)) - goto fail; - - if (*link == '/') { - set_root(nd); + if (unlikely(nd->flags & LOOKUP_BENEATH)) + return -EXDEV; + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) { + /* Absolute path arguments to path_init() are allowed. */ + if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt) + return -EXDEV; + } + if (!nd->root.mnt) { + int error = set_root(nd); + if (unlikely(error)) + return error; + } + if (nd->flags & LOOKUP_RCU) { + struct dentry *d; + nd->path = nd->root; + d = nd->path.dentry; + nd->inode = d->d_inode; + nd->seq = nd->root_seq; + if (read_seqcount_retry(&d->d_seq, nd->seq)) + return -ECHILD; + } else { path_put(&nd->path); nd->path = nd->root; - path_get(&nd->root); - nd->flags |= LOOKUP_JUMPED; + path_get(&nd->path); + nd->inode = nd->path.dentry->d_inode; } - nd->inode = nd->path.dentry->d_inode; - - ret = link_path_walk(link, nd); - return ret; -fail: - path_put(&nd->path); - return PTR_ERR(link); + nd->state |= ND_JUMPED; + return 0; } -static void path_put_conditional(struct path *path, struct nameidata *nd) +/* + * Helper to directly jump to a known parsed path from ->get_link, + * caller must have taken a reference to path beforehand. + */ +int nd_jump_link(const struct path *path) { - dput(path->dentry); - if (path->mnt != nd->path.mnt) - mntput(path->mnt); -} + int error = -ELOOP; + struct nameidata *nd = current->nameidata; -static inline void path_to_nameidata(const struct path *path, - struct nameidata *nd) -{ - if (!(nd->flags & LOOKUP_RCU)) { - dput(nd->path.dentry); + if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS)) + goto err; + + error = -EXDEV; + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) { if (nd->path.mnt != path->mnt) - mntput(nd->path.mnt); + goto err; } - nd->path.mnt = path->mnt; - nd->path.dentry = path->dentry; -} + /* Not currently safe for scoped-lookups. */ + if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) + goto err; -/* - * Helper to directly jump to a known parsed path from ->follow_link, - * caller must have taken a reference to path beforehand. - */ -void nd_jump_link(struct nameidata *nd, struct path *path) -{ path_put(&nd->path); - nd->path = *path; nd->inode = nd->path.dentry->d_inode; - nd->flags |= LOOKUP_JUMPED; + nd->state |= ND_JUMPED; + return 0; + +err: + path_put(path); + return error; } -static inline void put_link(struct nameidata *nd, struct path *link, void *cookie) +static inline void put_link(struct nameidata *nd) { - struct inode *inode = link->dentry->d_inode; - if (inode->i_op->put_link) - inode->i_op->put_link(link->dentry, nd, cookie); - path_put(link); + struct saved *last = nd->stack + --nd->depth; + do_delayed_call(&last->done); + if (!(nd->flags & LOOKUP_RCU)) + path_put(&last->link); +} + +static int sysctl_protected_symlinks __read_mostly; +static int sysctl_protected_hardlinks __read_mostly; +static int sysctl_protected_fifos __read_mostly; +static int sysctl_protected_regular __read_mostly; + +#ifdef CONFIG_SYSCTL +static const struct ctl_table namei_sysctls[] = { + { + .procname = "protected_symlinks", + .data = &sysctl_protected_symlinks, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "protected_hardlinks", + .data = &sysctl_protected_hardlinks, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "protected_fifos", + .data = &sysctl_protected_fifos, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "protected_regular", + .data = &sysctl_protected_regular, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, +}; + +static int __init init_fs_namei_sysctls(void) +{ + register_sysctl_init("fs", namei_sysctls); + return 0; } +fs_initcall(init_fs_namei_sysctls); -int sysctl_protected_symlinks __read_mostly = 0; -int sysctl_protected_hardlinks __read_mostly = 0; +#endif /* CONFIG_SYSCTL */ /** * may_follow_link - Check symlink following for unsafe situations - * @link: The path of the symlink * @nd: nameidata pathwalk data + * @inode: Used for idmapping. * * In the case of the sysctl_protected_symlinks sysctl being enabled, * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is @@ -718,36 +1205,39 @@ int sysctl_protected_hardlinks __read_mostly = 0; * * Returns 0 if following the symlink is allowed, -ve on error. */ -static inline int may_follow_link(struct path *link, struct nameidata *nd) +static inline int may_follow_link(struct nameidata *nd, const struct inode *inode) { - const struct inode *inode; - const struct inode *parent; + struct mnt_idmap *idmap; + vfsuid_t vfsuid; if (!sysctl_protected_symlinks) return 0; + idmap = mnt_idmap(nd->path.mnt); + vfsuid = i_uid_into_vfsuid(idmap, inode); /* Allowed if owner and follower match. */ - inode = link->dentry->d_inode; - if (uid_eq(current_cred()->fsuid, inode->i_uid)) + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return 0; /* Allowed if parent directory not sticky and world-writable. */ - parent = nd->path.dentry->d_inode; - if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) + if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) return 0; /* Allowed if parent directory and link owner match. */ - if (uid_eq(parent->i_uid, inode->i_uid)) + if (vfsuid_valid(nd->dir_vfsuid) && vfsuid_eq(nd->dir_vfsuid, vfsuid)) return 0; - audit_log_link_denied("follow_link", link); - path_put_conditional(link, nd); - path_put(&nd->path); + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + audit_inode(nd->name, nd->stack[0].link.dentry, 0); + audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link"); return -EACCES; } /** * safe_hardlink_source - Check for safe hardlink conditions + * @idmap: idmap of the mount the inode was found from * @inode: the source inode to hardlink from * * Return false if at least one of the following conditions: @@ -758,7 +1248,8 @@ static inline int may_follow_link(struct path *link, struct nameidata *nd) * * Otherwise returns true. */ -static bool safe_hardlink_source(struct inode *inode) +static bool safe_hardlink_source(struct mnt_idmap *idmap, + struct inode *inode) { umode_t mode = inode->i_mode; @@ -775,7 +1266,7 @@ static bool safe_hardlink_source(struct inode *inode) return false; /* Hardlinking to unreadable or unwritable sources is dangerous. */ - if (inode_permission(inode, MAY_READ | MAY_WRITE)) + if (inode_permission(idmap, inode, MAY_READ | MAY_WRITE)) return false; return true; @@ -783,100 +1274,116 @@ static bool safe_hardlink_source(struct inode *inode) /** * may_linkat - Check permissions for creating a hardlink - * @link: the source to hardlink from + * @idmap: idmap of the mount the inode was found from + * @link: the source to hardlink from * * Block hardlink when all of: * - sysctl_protected_hardlinks enabled * - fsuid does not match inode * - hardlink source is unsafe (see safe_hardlink_source() above) - * - not CAP_FOWNER + * - not CAP_FOWNER in a namespace with the inode owner uid mapped + * + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply pass @nop_mnt_idmap. * * Returns 0 if successful, -ve on error. */ -static int may_linkat(struct path *link) +int may_linkat(struct mnt_idmap *idmap, const struct path *link) { - const struct cred *cred; - struct inode *inode; + struct inode *inode = link->dentry->d_inode; + + /* Inode writeback is not safe when the uid or gid are invalid. */ + if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || + !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) + return -EOVERFLOW; if (!sysctl_protected_hardlinks) return 0; - cred = current_cred(); - inode = link->dentry->d_inode; - /* Source inode owner (or CAP_FOWNER) can hardlink all they like, * otherwise, it must be a safe source. */ - if (uid_eq(cred->fsuid, inode->i_uid) || safe_hardlink_source(inode) || - capable(CAP_FOWNER)) + if (safe_hardlink_source(idmap, inode) || + inode_owner_or_capable(idmap, inode)) return 0; - audit_log_link_denied("linkat", link); + audit_log_path_denied(AUDIT_ANOM_LINK, "linkat"); return -EPERM; } -static __always_inline int -follow_link(struct path *link, struct nameidata *nd, void **p) +/** + * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory + * should be allowed, or not, on files that already + * exist. + * @idmap: idmap of the mount the inode was found from + * @nd: nameidata pathwalk data + * @inode: the inode of the file to open + * + * Block an O_CREAT open of a FIFO (or a regular file) when: + * - sysctl_protected_fifos (or sysctl_protected_regular) is enabled + * - the file already exists + * - we are in a sticky directory + * - we don't own the file + * - the owner of the directory doesn't own the file + * - the directory is world writable + * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2 + * the directory doesn't have to be world writable: being group writable will + * be enough. + * + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply pass @nop_mnt_idmap. + * + * Returns 0 if the open is allowed, -ve on error. + */ +static int may_create_in_sticky(struct mnt_idmap *idmap, struct nameidata *nd, + struct inode *const inode) { - struct dentry *dentry = link->dentry; - int error; - char *s; + umode_t dir_mode = nd->dir_mode; + vfsuid_t dir_vfsuid = nd->dir_vfsuid, i_vfsuid; - BUG_ON(nd->flags & LOOKUP_RCU); + if (likely(!(dir_mode & S_ISVTX))) + return 0; - if (link->mnt == nd->path.mnt) - mntget(link->mnt); + if (S_ISREG(inode->i_mode) && !sysctl_protected_regular) + return 0; - error = -ELOOP; - if (unlikely(current->total_link_count >= 40)) - goto out_put_nd_path; + if (S_ISFIFO(inode->i_mode) && !sysctl_protected_fifos) + return 0; - cond_resched(); - current->total_link_count++; + i_vfsuid = i_uid_into_vfsuid(idmap, inode); - touch_atime(link); - nd_set_link(nd, NULL); + if (vfsuid_eq(i_vfsuid, dir_vfsuid)) + return 0; - error = security_inode_follow_link(link->dentry, nd); - if (error) - goto out_put_nd_path; - - nd->last_type = LAST_BIND; - *p = dentry->d_inode->i_op->follow_link(dentry, nd); - error = PTR_ERR(*p); - if (IS_ERR(*p)) - goto out_put_nd_path; - - error = 0; - s = nd_get_link(nd); - if (s) { - error = __vfs_follow_link(nd, s); - if (unlikely(error)) - put_link(nd, link, *p); - } + if (vfsuid_eq_kuid(i_vfsuid, current_fsuid())) + return 0; - return error; + if (likely(dir_mode & 0002)) { + audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create"); + return -EACCES; + } -out_put_nd_path: - *p = NULL; - path_put(&nd->path); - path_put(link); - return error; -} + if (dir_mode & 0020) { + if (sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) { + audit_log_path_denied(AUDIT_ANOM_CREAT, + "sticky_create_fifo"); + return -EACCES; + } -static int follow_up_rcu(struct path *path) -{ - struct mount *mnt = real_mount(path->mnt); - struct mount *parent; - struct dentry *mountpoint; + if (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode)) { + audit_log_path_denied(AUDIT_ANOM_CREAT, + "sticky_create_regular"); + return -EACCES; + } + } - parent = mnt->mnt_parent; - if (&parent->mnt == path->mnt) - return 0; - mountpoint = mnt->mnt_mountpoint; - path->dentry = mountpoint; - path->mnt = &parent->mnt; - return 1; + return 0; } /* @@ -895,35 +1402,76 @@ int follow_up(struct path *path) struct mount *parent; struct dentry *mountpoint; - br_read_lock(&vfsmount_lock); + read_seqlock_excl(&mount_lock); parent = mnt->mnt_parent; if (parent == mnt) { - br_read_unlock(&vfsmount_lock); + read_sequnlock_excl(&mount_lock); return 0; } mntget(&parent->mnt); mountpoint = dget(mnt->mnt_mountpoint); - br_read_unlock(&vfsmount_lock); + read_sequnlock_excl(&mount_lock); dput(path->dentry); path->dentry = mountpoint; mntput(path->mnt); path->mnt = &parent->mnt; return 1; } +EXPORT_SYMBOL(follow_up); + +static bool choose_mountpoint_rcu(struct mount *m, const struct path *root, + struct path *path, unsigned *seqp) +{ + while (mnt_has_parent(m)) { + struct dentry *mountpoint = m->mnt_mountpoint; + + m = m->mnt_parent; + if (unlikely(root->dentry == mountpoint && + root->mnt == &m->mnt)) + break; + if (mountpoint != m->mnt.mnt_root) { + path->mnt = &m->mnt; + path->dentry = mountpoint; + *seqp = read_seqcount_begin(&mountpoint->d_seq); + return true; + } + } + return false; +} + +static bool choose_mountpoint(struct mount *m, const struct path *root, + struct path *path) +{ + bool found; + + rcu_read_lock(); + while (1) { + unsigned seq, mseq = read_seqbegin(&mount_lock); + + found = choose_mountpoint_rcu(m, root, path, &seq); + if (unlikely(!found)) { + if (!read_seqretry(&mount_lock, mseq)) + break; + } else { + if (likely(__legitimize_path(path, seq, mseq))) + break; + rcu_read_unlock(); + path_put(path); + rcu_read_lock(); + } + } + rcu_read_unlock(); + return found; +} /* * Perform an automount * - return -EISDIR to tell follow_managed() to stop and return the path we * were called with. */ -static int follow_automount(struct path *path, unsigned flags, - bool *need_mntput) +static int follow_automount(struct path *path, int *count, unsigned lookup_flags) { - struct vfsmount *mnt; - int err; - - if (!path->dentry->d_op || !path->dentry->d_op->d_automount) - return -EREMOTE; + struct dentry *dentry = path->dentry; /* We don't want to mount if someone's just doing a stat - * unless they're stat'ing a directory and appended a '/' to @@ -936,125 +1484,101 @@ static int follow_automount(struct path *path, unsigned flags, * as being automount points. These will need the attentions * of the daemon to instantiate them before they can be used. */ - if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | - LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && - path->dentry->d_inode) + if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | + LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && + dentry->d_inode) return -EISDIR; - current->total_link_count++; - if (current->total_link_count >= 40) - return -ELOOP; - - mnt = path->dentry->d_op->d_automount(path); - if (IS_ERR(mnt)) { - /* - * The filesystem is allowed to return -EISDIR here to indicate - * it doesn't want to automount. For instance, autofs would do - * this so that its userspace daemon can mount on this dentry. - * - * However, we can only permit this if it's a terminal point in - * the path being looked up; if it wasn't then the remainder of - * the path is inaccessible and we should say so. - */ - if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_PARENT)) - return -EREMOTE; - return PTR_ERR(mnt); - } - - if (!mnt) /* mount collision */ - return 0; - - if (!*need_mntput) { - /* lock_mount() may release path->mnt on error */ - mntget(path->mnt); - *need_mntput = true; - } - err = finish_automount(mnt, path); + /* No need to trigger automounts if mountpoint crossing is disabled. */ + if (lookup_flags & LOOKUP_NO_XDEV) + return -EXDEV; - switch (err) { - case -EBUSY: - /* Someone else made a mount here whilst we were busy */ - return 0; - case 0: - path_put(path); - path->mnt = mnt; - path->dentry = dget(mnt->mnt_root); - return 0; - default: - return err; - } + if (count && (*count)++ >= MAXSYMLINKS) + return -ELOOP; + return finish_automount(dentry->d_op->d_automount(path), path); } /* - * Handle a dentry that is managed in some way. - * - Flagged for transit management (autofs) - * - Flagged as mountpoint - * - Flagged as automount point - * - * This may only be called in refwalk mode. - * - * Serialization is taken care of in namespace.c + * mount traversal - out-of-line part. One note on ->d_flags accesses - + * dentries are pinned but not locked here, so negative dentry can go + * positive right under us. Use of smp_load_acquire() provides a barrier + * sufficient for ->d_inode and ->d_flags consistency. */ -static int follow_managed(struct path *path, unsigned flags) +static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, + int *count, unsigned lookup_flags) { - struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */ - unsigned managed; + struct vfsmount *mnt = path->mnt; bool need_mntput = false; int ret = 0; - /* Given that we're not holding a lock here, we retain the value in a - * local variable for each dentry as we look at it so that we don't see - * the components of that value change under us */ - while (managed = ACCESS_ONCE(path->dentry->d_flags), - managed &= DCACHE_MANAGED_DENTRY, - unlikely(managed != 0)) { - /* Allow the filesystem to manage the transit without i_mutex + while (flags & DCACHE_MANAGED_DENTRY) { + /* Allow the filesystem to manage the transit without i_rwsem * being held. */ - if (managed & DCACHE_MANAGE_TRANSIT) { - BUG_ON(!path->dentry->d_op); - BUG_ON(!path->dentry->d_op->d_manage); - ret = path->dentry->d_op->d_manage(path->dentry, false); + if (flags & DCACHE_MANAGE_TRANSIT) { + if (lookup_flags & LOOKUP_NO_XDEV) { + ret = -EXDEV; + break; + } + ret = path->dentry->d_op->d_manage(path, false); + flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) break; } - /* Transit to a mounted filesystem. */ - if (managed & DCACHE_MOUNTED) { + if (flags & DCACHE_MOUNTED) { // something's mounted on it.. struct vfsmount *mounted = lookup_mnt(path); - if (mounted) { + if (mounted) { // ... in our namespace dput(path->dentry); if (need_mntput) mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root); + // here we know it's positive + flags = path->dentry->d_flags; need_mntput = true; + if (unlikely(lookup_flags & LOOKUP_NO_XDEV)) { + ret = -EXDEV; + break; + } continue; } - - /* Something is mounted on this dentry in another - * namespace and/or whatever was mounted there in this - * namespace got unmounted before we managed to get the - * vfsmount_lock */ } - /* Handle an automount point */ - if (managed & DCACHE_NEED_AUTOMOUNT) { - ret = follow_automount(path, flags, &need_mntput); - if (ret < 0) - break; - continue; - } + if (!(flags & DCACHE_NEED_AUTOMOUNT)) + break; - /* We didn't change the current path point */ - break; + // uncovered automount point + ret = follow_automount(path, count, lookup_flags); + flags = smp_load_acquire(&path->dentry->d_flags); + if (ret < 0) + break; } - if (need_mntput && path->mnt == mnt) - mntput(path->mnt); if (ret == -EISDIR) ret = 0; - return ret < 0 ? ret : need_mntput; + // possible if you race with several mount --move + if (need_mntput && path->mnt == mnt) + mntput(path->mnt); + if (!ret && unlikely(d_flags_negative(flags))) + ret = -ENOENT; + *jumped = need_mntput; + return ret; +} + +static inline int traverse_mounts(struct path *path, bool *jumped, + int *count, unsigned lookup_flags) +{ + unsigned flags = smp_load_acquire(&path->dentry->d_flags); + + /* fastpath */ + if (likely(!(flags & DCACHE_MANAGED_DENTRY))) { + *jumped = false; + if (unlikely(d_flags_negative(flags))) + return -ENOENT; + return 0; + } + return __traverse_mounts(path, flags, jumped, count, lookup_flags); } int follow_down_one(struct path *path) @@ -1071,424 +1595,434 @@ int follow_down_one(struct path *path) } return 0; } +EXPORT_SYMBOL(follow_down_one); -static inline bool managed_dentry_might_block(struct dentry *dentry) +/* + * Follow down to the covering mount currently visible to userspace. At each + * point, the filesystem owning that dentry may be queried as to whether the + * caller is permitted to proceed or not. + */ +int follow_down(struct path *path, unsigned int flags) { - return (dentry->d_flags & DCACHE_MANAGE_TRANSIT && - dentry->d_op->d_manage(dentry, true) < 0); + struct vfsmount *mnt = path->mnt; + bool jumped; + int ret = traverse_mounts(path, &jumped, NULL, flags); + + if (path->mnt != mnt) + mntput(mnt); + return ret; } +EXPORT_SYMBOL(follow_down); /* * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if * we meet a managed dentry that would need blocking. */ -static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, - struct inode **inode) +static bool __follow_mount_rcu(struct nameidata *nd, struct path *path) { + struct dentry *dentry = path->dentry; + unsigned int flags = dentry->d_flags; + + if (likely(!(flags & DCACHE_MANAGED_DENTRY))) + return true; + + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + return false; + for (;;) { - struct mount *mounted; /* * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. */ - if (unlikely(managed_dentry_might_block(path->dentry))) - return false; - - if (!d_mountpoint(path->dentry)) - break; - - mounted = __lookup_mnt(path->mnt, path->dentry, 1); - if (!mounted) - break; - path->mnt = &mounted->mnt; - path->dentry = mounted->mnt.mnt_root; - nd->flags |= LOOKUP_JUMPED; - nd->seq = read_seqcount_begin(&path->dentry->d_seq); - /* - * Update the inode too. We don't need to re-check the - * dentry sequence number here after this d_inode read, - * because a mount-point is always pinned. - */ - *inode = path->dentry->d_inode; - } - return true; -} - -static void follow_mount_rcu(struct nameidata *nd) -{ - while (d_mountpoint(nd->path.dentry)) { - struct mount *mounted; - mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1); - if (!mounted) - break; - nd->path.mnt = &mounted->mnt; - nd->path.dentry = mounted->mnt.mnt_root; - nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); - } -} - -static int follow_dotdot_rcu(struct nameidata *nd) -{ - set_root_rcu(nd); - - while (1) { - if (nd->path.dentry == nd->root.dentry && - nd->path.mnt == nd->root.mnt) { - break; + if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) { + int res = dentry->d_op->d_manage(path, true); + if (res) + return res == -EISDIR; + flags = dentry->d_flags; } - if (nd->path.dentry != nd->path.mnt->mnt_root) { - struct dentry *old = nd->path.dentry; - struct dentry *parent = old->d_parent; - unsigned seq; - seq = read_seqcount_begin(&parent->d_seq); - if (read_seqcount_retry(&old->d_seq, nd->seq)) - goto failed; - nd->path.dentry = parent; - nd->seq = seq; - break; + if (flags & DCACHE_MOUNTED) { + struct mount *mounted = __lookup_mnt(path->mnt, dentry); + if (mounted) { + path->mnt = &mounted->mnt; + dentry = path->dentry = mounted->mnt.mnt_root; + nd->state |= ND_JUMPED; + nd->next_seq = read_seqcount_begin(&dentry->d_seq); + flags = dentry->d_flags; + // makes sure that non-RCU pathwalk could reach + // this state. + if (read_seqretry(&mount_lock, nd->m_seq)) + return false; + continue; + } + if (read_seqretry(&mount_lock, nd->m_seq)) + return false; } - if (!follow_up_rcu(&nd->path)) - break; - nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); + return !(flags & DCACHE_NEED_AUTOMOUNT); } - follow_mount_rcu(nd); - nd->inode = nd->path.dentry->d_inode; - return 0; - -failed: - nd->flags &= ~LOOKUP_RCU; - if (!(nd->flags & LOOKUP_ROOT)) - nd->root.mnt = NULL; - unlock_rcu_walk(); - return -ECHILD; } -/* - * Follow down to the covering mount currently visible to userspace. At each - * point, the filesystem owning that dentry may be queried as to whether the - * caller is permitted to proceed or not. - */ -int follow_down(struct path *path) +static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, + struct path *path) { - unsigned managed; + bool jumped; int ret; - while (managed = ACCESS_ONCE(path->dentry->d_flags), - unlikely(managed & DCACHE_MANAGED_DENTRY)) { - /* Allow the filesystem to manage the transit without i_mutex - * being held. - * - * We indicate to the filesystem if someone is trying to mount - * something here. This gives autofs the chance to deny anyone - * other than its daemon the right to mount on its - * superstructure. - * - * The filesystem may sleep at this point. - */ - if (managed & DCACHE_MANAGE_TRANSIT) { - BUG_ON(!path->dentry->d_op); - BUG_ON(!path->dentry->d_op->d_manage); - ret = path->dentry->d_op->d_manage( - path->dentry, false); - if (ret < 0) - return ret == -EISDIR ? 0 : ret; - } - - /* Transit to a mounted filesystem. */ - if (managed & DCACHE_MOUNTED) { - struct vfsmount *mounted = lookup_mnt(path); - if (!mounted) - break; - dput(path->dentry); - mntput(path->mnt); - path->mnt = mounted; - path->dentry = dget(mounted->mnt_root); - continue; - } - - /* Don't handle automount points here */ - break; + path->mnt = nd->path.mnt; + path->dentry = dentry; + if (nd->flags & LOOKUP_RCU) { + unsigned int seq = nd->next_seq; + if (likely(!d_managed(dentry))) + return 0; + if (likely(__follow_mount_rcu(nd, path))) + return 0; + // *path and nd->next_seq might've been clobbered + path->mnt = nd->path.mnt; + path->dentry = dentry; + nd->next_seq = seq; + if (unlikely(!try_to_unlazy_next(nd, dentry))) + return -ECHILD; } - return 0; -} - -/* - * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() - */ -static void follow_mount(struct path *path) -{ - while (d_mountpoint(path->dentry)) { - struct vfsmount *mounted = lookup_mnt(path); - if (!mounted) - break; + ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); + if (jumped) + nd->state |= ND_JUMPED; + if (unlikely(ret)) { dput(path->dentry); - mntput(path->mnt); - path->mnt = mounted; - path->dentry = dget(mounted->mnt_root); - } -} - -static void follow_dotdot(struct nameidata *nd) -{ - set_root(nd); - - while(1) { - struct dentry *old = nd->path.dentry; - - if (nd->path.dentry == nd->root.dentry && - nd->path.mnt == nd->root.mnt) { - break; - } - if (nd->path.dentry != nd->path.mnt->mnt_root) { - /* rare case of legitimate dget_parent()... */ - nd->path.dentry = dget_parent(nd->path.dentry); - dput(old); - break; - } - if (!follow_up(&nd->path)) - break; + if (path->mnt != nd->path.mnt) + mntput(path->mnt); } - follow_mount(&nd->path); - nd->inode = nd->path.dentry->d_inode; + return ret; } /* - * This looks up the name in dcache, possibly revalidates the old dentry and - * allocates a new one if not found or not valid. In the need_lookup argument - * returns whether i_op->lookup is necessary. - * - * dir->d_inode->i_mutex must be held + * This looks up the name in dcache and possibly revalidates the found dentry. + * NULL is returned if the dentry does not exist in the cache. */ -static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir, - unsigned int flags, bool *need_lookup) +static struct dentry *lookup_dcache(const struct qstr *name, + struct dentry *dir, + unsigned int flags) { - struct dentry *dentry; - int error; - - *need_lookup = false; - dentry = d_lookup(dir, name); + struct dentry *dentry = d_lookup(dir, name); if (dentry) { - if (dentry->d_flags & DCACHE_OP_REVALIDATE) { - error = d_revalidate(dentry, flags); - if (unlikely(error <= 0)) { - if (error < 0) { - dput(dentry); - return ERR_PTR(error); - } else if (!d_invalidate(dentry)) { - dput(dentry); - dentry = NULL; - } - } + int error = d_revalidate(dir->d_inode, name, dentry, flags); + if (unlikely(error <= 0)) { + if (!error) + d_invalidate(dentry); + dput(dentry); + return ERR_PTR(error); } } - - if (!dentry) { - dentry = d_alloc(dir, name); - if (unlikely(!dentry)) - return ERR_PTR(-ENOMEM); - - *need_lookup = true; - } return dentry; } /* - * Call i_op->lookup on the dentry. The dentry must be negative but may be - * hashed if it was pouplated with DCACHE_NEED_LOOKUP. - * - * dir->d_inode->i_mutex must be held + * Parent directory has inode locked exclusive. This is one + * and only case when ->lookup() gets called on non in-lookup + * dentries - as the matter of fact, this only gets called + * when directory is guaranteed to have no in-lookup children + * at all. + * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed. + * Will return -EEXIST if name is found and LOOKUP_EXCL was passed. */ -static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry, - unsigned int flags) +struct dentry *lookup_one_qstr_excl(const struct qstr *name, + struct dentry *base, unsigned int flags) { + struct dentry *dentry; struct dentry *old; + struct inode *dir; + + dentry = lookup_dcache(name, base, flags); + if (dentry) + goto found; /* Don't create child dentry for a dead directory. */ - if (unlikely(IS_DEADDIR(dir))) { - dput(dentry); + dir = base->d_inode; + if (unlikely(IS_DEADDIR(dir))) return ERR_PTR(-ENOENT); - } + + dentry = d_alloc(base, name); + if (unlikely(!dentry)) + return ERR_PTR(-ENOMEM); old = dir->i_op->lookup(dir, dentry, flags); if (unlikely(old)) { dput(dentry); dentry = old; } - return dentry; -} - -static struct dentry *__lookup_hash(struct qstr *name, - struct dentry *base, unsigned int flags) -{ - bool need_lookup; - struct dentry *dentry; - - dentry = lookup_dcache(name, base, flags, &need_lookup); - if (!need_lookup) +found: + if (IS_ERR(dentry)) return dentry; - - return lookup_real(base->d_inode, dentry, flags); + if (d_is_negative(dentry) && !(flags & LOOKUP_CREATE)) { + dput(dentry); + return ERR_PTR(-ENOENT); + } + if (d_is_positive(dentry) && (flags & LOOKUP_EXCL)) { + dput(dentry); + return ERR_PTR(-EEXIST); + } + return dentry; } +EXPORT_SYMBOL(lookup_one_qstr_excl); -/* - * It's more convoluted than I'd like it to be, but... it's still fairly - * small and for now I'd prefer to have fast path as straight as possible. - * It _is_ time-critical. +/** + * lookup_fast - do fast lockless (but racy) lookup of a dentry + * @nd: current nameidata + * + * Do a fast, but racy lookup in the dcache for the given dentry, and + * revalidate it. Returns a valid dentry pointer or NULL if one wasn't + * found. On error, an ERR_PTR will be returned. + * + * If this function returns a valid dentry and the walk is no longer + * lazy, the dentry will carry a reference that must later be put. If + * RCU mode is still in force, then this is not the case and the dentry + * must be legitimized before use. If this returns NULL, then the walk + * will no longer be in RCU mode. */ -static int lookup_fast(struct nameidata *nd, - struct path *path, struct inode **inode) +static struct dentry *lookup_fast(struct nameidata *nd) { - struct vfsmount *mnt = nd->path.mnt; struct dentry *dentry, *parent = nd->path.dentry; - int need_reval = 1; int status = 1; - int err; /* * Rename seqlock is not required here because in the off chance - * of a false negative due to a concurrent rename, we're going to - * do the non-racy lookup, below. + * of a false negative due to a concurrent rename, the caller is + * going to fall back to non-racy lookup. */ if (nd->flags & LOOKUP_RCU) { - unsigned seq; - dentry = __d_lookup_rcu(parent, &nd->last, &seq); - if (!dentry) - goto unlazy; - - /* - * This sequence count validates that the inode matches - * the dentry name information from lookup. - */ - *inode = dentry->d_inode; - if (read_seqcount_retry(&dentry->d_seq, seq)) - return -ECHILD; + dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq); + if (unlikely(!dentry)) { + if (!try_to_unlazy(nd)) + return ERR_PTR(-ECHILD); + return NULL; + } /* * This sequence count validates that the parent had no * changes while we did the lookup of the dentry above. - * - * The memory barrier in read_seqcount_begin of child is - * enough, we can use __read_seqcount_retry here. */ - if (__read_seqcount_retry(&parent->d_seq, nd->seq)) - return -ECHILD; - nd->seq = seq; - - if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { - status = d_revalidate(dentry, nd->flags); - if (unlikely(status <= 0)) { - if (status != -ECHILD) - need_reval = 0; - goto unlazy; - } - } - path->mnt = mnt; - path->dentry = dentry; - if (unlikely(!__follow_mount_rcu(nd, path, inode))) - goto unlazy; - if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) - goto unlazy; - return 0; -unlazy: - if (unlazy_walk(nd, dentry)) - return -ECHILD; + if (read_seqcount_retry(&parent->d_seq, nd->seq)) + return ERR_PTR(-ECHILD); + + status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags); + if (likely(status > 0)) + return dentry; + if (!try_to_unlazy_next(nd, dentry)) + return ERR_PTR(-ECHILD); + if (status == -ECHILD) + /* we'd been told to redo it in non-rcu mode */ + status = d_revalidate(nd->inode, &nd->last, + dentry, nd->flags); } else { dentry = __d_lookup(parent, &nd->last); + if (unlikely(!dentry)) + return NULL; + status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags); + } + if (unlikely(status <= 0)) { + if (!status) + d_invalidate(dentry); + dput(dentry); + return ERR_PTR(status); } + return dentry; +} - if (unlikely(!dentry)) - goto need_lookup; +/* Fast lookup failed, do it the slow way */ +static struct dentry *__lookup_slow(const struct qstr *name, + struct dentry *dir, + unsigned int flags) +{ + struct dentry *dentry, *old; + struct inode *inode = dir->d_inode; + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); - if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval) - status = d_revalidate(dentry, nd->flags); - if (unlikely(status <= 0)) { - if (status < 0) { + /* Don't go there if it's already dead */ + if (unlikely(IS_DEADDIR(inode))) + return ERR_PTR(-ENOENT); +again: + dentry = d_alloc_parallel(dir, name, &wq); + if (IS_ERR(dentry)) + return dentry; + if (unlikely(!d_in_lookup(dentry))) { + int error = d_revalidate(inode, name, dentry, flags); + if (unlikely(error <= 0)) { + if (!error) { + d_invalidate(dentry); + dput(dentry); + goto again; + } dput(dentry); - return status; + dentry = ERR_PTR(error); } - if (!d_invalidate(dentry)) { + } else { + old = inode->i_op->lookup(inode, dentry, flags); + d_lookup_done(dentry); + if (unlikely(old)) { dput(dentry); - goto need_lookup; + dentry = old; } } + return dentry; +} - path->mnt = mnt; - path->dentry = dentry; - err = follow_managed(path, nd->flags); - if (unlikely(err < 0)) { - path_put_conditional(path, nd); - return err; - } - if (err) - nd->flags |= LOOKUP_JUMPED; - *inode = path->dentry->d_inode; - return 0; +static noinline struct dentry *lookup_slow(const struct qstr *name, + struct dentry *dir, + unsigned int flags) +{ + struct inode *inode = dir->d_inode; + struct dentry *res; + inode_lock_shared(inode); + res = __lookup_slow(name, dir, flags); + inode_unlock_shared(inode); + return res; +} -need_lookup: - return 1; +static struct dentry *lookup_slow_killable(const struct qstr *name, + struct dentry *dir, + unsigned int flags) +{ + struct inode *inode = dir->d_inode; + struct dentry *res; + + if (inode_lock_shared_killable(inode)) + return ERR_PTR(-EINTR); + res = __lookup_slow(name, dir, flags); + inode_unlock_shared(inode); + return res; } -/* Fast lookup failed, do it the slow way */ -static int lookup_slow(struct nameidata *nd, struct path *path) +static inline int may_lookup(struct mnt_idmap *idmap, + struct nameidata *restrict nd) { - struct dentry *dentry, *parent; - int err; + int err, mask; - parent = nd->path.dentry; - BUG_ON(nd->inode != parent->d_inode); + mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0; + err = lookup_inode_permission_may_exec(idmap, nd->inode, mask); + if (likely(!err)) + return 0; - mutex_lock(&parent->d_inode->i_mutex); - dentry = __lookup_hash(&nd->last, parent, nd->flags); - mutex_unlock(&parent->d_inode->i_mutex); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - path->mnt = nd->path.mnt; - path->dentry = dentry; - err = follow_managed(path, nd->flags); - if (unlikely(err < 0)) { - path_put_conditional(path, nd); + // If we failed, and we weren't in LOOKUP_RCU, it's final + if (!(nd->flags & LOOKUP_RCU)) return err; - } - if (err) - nd->flags |= LOOKUP_JUMPED; - return 0; + + // Drop out of RCU mode to make sure it wasn't transient + if (!try_to_unlazy(nd)) + return -ECHILD; // redo it all non-lazy + + if (err != -ECHILD) // hard error + return err; + + return lookup_inode_permission_may_exec(idmap, nd->inode, 0); } -static inline int may_lookup(struct nameidata *nd) +static int reserve_stack(struct nameidata *nd, struct path *link) { + if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) + return -ELOOP; + + if (likely(nd->depth != EMBEDDED_LEVELS)) + return 0; + if (likely(nd->stack != nd->internal)) + return 0; + if (likely(nd_alloc_stack(nd))) + return 0; + if (nd->flags & LOOKUP_RCU) { - int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK); - if (err != -ECHILD) - return err; - if (unlazy_walk(nd, NULL)) + // we need to grab link before we do unlazy. And we can't skip + // unlazy even if we fail to grab the link - cleanup needs it + bool grabbed_link = legitimize_path(nd, link, nd->next_seq); + + if (!try_to_unlazy(nd) || !grabbed_link) return -ECHILD; + + if (nd_alloc_stack(nd)) + return 0; } - return inode_permission(nd->inode, MAY_EXEC); + return -ENOMEM; } -static inline int handle_dots(struct nameidata *nd, int type) +enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; + +static noinline const char *pick_link(struct nameidata *nd, struct path *link, + struct inode *inode, int flags) { - if (type == LAST_DOTDOT) { + struct saved *last; + const char *res; + int error; + + if (nd->flags & LOOKUP_RCU) { + /* make sure that d_is_symlink from step_into_slowpath() matches the inode */ + if (read_seqcount_retry(&link->dentry->d_seq, nd->next_seq)) + return ERR_PTR(-ECHILD); + } else { + if (link->mnt == nd->path.mnt) + mntget(link->mnt); + } + + error = reserve_stack(nd, link); + if (unlikely(error)) { + if (!(nd->flags & LOOKUP_RCU)) + path_put(link); + return ERR_PTR(error); + } + last = nd->stack + nd->depth++; + last->link = *link; + clear_delayed_call(&last->done); + last->seq = nd->next_seq; + + if (flags & WALK_TRAILING) { + error = may_follow_link(nd, inode); + if (unlikely(error)) + return ERR_PTR(error); + } + + if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) || + unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW)) + return ERR_PTR(-ELOOP); + + if (unlikely(atime_needs_update(&last->link, inode))) { if (nd->flags & LOOKUP_RCU) { - if (follow_dotdot_rcu(nd)) - return -ECHILD; - } else - follow_dotdot(nd); + if (!try_to_unlazy(nd)) + return ERR_PTR(-ECHILD); + } + touch_atime(&last->link); + cond_resched(); } - return 0; -} -static void terminate_walk(struct nameidata *nd) -{ - if (!(nd->flags & LOOKUP_RCU)) { - path_put(&nd->path); - } else { - nd->flags &= ~LOOKUP_RCU; - if (!(nd->flags & LOOKUP_ROOT)) - nd->root.mnt = NULL; - unlock_rcu_walk(); + error = security_inode_follow_link(link->dentry, inode, + nd->flags & LOOKUP_RCU); + if (unlikely(error)) + return ERR_PTR(error); + + res = READ_ONCE(inode->i_link); + if (!res) { + const char * (*get)(struct dentry *, struct inode *, + struct delayed_call *); + get = inode->i_op->get_link; + if (nd->flags & LOOKUP_RCU) { + res = get(NULL, inode, &last->done); + if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd)) + res = get(link->dentry, inode, &last->done); + } else { + res = get(link->dentry, inode, &last->done); + } + if (!res) + goto all_done; + if (IS_ERR(res)) + return res; } + if (*res == '/') { + error = nd_jump_root(nd); + if (unlikely(error)) + return ERR_PTR(error); + while (unlikely(*++res == '/')) + ; + } + if (*res) + return res; +all_done: // pure jump + put_link(nd); + return NULL; } /* @@ -1496,124 +2030,201 @@ static void terminate_walk(struct nameidata *nd) * to do this check without having to look at inode->i_op, * so we keep a cache of "no, this doesn't need follow_link" * for the common case. + * + * NOTE: dentry must be what nd->next_seq had been sampled from. */ -static inline int should_follow_link(struct inode *inode, int follow) +static noinline const char *step_into_slowpath(struct nameidata *nd, int flags, + struct dentry *dentry) { - if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { - if (likely(inode->i_op->follow_link)) - return follow; + struct path path; + struct inode *inode; + int err; - /* This gets set once for the inode lifetime */ - spin_lock(&inode->i_lock); - inode->i_opflags |= IOP_NOFOLLOW; - spin_unlock(&inode->i_lock); + err = handle_mounts(nd, dentry, &path); + if (unlikely(err < 0)) + return ERR_PTR(err); + inode = path.dentry->d_inode; + if (likely(!d_is_symlink(path.dentry)) || + ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) || + (flags & WALK_NOFOLLOW)) { + /* not a symlink or should not follow */ + if (nd->flags & LOOKUP_RCU) { + if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) + return ERR_PTR(-ECHILD); + if (unlikely(!inode)) + return ERR_PTR(-ENOENT); + } else { + dput(nd->path.dentry); + if (nd->path.mnt != path.mnt) + mntput(nd->path.mnt); + } + nd->path = path; + nd->inode = inode; + nd->seq = nd->next_seq; + return NULL; } - return 0; + return pick_link(nd, &path, inode, flags); } -static inline int walk_component(struct nameidata *nd, struct path *path, - int follow) +static __always_inline const char *step_into(struct nameidata *nd, int flags, + struct dentry *dentry) { - struct inode *inode; - int err; /* - * "." and ".." are special - ".." especially so because it has - * to be able to know about the current root directory and - * parent relationships. + * In the common case we are in rcu-walk and traversing over a non-mounted on + * directory (as opposed to e.g., a symlink). + * + * We can handle that and negative entries with the checks below. */ - if (unlikely(nd->last_type != LAST_NORM)) - return handle_dots(nd, nd->last_type); - err = lookup_fast(nd, path, &inode); - if (unlikely(err)) { - if (err < 0) - goto out_err; - - err = lookup_slow(nd, path); - if (err < 0) - goto out_err; - - inode = path->dentry->d_inode; + if (likely((nd->flags & LOOKUP_RCU) && + !d_managed(dentry) && !d_is_symlink(dentry))) { + struct inode *inode = dentry->d_inode; + if (read_seqcount_retry(&dentry->d_seq, nd->next_seq)) + return ERR_PTR(-ECHILD); + if (unlikely(!inode)) + return ERR_PTR(-ENOENT); + nd->path.dentry = dentry; + /* nd->path.mnt is retained on purpose */ + nd->inode = inode; + nd->seq = nd->next_seq; + return NULL; } - err = -ENOENT; - if (!inode) - goto out_path_put; + return step_into_slowpath(nd, flags, dentry); +} - if (should_follow_link(inode, follow)) { - if (nd->flags & LOOKUP_RCU) { - if (unlikely(unlazy_walk(nd, path->dentry))) { - err = -ECHILD; - goto out_err; - } - } - BUG_ON(inode != path->dentry->d_inode); - return 1; - } - path_to_nameidata(path, nd); - nd->inode = inode; - return 0; +static struct dentry *follow_dotdot_rcu(struct nameidata *nd) +{ + struct dentry *parent, *old; -out_path_put: - path_to_nameidata(path, nd); -out_err: - terminate_walk(nd); - return err; + if (path_equal(&nd->path, &nd->root)) + goto in_root; + if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { + struct path path; + unsigned seq; + if (!choose_mountpoint_rcu(real_mount(nd->path.mnt), + &nd->root, &path, &seq)) + goto in_root; + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + return ERR_PTR(-ECHILD); + nd->path = path; + nd->inode = path.dentry->d_inode; + nd->seq = seq; + // makes sure that non-RCU pathwalk could reach this state + if (read_seqretry(&mount_lock, nd->m_seq)) + return ERR_PTR(-ECHILD); + /* we know that mountpoint was pinned */ + } + old = nd->path.dentry; + parent = old->d_parent; + nd->next_seq = read_seqcount_begin(&parent->d_seq); + // makes sure that non-RCU pathwalk could reach this state + if (read_seqcount_retry(&old->d_seq, nd->seq)) + return ERR_PTR(-ECHILD); + if (unlikely(!path_connected(nd->path.mnt, parent))) + return ERR_PTR(-ECHILD); + return parent; +in_root: + if (read_seqretry(&mount_lock, nd->m_seq)) + return ERR_PTR(-ECHILD); + if (unlikely(nd->flags & LOOKUP_BENEATH)) + return ERR_PTR(-ECHILD); + nd->next_seq = nd->seq; + return nd->path.dentry; } -/* - * This limits recursive symlink follows to 8, while - * limiting consecutive symlinks to 40. - * - * Without that kind of total limit, nasty chains of consecutive - * symlinks can cause almost arbitrarily long lookups. - */ -static inline int nested_symlink(struct path *path, struct nameidata *nd) +static struct dentry *follow_dotdot(struct nameidata *nd) { - int res; + struct dentry *parent; + + if (path_equal(&nd->path, &nd->root)) + goto in_root; + if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { + struct path path; - if (unlikely(current->link_count >= MAX_NESTED_LINKS)) { - path_put_conditional(path, nd); + if (!choose_mountpoint(real_mount(nd->path.mnt), + &nd->root, &path)) + goto in_root; path_put(&nd->path); - return -ELOOP; + nd->path = path; + nd->inode = path.dentry->d_inode; + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + return ERR_PTR(-EXDEV); + } + /* rare case of legitimate dget_parent()... */ + parent = dget_parent(nd->path.dentry); + if (unlikely(!path_connected(nd->path.mnt, parent))) { + dput(parent); + return ERR_PTR(-ENOENT); } - BUG_ON(nd->depth >= MAX_NESTED_LINKS); + return parent; - nd->depth++; - current->link_count++; +in_root: + if (unlikely(nd->flags & LOOKUP_BENEATH)) + return ERR_PTR(-EXDEV); + return dget(nd->path.dentry); +} - do { - struct path link = *path; - void *cookie; +static const char *handle_dots(struct nameidata *nd, int type) +{ + if (type == LAST_DOTDOT) { + const char *error = NULL; + struct dentry *parent; - res = follow_link(&link, nd, &cookie); - if (res) - break; - res = walk_component(nd, path, LOOKUP_FOLLOW); - put_link(nd, &link, cookie); - } while (res > 0); + if (!nd->root.mnt) { + error = ERR_PTR(set_root(nd)); + if (unlikely(error)) + return error; + } + if (nd->flags & LOOKUP_RCU) + parent = follow_dotdot_rcu(nd); + else + parent = follow_dotdot(nd); + if (IS_ERR(parent)) + return ERR_CAST(parent); + error = step_into(nd, WALK_NOFOLLOW, parent); + if (unlikely(error)) + return error; - current->link_count--; - nd->depth--; - return res; + if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { + /* + * If there was a racing rename or mount along our + * path, then we can't be sure that ".." hasn't jumped + * above nd->root (and so userspace should retry or use + * some fallback). + */ + smp_rmb(); + if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)) + return ERR_PTR(-EAGAIN); + if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)) + return ERR_PTR(-EAGAIN); + } + } + return NULL; } -/* - * We really don't want to look at inode->i_op->lookup - * when we don't have to. So we keep a cache bit in - * the inode ->i_opflags field that says "yes, we can - * do lookup on this inode". - */ -static inline int can_lookup(struct inode *inode) +static __always_inline const char *walk_component(struct nameidata *nd, int flags) { - if (likely(inode->i_opflags & IOP_LOOKUP)) - return 1; - if (likely(!inode->i_op->lookup)) - return 0; - - /* We do this once for the lifetime of the inode */ - spin_lock(&inode->i_lock); - inode->i_opflags |= IOP_LOOKUP; - spin_unlock(&inode->i_lock); - return 1; + struct dentry *dentry; + /* + * "." and ".." are special - ".." especially so because it has + * to be able to know about the current root directory and + * parent relationships. + */ + if (unlikely(nd->last_type != LAST_NORM)) { + if (unlikely(nd->depth) && !(flags & WALK_MORE)) + put_link(nd); + return handle_dots(nd, nd->last_type); + } + dentry = lookup_fast(nd); + if (IS_ERR(dentry)) + return ERR_CAST(dentry); + if (unlikely(!dentry)) { + dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags); + if (IS_ERR(dentry)) + return ERR_CAST(dentry); + } + if (unlikely(nd->depth) && !(flags & WALK_MORE)) + put_link(nd); + return step_into(nd, flags, dentry); } /* @@ -1624,11 +2235,6 @@ static inline int can_lookup(struct inode *inode) * do a "get_unaligned()" if this helps and is sufficiently * fast. * - * - Little-endian machines (so that we can generate the mask - * of low bytes efficiently). Again, we *could* do a byte - * swapping load on big-endian architectures if that is not - * expensive enough to make the optimization worthless. - * * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we * do not trap on the (extremely unlikely) case of a page * crossing operation. @@ -1642,56 +2248,170 @@ static inline int can_lookup(struct inode *inode) #include <asm/word-at-a-time.h> -#ifdef CONFIG_64BIT +#ifdef HASH_MIX + +/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */ -static inline unsigned int fold_hash(unsigned long hash) +#elif defined(CONFIG_64BIT) +/* + * Register pressure in the mixing function is an issue, particularly + * on 32-bit x86, but almost any function requires one state value and + * one temporary. Instead, use a function designed for two state values + * and no temporaries. + * + * This function cannot create a collision in only two iterations, so + * we have two iterations to achieve avalanche. In those two iterations, + * we have six layers of mixing, which is enough to spread one bit's + * influence out to 2^6 = 64 state bits. + * + * Rotate constants are scored by considering either 64 one-bit input + * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the + * probability of that delta causing a change to each of the 128 output + * bits, using a sample of random initial states. + * + * The Shannon entropy of the computed probabilities is then summed + * to produce a score. Ideally, any input change has a 50% chance of + * toggling any given output bit. + * + * Mixing scores (in bits) for (12,45): + * Input delta: 1-bit 2-bit + * 1 round: 713.3 42542.6 + * 2 rounds: 2753.7 140389.8 + * 3 rounds: 5954.1 233458.2 + * 4 rounds: 7862.6 256672.2 + * Perfect: 8192 258048 + * (64*128) (64*63/2 * 128) + */ +#define HASH_MIX(x, y, a) \ + ( x ^= (a), \ + y ^= x, x = rol64(x,12),\ + x += y, y = rol64(y,45),\ + y *= 9 ) + +/* + * Fold two longs into one 32-bit hash value. This must be fast, but + * latency isn't quite as critical, as there is a fair bit of additional + * work done before the hash value is used. + */ +static inline unsigned int fold_hash(unsigned long x, unsigned long y) { - hash += hash >> (8*sizeof(int)); - return hash; + y ^= x * GOLDEN_RATIO_64; + y *= GOLDEN_RATIO_64; + return y >> 32; } #else /* 32-bit case */ -#define fold_hash(x) (x) +/* + * Mixing scores (in bits) for (7,20): + * Input delta: 1-bit 2-bit + * 1 round: 330.3 9201.6 + * 2 rounds: 1246.4 25475.4 + * 3 rounds: 1907.1 31295.1 + * 4 rounds: 2042.3 31718.6 + * Perfect: 2048 31744 + * (32*64) (32*31/2 * 64) + */ +#define HASH_MIX(x, y, a) \ + ( x ^= (a), \ + y ^= x, x = rol32(x, 7),\ + x += y, y = rol32(y,20),\ + y *= 9 ) + +static inline unsigned int fold_hash(unsigned long x, unsigned long y) +{ + /* Use arch-optimized multiply if one exists */ + return __hash_32(y ^ __hash_32(x)); +} #endif -unsigned int full_name_hash(const unsigned char *name, unsigned int len) +/* + * Return the hash of a string of known length. This is carfully + * designed to match hash_name(), which is the more critical function. + * In particular, we must end by hashing a final word containing 0..7 + * payload bytes, to match the way that hash_name() iterates until it + * finds the delimiter after the name. + */ +unsigned int full_name_hash(const void *salt, const char *name, unsigned int len) { - unsigned long a, mask; - unsigned long hash = 0; + unsigned long a, x = 0, y = (unsigned long)salt; for (;;) { + if (!len) + goto done; a = load_unaligned_zeropad(name); if (len < sizeof(unsigned long)) break; - hash += a; - hash *= 9; + HASH_MIX(x, y, a); name += sizeof(unsigned long); len -= sizeof(unsigned long); - if (!len) - goto done; } - mask = ~(~0ul << len*8); - hash += mask & a; + x ^= a & bytemask_from_count(len); done: - return fold_hash(hash); + return fold_hash(x, y); } EXPORT_SYMBOL(full_name_hash); +/* Return the "hash_len" (hash and length) of a null-terminated string */ +u64 hashlen_string(const void *salt, const char *name) +{ + unsigned long a = 0, x = 0, y = (unsigned long)salt; + unsigned long adata, mask, len; + const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; + + len = 0; + goto inside; + + do { + HASH_MIX(x, y, a); + len += sizeof(unsigned long); +inside: + a = load_unaligned_zeropad(name+len); + } while (!has_zero(a, &adata, &constants)); + + adata = prep_zero_mask(a, adata, &constants); + mask = create_zero_mask(adata); + x ^= a & zero_bytemask(mask); + + return hashlen_create(fold_hash(x, y), len + find_zero(mask)); +} +EXPORT_SYMBOL(hashlen_string); + /* * Calculate the length and hash of the path component, and - * return the length of the component; + * return the length as the result. */ -static inline unsigned long hash_name(const char *name, unsigned int *hashp) +static inline const char *hash_name(struct nameidata *nd, + const char *name, + unsigned long *lastword) { - unsigned long a, b, adata, bdata, mask, hash, len; + unsigned long a, b, x, y = (unsigned long)nd->path.dentry; + unsigned long adata, bdata, mask, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; - hash = a = 0; - len = -sizeof(unsigned long); + /* + * The first iteration is special, because it can result in + * '.' and '..' and has no mixing other than the final fold. + */ + a = load_unaligned_zeropad(name); + b = a ^ REPEAT_BYTE('/'); + if (has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)) { + adata = prep_zero_mask(a, adata, &constants); + bdata = prep_zero_mask(b, bdata, &constants); + mask = create_zero_mask(adata | bdata); + a &= zero_bytemask(mask); + *lastword = a; + len = find_zero(mask); + nd->last.hash = fold_hash(a, y); + nd->last.len = len; + return name + len; + } + + len = 0; + x = 0; do { - hash = (hash + a) * 9; + HASH_MIX(x, y, a); len += sizeof(unsigned long); a = load_unaligned_zeropad(name+len); b = a ^ REPEAT_BYTE('/'); @@ -1699,47 +2419,87 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp) adata = prep_zero_mask(a, adata, &constants); bdata = prep_zero_mask(b, bdata, &constants); - mask = create_zero_mask(adata | bdata); - - hash += a & zero_bytemask(mask); - *hashp = fold_hash(hash); - - return len + find_zero(mask); + a &= zero_bytemask(mask); + x ^= a; + len += find_zero(mask); + *lastword = 0; // Multi-word components cannot be DOT or DOTDOT + + nd->last.hash = fold_hash(x, y); + nd->last.len = len; + return name + len; } -#else +/* + * Note that the 'last' word is always zero-masked, but + * was loaded as a possibly big-endian word. + */ +#ifdef __BIG_ENDIAN + #define LAST_WORD_IS_DOT (0x2eul << (BITS_PER_LONG-8)) + #define LAST_WORD_IS_DOTDOT (0x2e2eul << (BITS_PER_LONG-16)) +#endif -unsigned int full_name_hash(const unsigned char *name, unsigned int len) +#else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */ + +/* Return the hash of a string of known length */ +unsigned int full_name_hash(const void *salt, const char *name, unsigned int len) { - unsigned long hash = init_name_hash(); + unsigned long hash = init_name_hash(salt); while (len--) - hash = partial_name_hash(*name++, hash); + hash = partial_name_hash((unsigned char)*name++, hash); return end_name_hash(hash); } EXPORT_SYMBOL(full_name_hash); +/* Return the "hash_len" (hash and length) of a null-terminated string */ +u64 hashlen_string(const void *salt, const char *name) +{ + unsigned long hash = init_name_hash(salt); + unsigned long len = 0, c; + + c = (unsigned char)*name; + while (c) { + len++; + hash = partial_name_hash(c, hash); + c = (unsigned char)name[len]; + } + return hashlen_create(end_name_hash(hash), len); +} +EXPORT_SYMBOL(hashlen_string); + /* * We know there's a real path component here of at least * one character. */ -static inline unsigned long hash_name(const char *name, unsigned int *hashp) +static inline const char *hash_name(struct nameidata *nd, const char *name, unsigned long *lastword) { - unsigned long hash = init_name_hash(); - unsigned long len = 0, c; + unsigned long hash = init_name_hash(nd->path.dentry); + unsigned long len = 0, c, last = 0; c = (unsigned char)*name; do { + last = (last << 8) + c; len++; hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } while (c && c != '/'); - *hashp = end_name_hash(hash); - return len; + + // This is reliable for DOT or DOTDOT, since the component + // cannot contain NUL characters - top bits being zero means + // we cannot have had any other pathnames. + *lastword = last; + nd->last.hash = end_name_hash(hash); + nd->last.len = len; + return name + len; } #endif +#ifndef LAST_WORD_IS_DOT + #define LAST_WORD_IS_DOT 0x2e + #define LAST_WORD_IS_DOTDOT 0x2e2e +#endif + /* * Name resolution. * This is the basic name resolution function, turning a pathname into @@ -1750,301 +2510,505 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp) */ static int link_path_walk(const char *name, struct nameidata *nd) { - struct path next; + int depth = 0; // depth <= nd->depth int err; - - while (*name=='/') - name++; - if (!*name) + + nd->last_type = LAST_ROOT; + nd->flags |= LOOKUP_PARENT; + if (IS_ERR(name)) + return PTR_ERR(name); + if (*name == '/') { + do { + name++; + } while (unlikely(*name == '/')); + } + if (unlikely(!*name)) { + nd->dir_mode = 0; // short-circuit the 'hardening' idiocy return 0; + } /* At this point we know we have a real path component. */ for(;;) { - struct qstr this; - long len; - int type; + struct mnt_idmap *idmap; + const char *link; + unsigned long lastword; + + idmap = mnt_idmap(nd->path.mnt); + err = may_lookup(idmap, nd); + if (unlikely(err)) + return err; + + nd->last.name = name; + name = hash_name(nd, name, &lastword); - err = may_lookup(nd); - if (err) + switch(lastword) { + case LAST_WORD_IS_DOTDOT: + nd->last_type = LAST_DOTDOT; + nd->state |= ND_JUMPED; break; - len = hash_name(name, &this.hash); - this.name = name; - this.len = len; + case LAST_WORD_IS_DOT: + nd->last_type = LAST_DOT; + break; + + default: + nd->last_type = LAST_NORM; + nd->state &= ~ND_JUMPED; - type = LAST_NORM; - if (name[0] == '.') switch (len) { - case 2: - if (name[1] == '.') { - type = LAST_DOTDOT; - nd->flags |= LOOKUP_JUMPED; - } - break; - case 1: - type = LAST_DOT; - } - if (likely(type == LAST_NORM)) { struct dentry *parent = nd->path.dentry; - nd->flags &= ~LOOKUP_JUMPED; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { - err = parent->d_op->d_hash(parent, &this); + err = parent->d_op->d_hash(parent, &nd->last); if (err < 0) - break; + return err; } } - nd->last = this; - nd->last_type = type; - - if (!name[len]) - return 0; + if (!*name) + goto OK; /* * If it wasn't NUL, we know it was '/'. Skip that * slash, and continue until no more slashes. */ do { - len++; - } while (unlikely(name[len] == '/')); - if (!name[len]) - return 0; - - name += len; - - err = walk_component(nd, &next, LOOKUP_FOLLOW); - if (err < 0) - return err; - - if (err) { - err = nested_symlink(&next, nd); - if (err) - return err; + name++; + } while (unlikely(*name == '/')); + if (unlikely(!*name)) { +OK: + /* pathname or trailing symlink, done */ + if (likely(!depth)) { + nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode); + nd->dir_mode = nd->inode->i_mode; + nd->flags &= ~LOOKUP_PARENT; + return 0; + } + /* last component of nested symlink */ + name = nd->stack[--depth].name; + link = walk_component(nd, 0); + } else { + /* not the last component */ + link = walk_component(nd, WALK_MORE); } - if (!can_lookup(nd->inode)) { - err = -ENOTDIR; - break; + if (unlikely(link)) { + if (IS_ERR(link)) + return PTR_ERR(link); + /* a symlink to follow */ + nd->stack[depth++].name = name; + name = link; + continue; + } + if (unlikely(!d_can_lookup(nd->path.dentry))) { + if (nd->flags & LOOKUP_RCU) { + if (!try_to_unlazy(nd)) + return -ECHILD; + } + return -ENOTDIR; } } - terminate_walk(nd); - return err; } -static int path_init(int dfd, const char *name, unsigned int flags, - struct nameidata *nd, struct file **fp) +/* must be paired with terminate_walk() */ +static const char *path_init(struct nameidata *nd, unsigned flags) { - int retval = 0; + int error; + const char *s = nd->pathname; - nd->last_type = LAST_ROOT; /* if there are only slashes... */ - nd->flags = flags | LOOKUP_JUMPED; - nd->depth = 0; - if (flags & LOOKUP_ROOT) { - struct inode *inode = nd->root.dentry->d_inode; - if (*name) { - if (!can_lookup(inode)) - return -ENOTDIR; - retval = inode_permission(inode, MAY_EXEC); - if (retval) - return retval; - } + /* LOOKUP_CACHED requires RCU, ask caller to retry */ + if (unlikely((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)) + return ERR_PTR(-EAGAIN); + + if (unlikely(!*s)) + flags &= ~LOOKUP_RCU; + if (flags & LOOKUP_RCU) + rcu_read_lock(); + else + nd->seq = nd->next_seq = 0; + + nd->flags = flags; + nd->state |= ND_JUMPED; + + nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount); + nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount); + smp_rmb(); + + if (unlikely(nd->state & ND_ROOT_PRESET)) { + struct dentry *root = nd->root.dentry; + struct inode *inode = root->d_inode; + if (*s && unlikely(!d_can_lookup(root))) + return ERR_PTR(-ENOTDIR); nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { - lock_rcu_walk(); - nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); + nd->root_seq = nd->seq; } else { path_get(&nd->path); } - return 0; + return s; } nd->root.mnt = NULL; - if (*name=='/') { - if (flags & LOOKUP_RCU) { - lock_rcu_walk(); - set_root_rcu(nd); - } else { - set_root(nd); - path_get(&nd->root); - } - nd->path = nd->root; - } else if (dfd == AT_FDCWD) { + /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */ + if (*s == '/' && likely(!(flags & LOOKUP_IN_ROOT))) { + error = nd_jump_root(nd); + if (unlikely(error)) + return ERR_PTR(error); + return s; + } + + /* Relative pathname -- get the starting-point it is relative to. */ + if (nd->dfd == AT_FDCWD) { if (flags & LOOKUP_RCU) { struct fs_struct *fs = current->fs; unsigned seq; - lock_rcu_walk(); - do { - seq = read_seqcount_begin(&fs->seq); + seq = read_seqbegin(&fs->seq); nd->path = fs->pwd; + nd->inode = nd->path.dentry->d_inode; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); + } while (read_seqretry(&fs->seq, seq)); } else { get_fs_pwd(current->fs, &nd->path); + nd->inode = nd->path.dentry->d_inode; } } else { /* Caller must check execute permissions on the starting path component */ - struct fd f = fdget_raw(dfd); + CLASS(fd_raw, f)(nd->dfd); struct dentry *dentry; - if (!f.file) - return -EBADF; - - dentry = f.file->f_path.dentry; + if (fd_empty(f)) + return ERR_PTR(-EBADF); - if (*name) { - if (!can_lookup(dentry->d_inode)) { - fdput(f); - return -ENOTDIR; - } + if (flags & LOOKUP_LINKAT_EMPTY) { + if (fd_file(f)->f_cred != current_cred() && + !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH)) + return ERR_PTR(-ENOENT); } - nd->path = f.file->f_path; + dentry = fd_file(f)->f_path.dentry; + + if (*s && unlikely(!d_can_lookup(dentry))) + return ERR_PTR(-ENOTDIR); + + nd->path = fd_file(f)->f_path; if (flags & LOOKUP_RCU) { - if (f.need_put) - *fp = f.file; - nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - lock_rcu_walk(); + nd->inode = nd->path.dentry->d_inode; + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } else { path_get(&nd->path); - fdput(f); + nd->inode = nd->path.dentry->d_inode; } } - nd->inode = nd->path.dentry->d_inode; - return 0; + /* For scoped-lookups we need to set the root to the dirfd as well. */ + if (unlikely(flags & LOOKUP_IS_SCOPED)) { + nd->root = nd->path; + if (flags & LOOKUP_RCU) { + nd->root_seq = nd->seq; + } else { + path_get(&nd->root); + nd->state |= ND_ROOT_GRABBED; + } + } + return s; } -static inline int lookup_last(struct nameidata *nd, struct path *path) +static inline const char *lookup_last(struct nameidata *nd) { if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; - nd->flags &= ~LOOKUP_PARENT; - return walk_component(nd, path, nd->flags & LOOKUP_FOLLOW); + return walk_component(nd, WALK_TRAILING); } -/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ -static int path_lookupat(int dfd, const char *name, - unsigned int flags, struct nameidata *nd) +static int handle_lookup_down(struct nameidata *nd) { - struct file *base = NULL; - struct path path; - int err; - - /* - * Path walking is largely split up into 2 different synchronisation - * schemes, rcu-walk and ref-walk (explained in - * Documentation/filesystems/path-lookup.txt). These share much of the - * path walk code, but some things particularly setup, cleanup, and - * following mounts are sufficiently divergent that functions are - * duplicated. Typically there is a function foo(), and its RCU - * analogue, foo_rcu(). - * - * -ECHILD is the error number of choice (just to avoid clashes) that - * is returned if some aspect of an rcu-walk fails. Such an error must - * be handled by restarting a traditional ref-walk (which will always - * be able to complete). - */ - err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base); - - if (unlikely(err)) - return err; + if (!(nd->flags & LOOKUP_RCU)) + dget(nd->path.dentry); + nd->next_seq = nd->seq; + return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry)); +} - current->total_link_count = 0; - err = link_path_walk(name, nd); +/* Returns 0 and nd will be valid on success; Returns error, otherwise. */ +static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path) +{ + const char *s = path_init(nd, flags); + int err; - if (!err && !(flags & LOOKUP_PARENT)) { - err = lookup_last(nd, &path); - while (err > 0) { - void *cookie; - struct path link = path; - err = may_follow_link(&link, nd); - if (unlikely(err)) - break; - nd->flags |= LOOKUP_PARENT; - err = follow_link(&link, nd, &cookie); - if (err) - break; - err = lookup_last(nd, &path); - put_link(nd, &link, cookie); - } + if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) { + err = handle_lookup_down(nd); + if (unlikely(err < 0)) + s = ERR_PTR(err); } + while (!(err = link_path_walk(s, nd)) && + (s = lookup_last(nd)) != NULL) + ; + if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { + err = handle_lookup_down(nd); + nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please... + } if (!err) err = complete_walk(nd); - if (!err && nd->flags & LOOKUP_DIRECTORY) { - if (!can_lookup(nd->inode)) { - path_put(&nd->path); + if (!err && nd->flags & LOOKUP_DIRECTORY) + if (!d_can_lookup(nd->path.dentry)) err = -ENOTDIR; - } + if (!err) { + *path = nd->path; + nd->path.mnt = NULL; + nd->path.dentry = NULL; } + terminate_walk(nd); + return err; +} + +int filename_lookup(int dfd, struct filename *name, unsigned flags, + struct path *path, const struct path *root) +{ + int retval; + struct nameidata nd; + if (IS_ERR(name)) + return PTR_ERR(name); + set_nameidata(&nd, dfd, name, root); + retval = path_lookupat(&nd, flags | LOOKUP_RCU, path); + if (unlikely(retval == -ECHILD)) + retval = path_lookupat(&nd, flags, path); + if (unlikely(retval == -ESTALE)) + retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path); - if (base) - fput(base); + if (likely(!retval)) + audit_inode(name, path->dentry, + flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0); + restore_nameidata(); + return retval; +} - if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { - path_put(&nd->root); - nd->root.mnt = NULL; +/* Returns 0 and nd will be valid on success; Returns error, otherwise. */ +static int path_parentat(struct nameidata *nd, unsigned flags, + struct path *parent) +{ + const char *s = path_init(nd, flags); + int err = link_path_walk(s, nd); + if (!err) + err = complete_walk(nd); + if (!err) { + *parent = nd->path; + nd->path.mnt = NULL; + nd->path.dentry = NULL; } + terminate_walk(nd); return err; } -static int filename_lookup(int dfd, struct filename *name, - unsigned int flags, struct nameidata *nd) +/* Note: this does not consume "name" */ +static int __filename_parentat(int dfd, struct filename *name, + unsigned int flags, struct path *parent, + struct qstr *last, int *type, + const struct path *root) { - int retval = path_lookupat(dfd, name->name, flags | LOOKUP_RCU, nd); + int retval; + struct nameidata nd; + + if (IS_ERR(name)) + return PTR_ERR(name); + set_nameidata(&nd, dfd, name, root); + retval = path_parentat(&nd, flags | LOOKUP_RCU, parent); if (unlikely(retval == -ECHILD)) - retval = path_lookupat(dfd, name->name, flags, nd); + retval = path_parentat(&nd, flags, parent); if (unlikely(retval == -ESTALE)) - retval = path_lookupat(dfd, name->name, - flags | LOOKUP_REVAL, nd); - - if (likely(!retval)) - audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT); + retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent); + if (likely(!retval)) { + *last = nd.last; + *type = nd.last_type; + audit_inode(name, parent->dentry, AUDIT_INODE_PARENT); + } + restore_nameidata(); return retval; } -static int do_path_lookup(int dfd, const char *name, - unsigned int flags, struct nameidata *nd) +static int filename_parentat(int dfd, struct filename *name, + unsigned int flags, struct path *parent, + struct qstr *last, int *type) { - struct filename filename = { .name = name }; + return __filename_parentat(dfd, name, flags, parent, last, type, NULL); +} - return filename_lookup(dfd, &filename, flags, nd); +/** + * start_dirop - begin a create or remove dirop, performing locking and lookup + * @parent: the dentry of the parent in which the operation will occur + * @name: a qstr holding the name within that parent + * @lookup_flags: intent and other lookup flags. + * + * The lookup is performed and necessary locks are taken so that, on success, + * the returned dentry can be operated on safely. + * The qstr must already have the hash value calculated. + * + * Returns: a locked dentry, or an error. + * + */ +static struct dentry *__start_dirop(struct dentry *parent, struct qstr *name, + unsigned int lookup_flags, + unsigned int state) +{ + struct dentry *dentry; + struct inode *dir = d_inode(parent); + + if (state == TASK_KILLABLE) { + int ret = down_write_killable_nested(&dir->i_rwsem, + I_MUTEX_PARENT); + if (ret) + return ERR_PTR(ret); + } else { + inode_lock_nested(dir, I_MUTEX_PARENT); + } + dentry = lookup_one_qstr_excl(name, parent, lookup_flags); + if (IS_ERR(dentry)) + inode_unlock(dir); + return dentry; } +struct dentry *start_dirop(struct dentry *parent, struct qstr *name, + unsigned int lookup_flags) +{ + return __start_dirop(parent, name, lookup_flags, TASK_NORMAL); +} + +/** + * end_dirop - signal completion of a dirop + * @de: the dentry which was returned by start_dirop or similar. + * + * If the de is an error, nothing happens. Otherwise any lock taken to + * protect the dentry is dropped and the dentry itself is release (dput()). + */ +void end_dirop(struct dentry *de) +{ + if (!IS_ERR(de)) { + inode_unlock(de->d_parent->d_inode); + dput(de); + } +} +EXPORT_SYMBOL(end_dirop); + /* does lookup, returns the object with parent locked */ -struct dentry *kern_path_locked(const char *name, struct path *path) +static struct dentry *__start_removing_path(int dfd, struct filename *name, + struct path *path) { - struct nameidata nd; + struct path parent_path __free(path_put) = {}; struct dentry *d; - int err = do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, &nd); - if (err) - return ERR_PTR(err); - if (nd.last_type != LAST_NORM) { - path_put(&nd.path); + struct qstr last; + int type, error; + + error = filename_parentat(dfd, name, 0, &parent_path, &last, &type); + if (error) + return ERR_PTR(error); + if (unlikely(type != LAST_NORM)) return ERR_PTR(-EINVAL); - } - mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); - d = __lookup_hash(&nd.last, nd.path.dentry, 0); - if (IS_ERR(d)) { - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - path_put(&nd.path); + /* don't fail immediately if it's r/o, at least try to report other errors */ + error = mnt_want_write(parent_path.mnt); + d = start_dirop(parent_path.dentry, &last, 0); + if (IS_ERR(d)) + goto drop; + if (error) + goto fail; + path->dentry = no_free_ptr(parent_path.dentry); + path->mnt = no_free_ptr(parent_path.mnt); + return d; + +fail: + end_dirop(d); + d = ERR_PTR(error); +drop: + if (!error) + mnt_drop_write(parent_path.mnt); + return d; +} + +/** + * kern_path_parent: lookup path returning parent and target + * @name: path name + * @path: path to store parent in + * + * The path @name should end with a normal component, not "." or ".." or "/". + * A lookup is performed and if successful the parent information + * is store in @parent and the dentry is returned. + * + * The dentry maybe negative, the parent will be positive. + * + * Returns: dentry or error. + */ +struct dentry *kern_path_parent(const char *name, struct path *path) +{ + struct path parent_path __free(path_put) = {}; + struct filename *filename __free(putname) = getname_kernel(name); + struct dentry *d; + struct qstr last; + int type, error; + + error = filename_parentat(AT_FDCWD, filename, 0, &parent_path, &last, &type); + if (error) + return ERR_PTR(error); + if (unlikely(type != LAST_NORM)) + return ERR_PTR(-EINVAL); + + d = lookup_noperm_unlocked(&last, parent_path.dentry); + if (IS_ERR(d)) return d; - } - *path = nd.path; + path->dentry = no_free_ptr(parent_path.dentry); + path->mnt = no_free_ptr(parent_path.mnt); return d; } -int kern_path(const char *name, unsigned int flags, struct path *path) +struct dentry *start_removing_path(const char *name, struct path *path) { - struct nameidata nd; - int res = do_path_lookup(AT_FDCWD, name, flags, &nd); - if (!res) - *path = nd.path; + struct filename *filename = getname_kernel(name); + struct dentry *res = __start_removing_path(AT_FDCWD, filename, path); + + putname(filename); + return res; +} + +struct dentry *start_removing_user_path_at(int dfd, + const char __user *name, + struct path *path) +{ + struct filename *filename = getname(name); + struct dentry *res = __start_removing_path(dfd, filename, path); + + putname(filename); return res; } +EXPORT_SYMBOL(start_removing_user_path_at); + +int kern_path(const char *name, unsigned int flags, struct path *path) +{ + struct filename *filename = getname_kernel(name); + int ret = filename_lookup(AT_FDCWD, filename, flags, path, NULL); + + putname(filename); + return ret; + +} +EXPORT_SYMBOL(kern_path); + +/** + * vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair + * @filename: filename structure + * @flags: lookup flags + * @parent: pointer to struct path to fill + * @last: last component + * @type: type of the last component + * @root: pointer to struct path of the base directory + */ +int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, + struct path *parent, struct qstr *last, int *type, + const struct path *root) +{ + return __filename_parentat(AT_FDCWD, filename, flags, parent, last, + type, root); +} +EXPORT_SYMBOL(vfs_path_parent_lookup); /** * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair @@ -2058,148 +3022,567 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, const char *name, unsigned int flags, struct path *path) { - struct nameidata nd; + struct filename *filename; + struct path root = {.mnt = mnt, .dentry = dentry}; + int ret; + + filename = getname_kernel(name); + /* the first argument of filename_lookup() is ignored with root */ + ret = filename_lookup(AT_FDCWD, filename, flags, path, &root); + putname(filename); + return ret; +} +EXPORT_SYMBOL(vfs_path_lookup); + +int lookup_noperm_common(struct qstr *qname, struct dentry *base) +{ + const char *name = qname->name; + u32 len = qname->len; + + qname->hash = full_name_hash(base, name, len); + if (!len) + return -EACCES; + + if (is_dot_dotdot(name, len)) + return -EACCES; + + while (len--) { + unsigned int c = *(const unsigned char *)name++; + if (c == '/' || c == '\0') + return -EACCES; + } + /* + * See if the low-level filesystem might want + * to use its own hash.. + */ + if (base->d_flags & DCACHE_OP_HASH) { + int err = base->d_op->d_hash(base, qname); + if (err < 0) + return err; + } + return 0; +} + +static int lookup_one_common(struct mnt_idmap *idmap, + struct qstr *qname, struct dentry *base) +{ int err; - nd.root.dentry = dentry; - nd.root.mnt = mnt; - BUG_ON(flags & LOOKUP_PARENT); - /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */ - err = do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, &nd); - if (!err) - *path = nd.path; - return err; + err = lookup_noperm_common(qname, base); + if (err < 0) + return err; + return inode_permission(idmap, base->d_inode, MAY_EXEC); } -/* - * Restricted form of lookup. Doesn't follow links, single-component only, - * needs parent already locked. Doesn't follow mounts. - * SMP-safe. +/** + * try_lookup_noperm - filesystem helper to lookup single pathname component + * @name: qstr storing pathname component to lookup + * @base: base directory to lookup from + * + * Look up a dentry by name in the dcache, returning NULL if it does not + * currently exist. The function does not try to create a dentry and if one + * is found it doesn't try to revalidate it. + * + * Note that this routine is purely a helper for filesystem usage and should + * not be called by generic code. It does no permission checking. + * + * No locks need be held - only a counted reference to @base is needed. + * */ -static struct dentry *lookup_hash(struct nameidata *nd) +struct dentry *try_lookup_noperm(struct qstr *name, struct dentry *base) { - return __lookup_hash(&nd->last, nd->path.dentry, nd->flags); + int err; + + err = lookup_noperm_common(name, base); + if (err) + return ERR_PTR(err); + + return d_lookup(base, name); } +EXPORT_SYMBOL(try_lookup_noperm); /** - * lookup_one_len - filesystem helper to lookup single pathname component - * @name: pathname component to lookup + * lookup_noperm - filesystem helper to lookup single pathname component + * @name: qstr storing pathname component to lookup * @base: base directory to lookup from - * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should - * not be called by generic code. Also note that by using this function the - * nameidata argument is passed to the filesystem methods and a filesystem - * using this helper needs to be prepared for that. + * not be called by generic code. It does no permission checking. + * + * The caller must hold base->i_rwsem. */ -struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) +struct dentry *lookup_noperm(struct qstr *name, struct dentry *base) { - struct qstr this; - unsigned int c; + struct dentry *dentry; int err; - WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - this.name = name; - this.len = len; - this.hash = full_name_hash(name, len); - if (!len) - return ERR_PTR(-EACCES); + err = lookup_noperm_common(name, base); + if (err) + return ERR_PTR(err); + + dentry = lookup_dcache(name, base, 0); + return dentry ? dentry : __lookup_slow(name, base, 0); +} +EXPORT_SYMBOL(lookup_noperm); + +/** + * lookup_one - lookup single pathname component + * @idmap: idmap of the mount the lookup is performed from + * @name: qstr holding pathname component to lookup + * @base: base directory to lookup from + * + * This can be used for in-kernel filesystem clients such as file servers. + * + * The caller must hold base->i_rwsem. + */ +struct dentry *lookup_one(struct mnt_idmap *idmap, struct qstr *name, + struct dentry *base) +{ + struct dentry *dentry; + int err; + + WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - if (unlikely(name[0] == '.')) { - if (len < 2 || (len == 2 && name[1] == '.')) - return ERR_PTR(-EACCES); + err = lookup_one_common(idmap, name, base); + if (err) + return ERR_PTR(err); + + dentry = lookup_dcache(name, base, 0); + return dentry ? dentry : __lookup_slow(name, base, 0); +} +EXPORT_SYMBOL(lookup_one); + +/** + * lookup_one_unlocked - lookup single pathname component + * @idmap: idmap of the mount the lookup is performed from + * @name: qstr olding pathname component to lookup + * @base: base directory to lookup from + * + * This can be used for in-kernel filesystem clients such as file servers. + * + * Unlike lookup_one, it should be called without the parent + * i_rwsem held, and will take the i_rwsem itself if necessary. + */ +struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct qstr *name, + struct dentry *base) +{ + int err; + struct dentry *ret; + + err = lookup_one_common(idmap, name, base); + if (err) + return ERR_PTR(err); + + ret = lookup_dcache(name, base, 0); + if (!ret) + ret = lookup_slow(name, base, 0); + return ret; +} +EXPORT_SYMBOL(lookup_one_unlocked); + +/** + * lookup_one_positive_killable - lookup single pathname component + * @idmap: idmap of the mount the lookup is performed from + * @name: qstr olding pathname component to lookup + * @base: base directory to lookup from + * + * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns + * known positive or ERR_PTR(). This is what most of the users want. + * + * Note that pinned negative with unlocked parent _can_ become positive at any + * time, so callers of lookup_one_unlocked() need to be very careful; pinned + * positives have >d_inode stable, so this one avoids such problems. + * + * This can be used for in-kernel filesystem clients such as file servers. + * + * It should be called without the parent i_rwsem held, and will take + * the i_rwsem itself if necessary. If a fatal signal is pending or + * delivered, it will return %-EINTR if the lock is needed. + */ +struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap, + struct qstr *name, + struct dentry *base) +{ + int err; + struct dentry *ret; + + err = lookup_one_common(idmap, name, base); + if (err) + return ERR_PTR(err); + + ret = lookup_dcache(name, base, 0); + if (!ret) + ret = lookup_slow_killable(name, base, 0); + if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { + dput(ret); + ret = ERR_PTR(-ENOENT); } + return ret; +} +EXPORT_SYMBOL(lookup_one_positive_killable); - while (len--) { - c = *(const unsigned char *)name++; - if (c == '/' || c == '\0') - return ERR_PTR(-EACCES); +/** + * lookup_one_positive_unlocked - lookup single pathname component + * @idmap: idmap of the mount the lookup is performed from + * @name: qstr holding pathname component to lookup + * @base: base directory to lookup from + * + * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns + * known positive or ERR_PTR(). This is what most of the users want. + * + * Note that pinned negative with unlocked parent _can_ become positive at any + * time, so callers of lookup_one_unlocked() need to be very careful; pinned + * positives have >d_inode stable, so this one avoids such problems. + * + * This can be used for in-kernel filesystem clients such as file servers. + * + * The helper should be called without i_rwsem held. + */ +struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, + struct qstr *name, + struct dentry *base) +{ + struct dentry *ret = lookup_one_unlocked(idmap, name, base); + + if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { + dput(ret); + ret = ERR_PTR(-ENOENT); } - /* - * See if the low-level filesystem might want - * to use its own hash.. - */ - if (base->d_flags & DCACHE_OP_HASH) { - int err = base->d_op->d_hash(base, &this); - if (err < 0) - return ERR_PTR(err); + return ret; +} +EXPORT_SYMBOL(lookup_one_positive_unlocked); + +/** + * lookup_noperm_unlocked - filesystem helper to lookup single pathname component + * @name: pathname component to lookup + * @base: base directory to lookup from + * + * Note that this routine is purely a helper for filesystem usage and should + * not be called by generic code. It does no permission checking. + * + * Unlike lookup_noperm(), it should be called without the parent + * i_rwsem held, and will take the i_rwsem itself if necessary. + * + * Unlike try_lookup_noperm() it *does* revalidate the dentry if it already + * existed. + */ +struct dentry *lookup_noperm_unlocked(struct qstr *name, struct dentry *base) +{ + struct dentry *ret; + int err; + + err = lookup_noperm_common(name, base); + if (err) + return ERR_PTR(err); + + ret = lookup_dcache(name, base, 0); + if (!ret) + ret = lookup_slow(name, base, 0); + return ret; +} +EXPORT_SYMBOL(lookup_noperm_unlocked); + +/* + * Like lookup_noperm_unlocked(), except that it yields ERR_PTR(-ENOENT) + * on negatives. Returns known positive or ERR_PTR(); that's what + * most of the users want. Note that pinned negative with unlocked parent + * _can_ become positive at any time, so callers of lookup_noperm_unlocked() + * need to be very careful; pinned positives have ->d_inode stable, so + * this one avoids such problems. + */ +struct dentry *lookup_noperm_positive_unlocked(struct qstr *name, + struct dentry *base) +{ + struct dentry *ret; + + ret = lookup_noperm_unlocked(name, base); + if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { + dput(ret); + ret = ERR_PTR(-ENOENT); } + return ret; +} +EXPORT_SYMBOL(lookup_noperm_positive_unlocked); + +/** + * start_creating - prepare to create a given name with permission checking + * @idmap: idmap of the mount + * @parent: directory in which to prepare to create the name + * @name: the name to be created + * + * Locks are taken and a lookup is performed prior to creating + * an object in a directory. Permission checking (MAY_EXEC) is performed + * against @idmap. + * + * If the name already exists, a positive dentry is returned, so + * behaviour is similar to O_CREAT without O_EXCL, which doesn't fail + * with -EEXIST. + * + * Returns: a negative or positive dentry, or an error. + */ +struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent, + struct qstr *name) +{ + int err = lookup_one_common(idmap, name, parent); - err = inode_permission(base->d_inode, MAY_EXEC); if (err) return ERR_PTR(err); + return start_dirop(parent, name, LOOKUP_CREATE); +} +EXPORT_SYMBOL(start_creating); + +/** + * start_removing - prepare to remove a given name with permission checking + * @idmap: idmap of the mount + * @parent: directory in which to find the name + * @name: the name to be removed + * + * Locks are taken and a lookup in performed prior to removing + * an object from a directory. Permission checking (MAY_EXEC) is performed + * against @idmap. + * + * If the name doesn't exist, an error is returned. + * + * end_removing() should be called when removal is complete, or aborted. + * + * Returns: a positive dentry, or an error. + */ +struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent, + struct qstr *name) +{ + int err = lookup_one_common(idmap, name, parent); - return __lookup_hash(&this, base, 0); + if (err) + return ERR_PTR(err); + return start_dirop(parent, name, 0); } +EXPORT_SYMBOL(start_removing); -int user_path_at_empty(int dfd, const char __user *name, unsigned flags, - struct path *path, int *empty) +/** + * start_creating_killable - prepare to create a given name with permission checking + * @idmap: idmap of the mount + * @parent: directory in which to prepare to create the name + * @name: the name to be created + * + * Locks are taken and a lookup in performed prior to creating + * an object in a directory. Permission checking (MAY_EXEC) is performed + * against @idmap. + * + * If the name already exists, a positive dentry is returned. + * + * If a signal is received or was already pending, the function aborts + * with -EINTR; + * + * Returns: a negative or positive dentry, or an error. + */ +struct dentry *start_creating_killable(struct mnt_idmap *idmap, + struct dentry *parent, + struct qstr *name) { - struct nameidata nd; - struct filename *tmp = getname_flags(name, flags, empty); - int err = PTR_ERR(tmp); - if (!IS_ERR(tmp)) { + int err = lookup_one_common(idmap, name, parent); - BUG_ON(flags & LOOKUP_PARENT); + if (err) + return ERR_PTR(err); + return __start_dirop(parent, name, LOOKUP_CREATE, TASK_KILLABLE); +} +EXPORT_SYMBOL(start_creating_killable); - err = filename_lookup(dfd, tmp, flags, &nd); - putname(tmp); - if (!err) - *path = nd.path; - } - return err; +/** + * start_removing_killable - prepare to remove a given name with permission checking + * @idmap: idmap of the mount + * @parent: directory in which to find the name + * @name: the name to be removed + * + * Locks are taken and a lookup in performed prior to removing + * an object from a directory. Permission checking (MAY_EXEC) is performed + * against @idmap. + * + * If the name doesn't exist, an error is returned. + * + * end_removing() should be called when removal is complete, or aborted. + * + * If a signal is received or was already pending, the function aborts + * with -EINTR; + * + * Returns: a positive dentry, or an error. + */ +struct dentry *start_removing_killable(struct mnt_idmap *idmap, + struct dentry *parent, + struct qstr *name) +{ + int err = lookup_one_common(idmap, name, parent); + + if (err) + return ERR_PTR(err); + return __start_dirop(parent, name, 0, TASK_KILLABLE); } +EXPORT_SYMBOL(start_removing_killable); -int user_path_at(int dfd, const char __user *name, unsigned flags, - struct path *path) +/** + * start_creating_noperm - prepare to create a given name without permission checking + * @parent: directory in which to prepare to create the name + * @name: the name to be created + * + * Locks are taken and a lookup in performed prior to creating + * an object in a directory. + * + * If the name already exists, a positive dentry is returned. + * + * Returns: a negative or positive dentry, or an error. + */ +struct dentry *start_creating_noperm(struct dentry *parent, + struct qstr *name) { - return user_path_at_empty(dfd, name, flags, path, NULL); + int err = lookup_noperm_common(name, parent); + + if (err) + return ERR_PTR(err); + return start_dirop(parent, name, LOOKUP_CREATE); } +EXPORT_SYMBOL(start_creating_noperm); -/* - * NB: most callers don't do anything directly with the reference to the - * to struct filename, but the nd->last pointer points into the name string - * allocated by getname. So we must hold the reference to it until all - * path-walking is complete. +/** + * start_removing_noperm - prepare to remove a given name without permission checking + * @parent: directory in which to find the name + * @name: the name to be removed + * + * Locks are taken and a lookup in performed prior to removing + * an object from a directory. + * + * If the name doesn't exist, an error is returned. + * + * end_removing() should be called when removal is complete, or aborted. + * + * Returns: a positive dentry, or an error. */ -static struct filename * -user_path_parent(int dfd, const char __user *path, struct nameidata *nd, - unsigned int flags) +struct dentry *start_removing_noperm(struct dentry *parent, + struct qstr *name) { - struct filename *s = getname(path); - int error; + int err = lookup_noperm_common(name, parent); - /* only LOOKUP_REVAL is allowed in extra flags */ - flags &= LOOKUP_REVAL; + if (err) + return ERR_PTR(err); + return start_dirop(parent, name, 0); +} +EXPORT_SYMBOL(start_removing_noperm); - if (IS_ERR(s)) - return s; +/** + * start_creating_dentry - prepare to create a given dentry + * @parent: directory from which dentry should be removed + * @child: the dentry to be removed + * + * A lock is taken to protect the dentry again other dirops and + * the validity of the dentry is checked: correct parent and still hashed. + * + * If the dentry is valid and negative a reference is taken and + * returned. If not an error is returned. + * + * end_creating() should be called when creation is complete, or aborted. + * + * Returns: the valid dentry, or an error. + */ +struct dentry *start_creating_dentry(struct dentry *parent, + struct dentry *child) +{ + inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); + if (unlikely(IS_DEADDIR(parent->d_inode) || + child->d_parent != parent || + d_unhashed(child))) { + inode_unlock(parent->d_inode); + return ERR_PTR(-EINVAL); + } + if (d_is_positive(child)) { + inode_unlock(parent->d_inode); + return ERR_PTR(-EEXIST); + } + return dget(child); +} +EXPORT_SYMBOL(start_creating_dentry); - error = filename_lookup(dfd, s, flags | LOOKUP_PARENT, nd); - if (error) { - putname(s); - return ERR_PTR(error); +/** + * start_removing_dentry - prepare to remove a given dentry + * @parent: directory from which dentry should be removed + * @child: the dentry to be removed + * + * A lock is taken to protect the dentry again other dirops and + * the validity of the dentry is checked: correct parent and still hashed. + * + * If the dentry is valid and positive, a reference is taken and + * returned. If not an error is returned. + * + * end_removing() should be called when removal is complete, or aborted. + * + * Returns: the valid dentry, or an error. + */ +struct dentry *start_removing_dentry(struct dentry *parent, + struct dentry *child) +{ + inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); + if (unlikely(IS_DEADDIR(parent->d_inode) || + child->d_parent != parent || + d_unhashed(child))) { + inode_unlock(parent->d_inode); + return ERR_PTR(-EINVAL); + } + if (d_is_negative(child)) { + inode_unlock(parent->d_inode); + return ERR_PTR(-ENOENT); + } + return dget(child); +} +EXPORT_SYMBOL(start_removing_dentry); + +#ifdef CONFIG_UNIX98_PTYS +int path_pts(struct path *path) +{ + /* Find something mounted on "pts" in the same directory as + * the input path. + */ + struct dentry *parent = dget_parent(path->dentry); + struct dentry *child; + struct qstr this = QSTR_INIT("pts", 3); + + if (unlikely(!path_connected(path->mnt, parent))) { + dput(parent); + return -ENOENT; } + dput(path->dentry); + path->dentry = parent; + child = d_hash_and_lookup(parent, &this); + if (IS_ERR_OR_NULL(child)) + return -ENOENT; - return s; + path->dentry = child; + dput(parent); + follow_down(path, 0); + return 0; } +#endif -/* - * It's inline, so penalty for filesystems that don't use sticky bit is - * minimal. - */ -static inline int check_sticky(struct inode *dir, struct inode *inode) +int user_path_at(int dfd, const char __user *name, unsigned flags, + struct path *path) +{ + struct filename *filename = getname_flags(name, flags); + int ret = filename_lookup(dfd, filename, flags, path, NULL); + + putname(filename); + return ret; +} +EXPORT_SYMBOL(user_path_at); + +int __check_sticky(struct mnt_idmap *idmap, struct inode *dir, + struct inode *inode) { kuid_t fsuid = current_fsuid(); - if (!(dir->i_mode & S_ISVTX)) - return 0; - if (uid_eq(inode->i_uid, fsuid)) + if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), fsuid)) return 0; - if (uid_eq(dir->i_uid, fsuid)) + if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, dir), fsuid)) return 0; - return !inode_capable(inode, CAP_FOWNER); + return !capable_wrt_inode_uidgid(idmap, inode, CAP_FOWNER); } +EXPORT_SYMBOL(__check_sticky); /* * Check whether we can remove a link victim from directory dir, check @@ -2214,36 +3597,48 @@ static inline int check_sticky(struct inode *dir, struct inode *inode) * c. have CAP_FOWNER capability * 6. If the victim is append-only or immutable we can't do antyhing with * links pointing to it. - * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. - * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. - * 9. We can't remove a root or mountpoint. - * 10. We don't allow removal of NFS sillyrenamed files; it's handled by + * 7. If the victim has an unknown uid or gid we can't change the inode. + * 8. If we were asked to remove a directory and victim isn't one - ENOTDIR. + * 9. If we were asked to remove a non-directory and victim isn't one - EISDIR. + * 10. We can't remove a root or mountpoint. + * 11. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ -static int may_delete(struct inode *dir,struct dentry *victim,int isdir) +static int may_delete(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *victim, bool isdir) { + struct inode *inode = d_backing_inode(victim); int error; - if (!victim->d_inode) + if (d_is_negative(victim)) return -ENOENT; + BUG_ON(!inode); BUG_ON(victim->d_parent->d_inode != dir); + + /* Inode writeback is not safe when the uid or gid are invalid. */ + if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || + !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) + return -EOVERFLOW; + audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); - error = inode_permission(dir, MAY_WRITE | MAY_EXEC); + error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) return -EPERM; - if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)|| - IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) + + if (check_sticky(idmap, dir, inode) || IS_APPEND(inode) || + IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || + HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; if (isdir) { - if (!S_ISDIR(victim->d_inode->i_mode)) + if (!d_is_dir(victim)) return -ENOTDIR; if (IS_ROOT(victim)) return -EBUSY; - } else if (S_ISDIR(victim->d_inode->i_mode)) + } else if (d_is_dir(victim)) return -EISDIR; if (IS_DEADDIR(dir)) return -ENOENT; @@ -2257,16 +3652,55 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir) * 1. We can't do it if child already exists (open has special treatment for * this case, but since we are inlined it's OK) * 2. We can't do it if dir is read-only (done in permission()) - * 3. We should have write and exec permissions on dir - * 4. We can't do it if dir is immutable (done in permission()) + * 3. We can't do it if the fs can't represent the fsuid or fsgid. + * 4. We should have write and exec permissions on dir + * 5. We can't do it if dir is immutable (done in permission()) */ -static inline int may_create(struct inode *dir, struct dentry *child) +static inline int may_create(struct mnt_idmap *idmap, + struct inode *dir, struct dentry *child) { + audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); if (child->d_inode) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - return inode_permission(dir, MAY_WRITE | MAY_EXEC); + if (!fsuidgid_has_mapping(dir->i_sb, idmap)) + return -EOVERFLOW; + + return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); +} + +// p1 != p2, both are on the same filesystem, ->s_vfs_rename_mutex is held +static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2) +{ + struct dentry *p = p1, *q = p2, *r; + + while ((r = p->d_parent) != p2 && r != p) + p = r; + if (r == p2) { + // p is a child of p2 and an ancestor of p1 or p1 itself + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT2); + return p; + } + // p is the root of connected component that contains p1 + // p2 does not occur on the path from p to p1 + while ((r = q->d_parent) != p1 && r != p && r != q) + q = r; + if (r == p1) { + // q is a child of p1 and an ancestor of p2 or p2 itself + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); + return q; + } else if (likely(r == p)) { + // both p2 and p1 are descendents of p + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); + return NULL; + } else { // no common ancestor at the time we'd been called + mutex_unlock(&p1->d_sb->s_vfs_rename_mutex); + return ERR_PTR(-EXDEV); + } } /* @@ -2274,73 +3708,468 @@ static inline int may_create(struct inode *dir, struct dentry *child) */ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) { - struct dentry *p; - if (p1 == p2) { - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); return NULL; } - mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex); + mutex_lock(&p1->d_sb->s_vfs_rename_mutex); + return lock_two_directories(p1, p2); +} +EXPORT_SYMBOL(lock_rename); - p = d_ancestor(p2, p1); - if (p) { - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD); - return p; - } +/* + * c1 and p2 should be on the same fs. + */ +struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2) +{ + if (READ_ONCE(c1->d_parent) == p2) { + /* + * hopefully won't need to touch ->s_vfs_rename_mutex at all. + */ + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); + /* + * now that p2 is locked, nobody can move in or out of it, + * so the test below is safe. + */ + if (likely(c1->d_parent == p2)) + return NULL; - p = d_ancestor(p1, p2); - if (p) { - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); - return p; + /* + * c1 got moved out of p2 while we'd been taking locks; + * unlock and fall back to slow case. + */ + inode_unlock(p2->d_inode); } - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); + mutex_lock(&c1->d_sb->s_vfs_rename_mutex); + /* + * nobody can move out of any directories on this fs. + */ + if (likely(c1->d_parent != p2)) + return lock_two_directories(c1->d_parent, p2); + + /* + * c1 got moved into p2 while we were taking locks; + * we need p2 locked and ->s_vfs_rename_mutex unlocked, + * for consistency with lock_rename(). + */ + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); + mutex_unlock(&c1->d_sb->s_vfs_rename_mutex); return NULL; } +EXPORT_SYMBOL(lock_rename_child); void unlock_rename(struct dentry *p1, struct dentry *p2) { - mutex_unlock(&p1->d_inode->i_mutex); + inode_unlock(p1->d_inode); if (p1 != p2) { - mutex_unlock(&p2->d_inode->i_mutex); - mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex); + inode_unlock(p2->d_inode); + mutex_unlock(&p1->d_sb->s_vfs_rename_mutex); + } +} +EXPORT_SYMBOL(unlock_rename); + +/** + * __start_renaming - lookup and lock names for rename + * @rd: rename data containing parents and flags, and + * for receiving found dentries + * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL, + * LOOKUP_NO_SYMLINKS etc). + * @old_last: name of object in @rd.old_parent + * @new_last: name of object in @rd.new_parent + * + * Look up two names and ensure locks are in place for + * rename. + * + * On success the found dentries are stored in @rd.old_dentry, + * @rd.new_dentry and an extra ref is taken on @rd.old_parent. + * These references and the lock are dropped by end_renaming(). + * + * The passed in qstrs must have the hash calculated, and no permission + * checking is performed. + * + * Returns: zero or an error. + */ +static int +__start_renaming(struct renamedata *rd, int lookup_flags, + struct qstr *old_last, struct qstr *new_last) +{ + struct dentry *trap; + struct dentry *d1, *d2; + int target_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE; + int err; + + if (rd->flags & RENAME_EXCHANGE) + target_flags = 0; + if (rd->flags & RENAME_NOREPLACE) + target_flags |= LOOKUP_EXCL; + + trap = lock_rename(rd->old_parent, rd->new_parent); + if (IS_ERR(trap)) + return PTR_ERR(trap); + + d1 = lookup_one_qstr_excl(old_last, rd->old_parent, + lookup_flags); + err = PTR_ERR(d1); + if (IS_ERR(d1)) + goto out_unlock; + + d2 = lookup_one_qstr_excl(new_last, rd->new_parent, + lookup_flags | target_flags); + err = PTR_ERR(d2); + if (IS_ERR(d2)) + goto out_dput_d1; + + if (d1 == trap) { + /* source is an ancestor of target */ + err = -EINVAL; + goto out_dput_d2; + } + + if (d2 == trap) { + /* target is an ancestor of source */ + if (rd->flags & RENAME_EXCHANGE) + err = -EINVAL; + else + err = -ENOTEMPTY; + goto out_dput_d2; + } + + rd->old_dentry = d1; + rd->new_dentry = d2; + dget(rd->old_parent); + return 0; + +out_dput_d2: + dput(d2); +out_dput_d1: + dput(d1); +out_unlock: + unlock_rename(rd->old_parent, rd->new_parent); + return err; +} + +/** + * start_renaming - lookup and lock names for rename with permission checking + * @rd: rename data containing parents and flags, and + * for receiving found dentries + * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL, + * LOOKUP_NO_SYMLINKS etc). + * @old_last: name of object in @rd.old_parent + * @new_last: name of object in @rd.new_parent + * + * Look up two names and ensure locks are in place for + * rename. + * + * On success the found dentries are stored in @rd.old_dentry, + * @rd.new_dentry. Also the refcount on @rd->old_parent is increased. + * These references and the lock are dropped by end_renaming(). + * + * The passed in qstrs need not have the hash calculated, and basic + * eXecute permission checking is performed against @rd.mnt_idmap. + * + * Returns: zero or an error. + */ +int start_renaming(struct renamedata *rd, int lookup_flags, + struct qstr *old_last, struct qstr *new_last) +{ + int err; + + err = lookup_one_common(rd->mnt_idmap, old_last, rd->old_parent); + if (err) + return err; + err = lookup_one_common(rd->mnt_idmap, new_last, rd->new_parent); + if (err) + return err; + return __start_renaming(rd, lookup_flags, old_last, new_last); +} +EXPORT_SYMBOL(start_renaming); + +static int +__start_renaming_dentry(struct renamedata *rd, int lookup_flags, + struct dentry *old_dentry, struct qstr *new_last) +{ + struct dentry *trap; + struct dentry *d2; + int target_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE; + int err; + + if (rd->flags & RENAME_EXCHANGE) + target_flags = 0; + if (rd->flags & RENAME_NOREPLACE) + target_flags |= LOOKUP_EXCL; + + /* Already have the dentry - need to be sure to lock the correct parent */ + trap = lock_rename_child(old_dentry, rd->new_parent); + if (IS_ERR(trap)) + return PTR_ERR(trap); + if (d_unhashed(old_dentry) || + (rd->old_parent && rd->old_parent != old_dentry->d_parent)) { + /* dentry was removed, or moved and explicit parent requested */ + err = -EINVAL; + goto out_unlock; + } + + d2 = lookup_one_qstr_excl(new_last, rd->new_parent, + lookup_flags | target_flags); + err = PTR_ERR(d2); + if (IS_ERR(d2)) + goto out_unlock; + + if (old_dentry == trap) { + /* source is an ancestor of target */ + err = -EINVAL; + goto out_dput_d2; + } + + if (d2 == trap) { + /* target is an ancestor of source */ + if (rd->flags & RENAME_EXCHANGE) + err = -EINVAL; + else + err = -ENOTEMPTY; + goto out_dput_d2; + } + + rd->old_dentry = dget(old_dentry); + rd->new_dentry = d2; + rd->old_parent = dget(old_dentry->d_parent); + return 0; + +out_dput_d2: + dput(d2); +out_unlock: + unlock_rename(old_dentry->d_parent, rd->new_parent); + return err; +} + +/** + * start_renaming_dentry - lookup and lock name for rename with permission checking + * @rd: rename data containing parents and flags, and + * for receiving found dentries + * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL, + * LOOKUP_NO_SYMLINKS etc). + * @old_dentry: dentry of name to move + * @new_last: name of target in @rd.new_parent + * + * Look up target name and ensure locks are in place for + * rename. + * + * On success the found dentry is stored in @rd.new_dentry and + * @rd.old_parent is confirmed to be the parent of @old_dentry. If it + * was originally %NULL, it is set. In either case a reference is taken + * so that end_renaming() can have a stable reference to unlock. + * + * References and the lock can be dropped with end_renaming() + * + * The passed in qstr need not have the hash calculated, and basic + * eXecute permission checking is performed against @rd.mnt_idmap. + * + * Returns: zero or an error. + */ +int start_renaming_dentry(struct renamedata *rd, int lookup_flags, + struct dentry *old_dentry, struct qstr *new_last) +{ + int err; + + err = lookup_one_common(rd->mnt_idmap, new_last, rd->new_parent); + if (err) + return err; + return __start_renaming_dentry(rd, lookup_flags, old_dentry, new_last); +} +EXPORT_SYMBOL(start_renaming_dentry); + +/** + * start_renaming_two_dentries - Lock to dentries in given parents for rename + * @rd: rename data containing parent + * @old_dentry: dentry of name to move + * @new_dentry: dentry to move to + * + * Ensure locks are in place for rename and check parentage is still correct. + * + * On success the two dentries are stored in @rd.old_dentry and + * @rd.new_dentry and @rd.old_parent and @rd.new_parent are confirmed to + * be the parents of the dentries. + * + * References and the lock can be dropped with end_renaming() + * + * Returns: zero or an error. + */ +int +start_renaming_two_dentries(struct renamedata *rd, + struct dentry *old_dentry, struct dentry *new_dentry) +{ + struct dentry *trap; + int err; + + /* Already have the dentry - need to be sure to lock the correct parent */ + trap = lock_rename_child(old_dentry, rd->new_parent); + if (IS_ERR(trap)) + return PTR_ERR(trap); + err = -EINVAL; + if (d_unhashed(old_dentry) || + (rd->old_parent && rd->old_parent != old_dentry->d_parent)) + /* old_dentry was removed, or moved and explicit parent requested */ + goto out_unlock; + if (d_unhashed(new_dentry) || + rd->new_parent != new_dentry->d_parent) + /* new_dentry was removed or moved */ + goto out_unlock; + + if (old_dentry == trap) + /* source is an ancestor of target */ + goto out_unlock; + + if (new_dentry == trap) { + /* target is an ancestor of source */ + if (rd->flags & RENAME_EXCHANGE) + err = -EINVAL; + else + err = -ENOTEMPTY; + goto out_unlock; } + + err = -EEXIST; + if (d_is_positive(new_dentry) && (rd->flags & RENAME_NOREPLACE)) + goto out_unlock; + + rd->old_dentry = dget(old_dentry); + rd->new_dentry = dget(new_dentry); + rd->old_parent = dget(old_dentry->d_parent); + return 0; + +out_unlock: + unlock_rename(old_dentry->d_parent, rd->new_parent); + return err; +} +EXPORT_SYMBOL(start_renaming_two_dentries); + +void end_renaming(struct renamedata *rd) +{ + unlock_rename(rd->old_parent, rd->new_parent); + dput(rd->old_dentry); + dput(rd->new_dentry); + dput(rd->old_parent); } +EXPORT_SYMBOL(end_renaming); -int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool want_excl) +/** + * vfs_prepare_mode - prepare the mode to be used for a new inode + * @idmap: idmap of the mount the inode was found from + * @dir: parent directory of the new inode + * @mode: mode of the new inode + * @mask_perms: allowed permission by the vfs + * @type: type of file to be created + * + * This helper consolidates and enforces vfs restrictions on the @mode of a new + * object to be created. + * + * Umask stripping depends on whether the filesystem supports POSIX ACLs (see + * the kernel documentation for mode_strip_umask()). Moving umask stripping + * after setgid stripping allows the same ordering for both non-POSIX ACL and + * POSIX ACL supporting filesystems. + * + * Note that it's currently valid for @type to be 0 if a directory is created. + * Filesystems raise that flag individually and we need to check whether each + * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a + * non-zero type. + * + * Returns: mode to be passed to the filesystem + */ +static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, + const struct inode *dir, umode_t mode, + umode_t mask_perms, umode_t type) { - int error = may_create(dir, dentry); + mode = mode_strip_sgid(idmap, dir, mode); + mode = mode_strip_umask(dir, mode); + + /* + * Apply the vfs mandated allowed permission mask and set the type of + * file to be created before we call into the filesystem. + */ + mode &= (mask_perms & ~S_IFMT); + mode |= (type & S_IFMT); + + return mode; +} + +/** + * vfs_create - create new file + * @idmap: idmap of the mount the inode was found from + * @dentry: dentry of the child file + * @mode: mode of the child file + * @di: returns parent inode, if the inode is delegated. + * + * Create a new file. + * + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply pass @nop_mnt_idmap. + */ +int vfs_create(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode, + struct delegated_inode *di) +{ + struct inode *dir = d_inode(dentry->d_parent); + int error; + + error = may_create(idmap, dir, dentry); if (error) return error; if (!dir->i_op->create) return -EACCES; /* shouldn't it be ENOSYS? */ + + mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG); + error = security_inode_create(dir, dentry, mode); + if (error) + return error; + error = try_break_deleg(dir, di); + if (error) + return error; + error = dir->i_op->create(idmap, dir, dentry, mode, true); + if (!error) + fsnotify_create(dir, dentry); + return error; +} +EXPORT_SYMBOL(vfs_create); + +int vfs_mkobj(struct dentry *dentry, umode_t mode, + int (*f)(struct dentry *, umode_t, void *), + void *arg) +{ + struct inode *dir = dentry->d_parent->d_inode; + int error = may_create(&nop_mnt_idmap, dir, dentry); + if (error) + return error; + mode &= S_IALLUGO; mode |= S_IFREG; error = security_inode_create(dir, dentry, mode); if (error) return error; - error = dir->i_op->create(dir, dentry, mode, want_excl); + error = f(dentry, mode, arg); if (!error) fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_mkobj); + +bool may_open_dev(const struct path *path) +{ + return !(path->mnt->mnt_flags & MNT_NODEV) && + !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); +} -static int may_open(struct path *path, int acc_mode, int flag) +static int may_open(struct mnt_idmap *idmap, const struct path *path, + int acc_mode, int flag) { struct dentry *dentry = path->dentry; struct inode *inode = dentry->d_inode; int error; - /* O_PATH? */ - if (!acc_mode) - return 0; - if (!inode) return -ENOENT; @@ -2350,19 +4179,29 @@ static int may_open(struct path *path, int acc_mode, int flag) case S_IFDIR: if (acc_mode & MAY_WRITE) return -EISDIR; + if (acc_mode & MAY_EXEC) + return -EACCES; break; case S_IFBLK: case S_IFCHR: - if (path->mnt->mnt_flags & MNT_NODEV) + if (!may_open_dev(path)) return -EACCES; - /*FALLTHRU*/ + fallthrough; case S_IFIFO: case S_IFSOCK: + if (acc_mode & MAY_EXEC) + return -EACCES; flag &= ~O_TRUNC; break; + case S_IFREG: + if ((acc_mode & MAY_EXEC) && path_noexec(path)) + return -EACCES; + break; + default: + VFS_BUG_ON_INODE(!IS_ANON_FILE(inode), inode); } - error = inode_permission(inode, acc_mode); + error = inode_permission(idmap, inode, MAY_OPEN | acc_mode); if (error) return error; @@ -2377,27 +4216,23 @@ static int may_open(struct path *path, int acc_mode, int flag) } /* O_NOATIME can only be set by the owner or superuser */ - if (flag & O_NOATIME && !inode_owner_or_capable(inode)) + if (flag & O_NOATIME && !inode_owner_or_capable(idmap, inode)) return -EPERM; return 0; } -static int handle_truncate(struct file *filp) +static int handle_truncate(struct mnt_idmap *idmap, struct file *filp) { - struct path *path = &filp->f_path; + const struct path *path = &filp->f_path; struct inode *inode = path->dentry->d_inode; int error = get_write_access(inode); if (error) return error; - /* - * Refuse to truncate files with mandatory locks held on them. - */ - error = locks_verify_locked(inode); - if (!error) - error = security_path_truncate(path); + + error = security_file_truncate(filp); if (!error) { - error = do_truncate(path->dentry, 0, + error = do_truncate(idmap, path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, filp); } @@ -2412,13 +4247,19 @@ static inline int open_to_namei_flags(int flag) return flag; } -static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode) +static int may_o_create(struct mnt_idmap *idmap, + const struct path *dir, struct dentry *dentry, + umode_t mode) { int error = security_path_mknod(dir, dentry, mode, 0); if (error) return error; - error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); + if (!fsuidgid_has_mapping(dir->dentry->d_sb, idmap)) + return -EOVERFLOW; + + error = inode_permission(idmap, dir->dentry->d_inode, + MAY_WRITE | MAY_EXEC); if (error) return error; @@ -2432,43 +4273,111 @@ static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode) * Returns 0 if successful. The file will have been created and attached to * @file by the filesystem calling finish_open(). * - * Returns 1 if the file was looked up only or didn't need creating. The - * caller will need to perform the open themselves. @path will have been - * updated to point to the new dentry. This may be negative. + * If the file was looked up only or didn't need creating, FMODE_OPENED won't + * be set. The caller will need to perform the open themselves. @path will + * have been updated to point to the new dentry. This may be negative. * * Returns an error code otherwise. */ -static int atomic_open(struct nameidata *nd, struct dentry *dentry, - struct path *path, struct file *file, - const struct open_flags *op, - bool got_write, bool need_lookup, - int *opened) +static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, + struct file *file, + int open_flag, umode_t mode) { + struct dentry *const DENTRY_NOT_SET = (void *) -1UL; struct inode *dir = nd->path.dentry->d_inode; - unsigned open_flag = open_to_namei_flags(op->open_flag); - umode_t mode; int error; - int acc_mode; - int create_error = 0; - struct dentry *const DENTRY_NOT_SET = (void *) -1UL; - BUG_ON(dentry->d_inode); + if (nd->flags & LOOKUP_DIRECTORY) + open_flag |= O_DIRECTORY; - /* Don't create child dentry for a dead directory. */ - if (unlikely(IS_DEADDIR(dir))) { - error = -ENOENT; - goto out; + file->__f_path.dentry = DENTRY_NOT_SET; + file->__f_path.mnt = nd->path.mnt; + error = dir->i_op->atomic_open(dir, dentry, file, + open_to_namei_flags(open_flag), mode); + d_lookup_done(dentry); + if (!error) { + if (file->f_mode & FMODE_OPENED) { + if (unlikely(dentry != file->f_path.dentry)) { + dput(dentry); + dentry = dget(file->f_path.dentry); + } + } else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) { + error = -EIO; + } else { + if (file->f_path.dentry) { + dput(dentry); + dentry = file->f_path.dentry; + } + if (unlikely(d_is_negative(dentry))) + error = -ENOENT; + } } + if (error) { + dput(dentry); + dentry = ERR_PTR(error); + } + return dentry; +} - mode = op->mode; - if ((open_flag & O_CREAT) && !IS_POSIXACL(dir)) - mode &= ~current_umask(); +/* + * Look up and maybe create and open the last component. + * + * Must be called with parent locked (exclusive in O_CREAT case). + * + * Returns 0 on success, that is, if + * the file was successfully atomically created (if necessary) and opened, or + * the file was not completely opened at this time, though lookups and + * creations were performed. + * These case are distinguished by presence of FMODE_OPENED on file->f_mode. + * In the latter case dentry returned in @path might be negative if O_CREAT + * hadn't been specified. + * + * An error code is returned on failure. + */ +static struct dentry *lookup_open(struct nameidata *nd, struct file *file, + const struct open_flags *op, + bool got_write, struct delegated_inode *delegated_inode) +{ + struct mnt_idmap *idmap; + struct dentry *dir = nd->path.dentry; + struct inode *dir_inode = dir->d_inode; + int open_flag = op->open_flag; + struct dentry *dentry; + int error, create_error = 0; + umode_t mode = op->mode; + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); - if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) { - open_flag &= ~O_TRUNC; - *opened |= FILE_CREATED; + if (unlikely(IS_DEADDIR(dir_inode))) + return ERR_PTR(-ENOENT); + + file->f_mode &= ~FMODE_CREATED; + dentry = d_lookup(dir, &nd->last); + for (;;) { + if (!dentry) { + dentry = d_alloc_parallel(dir, &nd->last, &wq); + if (IS_ERR(dentry)) + return dentry; + } + if (d_in_lookup(dentry)) + break; + + error = d_revalidate(dir_inode, &nd->last, dentry, nd->flags); + if (likely(error > 0)) + break; + if (error) + goto out_dput; + d_invalidate(dentry); + dput(dentry); + dentry = NULL; + } + if (dentry->d_inode) { + /* Cached positive dentry: will open in f_op->open */ + return dentry; } + if (open_flag & O_CREAT) + audit_inode(nd->name, dir, AUDIT_INODE_PARENT); + /* * Checking write permission is tricky, bacuse we don't know if we are * going to actually need it: O_CREAT opens should work as long as the @@ -2478,559 +4387,420 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, * Another problem is returing the "right" error value (e.g. for an * O_EXCL open we want to return EEXIST not EROFS). */ - if (((open_flag & (O_CREAT | O_TRUNC)) || - (open_flag & O_ACCMODE) != O_RDONLY) && unlikely(!got_write)) { - if (!(open_flag & O_CREAT)) { - /* - * No O_CREATE -> atomicity not a requirement -> fall - * back to lookup + open - */ - goto no_open; - } else if (open_flag & (O_EXCL | O_TRUNC)) { - /* Fall back and fail with the right error */ - create_error = -EROFS; - goto no_open; - } else { - /* No side effects, safe to clear O_CREAT */ + if (unlikely(!got_write)) + open_flag &= ~O_TRUNC; + idmap = mnt_idmap(nd->path.mnt); + if (open_flag & O_CREAT) { + if (open_flag & O_EXCL) + open_flag &= ~O_TRUNC; + mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode); + if (likely(got_write)) + create_error = may_o_create(idmap, &nd->path, + dentry, mode); + else create_error = -EROFS; - open_flag &= ~O_CREAT; - } + } + if (create_error) + open_flag &= ~O_CREAT; + if (dir_inode->i_op->atomic_open) { + dentry = atomic_open(nd, dentry, file, open_flag, mode); + if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT)) + dentry = ERR_PTR(create_error); + return dentry; } - if (open_flag & O_CREAT) { - error = may_o_create(&nd->path, dentry, mode); - if (error) { - create_error = error; - if (open_flag & O_EXCL) - goto no_open; - open_flag &= ~O_CREAT; + if (d_in_lookup(dentry)) { + struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry, + nd->flags); + d_lookup_done(dentry); + if (unlikely(res)) { + if (IS_ERR(res)) { + error = PTR_ERR(res); + goto out_dput; + } + dput(dentry); + dentry = res; } } - if (nd->flags & LOOKUP_DIRECTORY) - open_flag |= O_DIRECTORY; + /* Negative dentry, just create the file */ + if (!dentry->d_inode && (open_flag & O_CREAT)) { + /* but break the directory lease first! */ + error = try_break_deleg(dir_inode, delegated_inode); + if (error) + goto out_dput; - file->f_path.dentry = DENTRY_NOT_SET; - file->f_path.mnt = nd->path.mnt; - error = dir->i_op->atomic_open(dir, dentry, file, open_flag, mode, - opened); - if (error < 0) { - if (create_error && error == -ENOENT) - error = create_error; - goto out; - } + file->f_mode |= FMODE_CREATED; + audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE); + if (!dir_inode->i_op->create) { + error = -EACCES; + goto out_dput; + } - acc_mode = op->acc_mode; - if (*opened & FILE_CREATED) { - fsnotify_create(dir, dentry); - acc_mode = MAY_OPEN; + error = dir_inode->i_op->create(idmap, dir_inode, dentry, + mode, open_flag & O_EXCL); + if (error) + goto out_dput; } - - if (error) { /* returned 1, that is */ - if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) { - error = -EIO; - goto out; - } - if (file->f_path.dentry) { - dput(dentry); - dentry = file->f_path.dentry; - } - if (create_error && dentry->d_inode == NULL) { - error = create_error; - goto out; - } - goto looked_up; + if (unlikely(create_error) && !dentry->d_inode) { + error = create_error; + goto out_dput; } + return dentry; - /* - * We didn't have the inode before the open, so check open permission - * here. - */ - error = may_open(&file->f_path, acc_mode, open_flag); - if (error) - fput(file); - -out: +out_dput: dput(dentry); - return error; - -no_open: - if (need_lookup) { - dentry = lookup_real(dir, dentry, nd->flags); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - - if (create_error) { - int open_flag = op->open_flag; + return ERR_PTR(error); +} - error = create_error; - if ((open_flag & O_EXCL)) { - if (!dentry->d_inode) - goto out; - } else if (!dentry->d_inode) { - goto out; - } else if ((open_flag & O_TRUNC) && - S_ISREG(dentry->d_inode->i_mode)) { - goto out; - } - /* will fail later, go on to get the right error */ - } - } -looked_up: - path->dentry = dentry; - path->mnt = nd->path.mnt; - return 1; +static inline bool trailing_slashes(struct nameidata *nd) +{ + return (bool)nd->last.name[nd->last.len]; } -/* - * Look up and maybe create and open the last component. - * - * Must be called with i_mutex held on parent. - * - * Returns 0 if the file was successfully atomically created (if necessary) and - * opened. In this case the file will be returned attached to @file. - * - * Returns 1 if the file was not completely opened at this time, though lookups - * and creations will have been performed and the dentry returned in @path will - * be positive upon return if O_CREAT was specified. If O_CREAT wasn't - * specified then a negative dentry may be returned. - * - * An error code is returned otherwise. - * - * FILE_CREATE will be set in @*opened if the dentry was created and will be - * cleared otherwise prior to returning. - */ -static int lookup_open(struct nameidata *nd, struct path *path, - struct file *file, - const struct open_flags *op, - bool got_write, int *opened) +static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag) { - struct dentry *dir = nd->path.dentry; - struct inode *dir_inode = dir->d_inode; struct dentry *dentry; - int error; - bool need_lookup; - - *opened &= ~FILE_CREATED; - dentry = lookup_dcache(&nd->last, dir, nd->flags, &need_lookup); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - /* Cached positive dentry: will open in f_op->open */ - if (!need_lookup && dentry->d_inode) - goto out_no_open; + if (open_flag & O_CREAT) { + if (trailing_slashes(nd)) + return ERR_PTR(-EISDIR); - if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) { - return atomic_open(nd, dentry, path, file, op, got_write, - need_lookup, opened); + /* Don't bother on an O_EXCL create */ + if (open_flag & O_EXCL) + return NULL; } - if (need_lookup) { - BUG_ON(dentry->d_inode); + if (trailing_slashes(nd)) + nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; - dentry = lookup_real(dir_inode, dentry, nd->flags); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - } + dentry = lookup_fast(nd); + if (IS_ERR_OR_NULL(dentry)) + return dentry; - /* Negative dentry, just create the file */ - if (!dentry->d_inode && (op->open_flag & O_CREAT)) { - umode_t mode = op->mode; - if (!IS_POSIXACL(dir->d_inode)) - mode &= ~current_umask(); - /* - * This write is needed to ensure that a - * rw->ro transition does not occur between - * the time when the file is created and when - * a permanent write count is taken through - * the 'struct file' in finish_open(). - */ - if (!got_write) { - error = -EROFS; - goto out_dput; + if (open_flag & O_CREAT) { + /* Discard negative dentries. Need inode_lock to do the create */ + if (!dentry->d_inode) { + if (!(nd->flags & LOOKUP_RCU)) + dput(dentry); + dentry = NULL; } - *opened |= FILE_CREATED; - error = security_path_mknod(&nd->path, dentry, mode, 0); - if (error) - goto out_dput; - error = vfs_create(dir->d_inode, dentry, mode, - nd->flags & LOOKUP_EXCL); - if (error) - goto out_dput; } -out_no_open: - path->dentry = dentry; - path->mnt = nd->path.mnt; - return 1; - -out_dput: - dput(dentry); - return error; + return dentry; } -/* - * Handle the last step of open() - */ -static int do_last(struct nameidata *nd, struct path *path, - struct file *file, const struct open_flags *op, - int *opened, struct filename *name) +static const char *open_last_lookups(struct nameidata *nd, + struct file *file, const struct open_flags *op) { + struct delegated_inode delegated_inode = { }; struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; - bool will_truncate = (open_flag & O_TRUNC) != 0; bool got_write = false; - int acc_mode = op->acc_mode; - struct inode *inode; - bool symlink_ok = false; - struct path save_parent = { .dentry = NULL, .mnt = NULL }; - bool retried = false; - int error; + struct dentry *dentry; + const char *res; - nd->flags &= ~LOOKUP_PARENT; nd->flags |= op->intent; if (nd->last_type != LAST_NORM) { - error = handle_dots(nd, nd->last_type); - if (error) - return error; - goto finish_open; + if (nd->depth) + put_link(nd); + return handle_dots(nd, nd->last_type); } - if (!(open_flag & O_CREAT)) { - if (nd->last.name[nd->last.len]) - nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; - if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW)) - symlink_ok = true; - /* we _can_ be in RCU mode here */ - error = lookup_fast(nd, path, &inode); - if (likely(!error)) - goto finish_lookup; - - if (error < 0) - goto out; + /* We _can_ be in RCU mode here */ + dentry = lookup_fast_for_open(nd, open_flag); + if (IS_ERR(dentry)) + return ERR_CAST(dentry); - BUG_ON(nd->inode != dir->d_inode); - } else { - /* create side of things */ - /* - * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED - * has been cleared when we got to the last component we are - * about to look up - */ - error = complete_walk(nd); - if (error) - return error; + if (likely(dentry)) + goto finish_lookup; - audit_inode(name, dir, LOOKUP_PARENT); - error = -EISDIR; - /* trailing slashes? */ - if (nd->last.name[nd->last.len]) - goto out; + if (!(open_flag & O_CREAT)) { + if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU)) + return ERR_PTR(-ECHILD); + } else { + if (nd->flags & LOOKUP_RCU) { + if (!try_to_unlazy(nd)) + return ERR_PTR(-ECHILD); + } } - -retry_lookup: - if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { - error = mnt_want_write(nd->path.mnt); - if (!error) - got_write = true; +retry: + if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { + got_write = !mnt_want_write(nd->path.mnt); /* * do _not_ fail yet - we might not need that or fail with * a different error; let lookup_open() decide; we'll be * dropping this one anyway. */ } - mutex_lock(&dir->d_inode->i_mutex); - error = lookup_open(nd, path, file, op, got_write, opened); - mutex_unlock(&dir->d_inode->i_mutex); - - if (error <= 0) { - if (error) - goto out; - - if ((*opened & FILE_CREATED) || - !S_ISREG(file_inode(file)->i_mode)) - will_truncate = false; - - audit_inode(name, file->f_path.dentry, 0); - goto opened; - } - - if (*opened & FILE_CREATED) { - /* Don't check for write permission, don't truncate */ - open_flag &= ~O_TRUNC; - will_truncate = false; - acc_mode = MAY_OPEN; - path_to_nameidata(path, nd); - goto finish_open_created; + if (open_flag & O_CREAT) + inode_lock(dir->d_inode); + else + inode_lock_shared(dir->d_inode); + dentry = lookup_open(nd, file, op, got_write, &delegated_inode); + if (!IS_ERR(dentry)) { + if (file->f_mode & FMODE_CREATED) + fsnotify_create(dir->d_inode, dentry); + if (file->f_mode & FMODE_OPENED) + fsnotify_open(file); } + if (open_flag & O_CREAT) + inode_unlock(dir->d_inode); + else + inode_unlock_shared(dir->d_inode); - /* - * create/update audit record if it already exists. - */ - if (path->dentry->d_inode) - audit_inode(name, path->dentry, 0); - - /* - * If atomic_open() acquired write access it is dropped now due to - * possible mount and symlink following (this might be optimized away if - * necessary...) - */ - if (got_write) { + if (got_write) mnt_drop_write(nd->path.mnt); - got_write = false; - } - - error = -EEXIST; - if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) - goto exit_dput; - error = follow_managed(path, nd->flags); - if (error < 0) - goto exit_dput; - - if (error) - nd->flags |= LOOKUP_JUMPED; + if (IS_ERR(dentry)) { + if (is_delegated(&delegated_inode)) { + int error = break_deleg_wait(&delegated_inode); - BUG_ON(nd->flags & LOOKUP_RCU); - inode = path->dentry->d_inode; -finish_lookup: - /* we _can_ be in RCU mode here */ - error = -ENOENT; - if (!inode) { - path_to_nameidata(path, nd); - goto out; + if (!error) + goto retry; + return ERR_PTR(error); + } + return ERR_CAST(dentry); } - if (should_follow_link(inode, !symlink_ok)) { - if (nd->flags & LOOKUP_RCU) { - if (unlikely(unlazy_walk(nd, path->dentry))) { - error = -ECHILD; - goto out; - } - } - BUG_ON(inode != path->dentry->d_inode); - return 1; + if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) { + dput(nd->path.dentry); + nd->path.dentry = dentry; + return NULL; } - if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path->mnt) { - path_to_nameidata(path, nd); - } else { - save_parent.dentry = nd->path.dentry; - save_parent.mnt = mntget(path->mnt); - nd->path.dentry = path->dentry; +finish_lookup: + if (nd->depth) + put_link(nd); + res = step_into(nd, WALK_TRAILING, dentry); + if (unlikely(res)) + nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); + return res; +} + +/* + * Handle the last step of open() + */ +static int do_open(struct nameidata *nd, + struct file *file, const struct open_flags *op) +{ + struct mnt_idmap *idmap; + int open_flag = op->open_flag; + bool do_truncate; + int acc_mode; + int error; + if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) { + error = complete_walk(nd); + if (error) + return error; } - nd->inode = inode; - /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ -finish_open: - error = complete_walk(nd); - if (error) { - path_put(&save_parent); - return error; + if (!(file->f_mode & FMODE_CREATED)) + audit_inode(nd->name, nd->path.dentry, 0); + idmap = mnt_idmap(nd->path.mnt); + if (open_flag & O_CREAT) { + if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED)) + return -EEXIST; + if (d_is_dir(nd->path.dentry)) + return -EISDIR; + error = may_create_in_sticky(idmap, nd, + d_backing_inode(nd->path.dentry)); + if (unlikely(error)) + return error; } - audit_inode(name, nd->path.dentry, 0); - error = -EISDIR; - if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode)) - goto out; - error = -ENOTDIR; - if ((nd->flags & LOOKUP_DIRECTORY) && !can_lookup(nd->inode)) - goto out; - if (!S_ISREG(nd->inode->i_mode)) - will_truncate = false; + if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) + return -ENOTDIR; - if (will_truncate) { + do_truncate = false; + acc_mode = op->acc_mode; + if (file->f_mode & FMODE_CREATED) { + /* Don't check for write permission, don't truncate */ + open_flag &= ~O_TRUNC; + acc_mode = 0; + } else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) { error = mnt_want_write(nd->path.mnt); if (error) - goto out; - got_write = true; + return error; + do_truncate = true; } -finish_open_created: - error = may_open(&nd->path, acc_mode, open_flag); - if (error) - goto out; - file->f_path.mnt = nd->path.mnt; - error = finish_open(file, nd->path.dentry, NULL, opened); - if (error) { - if (error == -EOPENSTALE) - goto stale_open; - goto out; + error = may_open(idmap, &nd->path, acc_mode, open_flag); + if (!error && !(file->f_mode & FMODE_OPENED)) + error = vfs_open(&nd->path, file); + if (!error) + error = security_file_post_open(file, op->acc_mode); + if (!error && do_truncate) + error = handle_truncate(idmap, file); + if (unlikely(error > 0)) { + WARN_ON(1); + error = -EINVAL; } -opened: - error = open_check_o_direct(file); + if (do_truncate) + mnt_drop_write(nd->path.mnt); + return error; +} + +/** + * vfs_tmpfile - create tmpfile + * @idmap: idmap of the mount the inode was found from + * @parentpath: pointer to the path of the base directory + * @file: file descriptor of the new tmpfile + * @mode: mode of the new tmpfile + * + * Create a temporary file. + * + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply pass @nop_mnt_idmap. + */ +int vfs_tmpfile(struct mnt_idmap *idmap, + const struct path *parentpath, + struct file *file, umode_t mode) +{ + struct dentry *child; + struct inode *dir = d_inode(parentpath->dentry); + struct inode *inode; + int error; + int open_flag = file->f_flags; + + /* we want directory to be writable */ + error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) - goto exit_fput; - error = ima_file_check(file, op->acc_mode); + return error; + if (!dir->i_op->tmpfile) + return -EOPNOTSUPP; + child = d_alloc(parentpath->dentry, &slash_name); + if (unlikely(!child)) + return -ENOMEM; + file->__f_path.mnt = parentpath->mnt; + file->__f_path.dentry = child; + mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); + error = dir->i_op->tmpfile(idmap, dir, file, mode); + dput(child); + if (file->f_mode & FMODE_OPENED) + fsnotify_open(file); if (error) - goto exit_fput; - - if (will_truncate) { - error = handle_truncate(file); - if (error) - goto exit_fput; + return error; + /* Don't check for other permissions, the inode was just created */ + error = may_open(idmap, &file->f_path, 0, file->f_flags); + if (error) + return error; + inode = file_inode(file); + if (!(open_flag & O_EXCL)) { + spin_lock(&inode->i_lock); + inode_state_set(inode, I_LINKABLE); + spin_unlock(&inode->i_lock); } -out: - if (got_write) - mnt_drop_write(nd->path.mnt); - path_put(&save_parent); - terminate_walk(nd); - return error; + security_inode_post_create_tmpfile(idmap, inode); + return 0; +} -exit_dput: - path_put_conditional(path, nd); - goto out; -exit_fput: - fput(file); - goto out; +/** + * kernel_tmpfile_open - open a tmpfile for kernel internal use + * @idmap: idmap of the mount the inode was found from + * @parentpath: path of the base directory + * @mode: mode of the new tmpfile + * @open_flag: flags + * @cred: credentials for open + * + * Create and open a temporary file. The file is not accounted in nr_files, + * hence this is only for kernel internal use, and must not be installed into + * file tables or such. + */ +struct file *kernel_tmpfile_open(struct mnt_idmap *idmap, + const struct path *parentpath, + umode_t mode, int open_flag, + const struct cred *cred) +{ + struct file *file; + int error; -stale_open: - /* If no saved parent or already retried then can't retry */ - if (!save_parent.dentry || retried) - goto out; + file = alloc_empty_file_noaccount(open_flag, cred); + if (IS_ERR(file)) + return file; - BUG_ON(save_parent.dentry != dir); - path_put(&nd->path); - nd->path = save_parent; - nd->inode = dir->d_inode; - save_parent.mnt = NULL; - save_parent.dentry = NULL; - if (got_write) { - mnt_drop_write(nd->path.mnt); - got_write = false; + error = vfs_tmpfile(idmap, parentpath, file, mode); + if (error) { + fput(file); + file = ERR_PTR(error); } - retried = true; - goto retry_lookup; + return file; } +EXPORT_SYMBOL(kernel_tmpfile_open); -static int do_tmpfile(int dfd, struct filename *pathname, - struct nameidata *nd, int flags, +static int do_tmpfile(struct nameidata *nd, unsigned flags, const struct open_flags *op, - struct file *file, int *opened) + struct file *file) { - static const struct qstr name = QSTR_INIT("/", 1); - struct dentry *dentry, *child; - struct inode *dir; - int error = path_lookupat(dfd, pathname->name, - flags | LOOKUP_DIRECTORY, nd); + struct path path; + int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path); + if (unlikely(error)) return error; - error = mnt_want_write(nd->path.mnt); + error = mnt_want_write(path.mnt); if (unlikely(error)) goto out; - /* we want directory to be writable */ - error = inode_permission(nd->inode, MAY_WRITE | MAY_EXEC); - if (error) - goto out2; - dentry = nd->path.dentry; - dir = dentry->d_inode; - if (!dir->i_op->tmpfile) { - error = -EOPNOTSUPP; - goto out2; - } - child = d_alloc(dentry, &name); - if (unlikely(!child)) { - error = -ENOMEM; - goto out2; - } - nd->flags &= ~LOOKUP_DIRECTORY; - nd->flags |= op->intent; - dput(nd->path.dentry); - nd->path.dentry = child; - error = dir->i_op->tmpfile(dir, nd->path.dentry, op->mode); - if (error) - goto out2; - audit_inode(pathname, nd->path.dentry, 0); - error = may_open(&nd->path, op->acc_mode, op->open_flag); - if (error) - goto out2; - file->f_path.mnt = nd->path.mnt; - error = finish_open(file, nd->path.dentry, NULL, opened); + error = vfs_tmpfile(mnt_idmap(path.mnt), &path, file, op->mode); if (error) goto out2; - error = open_check_o_direct(file); - if (error) { - fput(file); - } else if (!(op->open_flag & O_EXCL)) { - struct inode *inode = file_inode(file); - spin_lock(&inode->i_lock); - inode->i_state |= I_LINKABLE; - spin_unlock(&inode->i_lock); - } + audit_inode(nd->name, file->f_path.dentry, 0); out2: - mnt_drop_write(nd->path.mnt); + mnt_drop_write(path.mnt); out: - path_put(&nd->path); + path_put(&path); return error; } -static struct file *path_openat(int dfd, struct filename *pathname, - struct nameidata *nd, const struct open_flags *op, int flags) +static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file) { - struct file *base = NULL; - struct file *file; struct path path; - int opened = 0; + int error = path_lookupat(nd, flags, &path); + if (!error) { + audit_inode(nd->name, path.dentry, 0); + error = vfs_open(&path, file); + path_put(&path); + } + return error; +} + +static struct file *path_openat(struct nameidata *nd, + const struct open_flags *op, unsigned flags) +{ + struct file *file; int error; - file = get_empty_filp(); + file = alloc_empty_file(op->open_flag, current_cred()); if (IS_ERR(file)) return file; - file->f_flags = op->open_flag; - if (unlikely(file->f_flags & __O_TMPFILE)) { - error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened); - goto out; - } - - error = path_init(dfd, pathname->name, flags | LOOKUP_PARENT, nd, &base); - if (unlikely(error)) - goto out; - - current->total_link_count = 0; - error = link_path_walk(pathname->name, nd); - if (unlikely(error)) - goto out; - - error = do_last(nd, &path, file, op, &opened, pathname); - while (unlikely(error > 0)) { /* trailing symlink */ - struct path link = path; - void *cookie; - if (!(nd->flags & LOOKUP_FOLLOW)) { - path_put_conditional(&path, nd); - path_put(&nd->path); - error = -ELOOP; - break; - } - error = may_follow_link(&link, nd); - if (unlikely(error)) - break; - nd->flags |= LOOKUP_PARENT; - nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); - error = follow_link(&link, nd, &cookie); - if (unlikely(error)) - break; - error = do_last(nd, &path, file, op, &opened, pathname); - put_link(nd, &link, cookie); + error = do_tmpfile(nd, flags, op, file); + } else if (unlikely(file->f_flags & O_PATH)) { + error = do_o_path(nd, flags, file); + } else { + const char *s = path_init(nd, flags); + while (!(error = link_path_walk(s, nd)) && + (s = open_last_lookups(nd, file, op)) != NULL) + ; + if (!error) + error = do_open(nd, file, op); + terminate_walk(nd); } -out: - if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) - path_put(&nd->root); - if (base) - fput(base); - if (!(opened & FILE_OPENED)) { - BUG_ON(!error); - put_filp(file); + if (likely(!error)) { + if (likely(file->f_mode & FMODE_OPENED)) + return file; + WARN_ON(1); + error = -EINVAL; } - if (unlikely(error)) { - if (error == -EOPENSTALE) { - if (flags & LOOKUP_RCU) - error = -ECHILD; - else - error = -ESTALE; - } - file = ERR_PTR(error); + fput_close(file); + if (error == -EOPENSTALE) { + if (flags & LOOKUP_RCU) + error = -ECHILD; + else + error = -ESTALE; } - return file; + return ERR_PTR(error); } struct file *do_filp_open(int dfd, struct filename *pathname, @@ -3040,52 +4810,54 @@ struct file *do_filp_open(int dfd, struct filename *pathname, int flags = op->lookup_flags; struct file *filp; - filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU); + set_nameidata(&nd, dfd, pathname, NULL); + filp = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(filp == ERR_PTR(-ECHILD))) - filp = path_openat(dfd, pathname, &nd, op, flags); + filp = path_openat(&nd, op, flags); if (unlikely(filp == ERR_PTR(-ESTALE))) - filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL); + filp = path_openat(&nd, op, flags | LOOKUP_REVAL); + restore_nameidata(); return filp; } -struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, +struct file *do_file_open_root(const struct path *root, const char *name, const struct open_flags *op) { struct nameidata nd; struct file *file; - struct filename filename = { .name = name }; - int flags = op->lookup_flags | LOOKUP_ROOT; - - nd.root.mnt = mnt; - nd.root.dentry = dentry; + struct filename *filename; + int flags = op->lookup_flags; - if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN) + if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); - file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_RCU); + filename = getname_kernel(name); + if (IS_ERR(filename)) + return ERR_CAST(filename); + + set_nameidata(&nd, -1, filename, root); + file = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(file == ERR_PTR(-ECHILD))) - file = path_openat(-1, &filename, &nd, op, flags); + file = path_openat(&nd, op, flags); if (unlikely(file == ERR_PTR(-ESTALE))) - file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_REVAL); + file = path_openat(&nd, op, flags | LOOKUP_REVAL); + restore_nameidata(); + putname(filename); return file; } -struct dentry *kern_path_create(int dfd, const char *pathname, - struct path *path, unsigned int lookup_flags) +static struct dentry *filename_create(int dfd, struct filename *name, + struct path *path, unsigned int lookup_flags) { struct dentry *dentry = ERR_PTR(-EEXIST); - struct nameidata nd; - int err2; + struct qstr last; + bool want_dir = lookup_flags & LOOKUP_DIRECTORY; + unsigned int reval_flag = lookup_flags & LOOKUP_REVAL; + unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL; + int type; int error; - bool is_dir = (lookup_flags & LOOKUP_DIRECTORY); - /* - * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any - * other flags passed in are ignored! - */ - lookup_flags &= LOOKUP_REVAL; - - error = do_path_lookup(dfd, pathname, LOOKUP_PARENT|lookup_flags, &nd); + error = filename_parentat(dfd, name, reval_flag, path, &last, &type); if (error) return ERR_PTR(error); @@ -3093,88 +4865,114 @@ struct dentry *kern_path_create(int dfd, const char *pathname, * Yucky last component or no last component at all? * (foo/., foo/.., /////) */ - if (nd.last_type != LAST_NORM) + if (unlikely(type != LAST_NORM)) goto out; - nd.flags &= ~LOOKUP_PARENT; - nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL; /* don't fail immediately if it's r/o, at least try to report other errors */ - err2 = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path->mnt); /* - * Do the final lookup. + * Do the final lookup. Suppress 'create' if there is a trailing + * '/', and a directory wasn't requested. */ - mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); - dentry = lookup_hash(&nd); + if (last.name[last.len] && !want_dir) + create_flags &= ~LOOKUP_CREATE; + dentry = start_dirop(path->dentry, &last, reval_flag | create_flags); if (IS_ERR(dentry)) - goto unlock; + goto out_drop_write; - error = -EEXIST; - if (dentry->d_inode) - goto fail; - /* - * Special case - lookup gave negative, but... we had foo/bar/ - * From the vfs_mknod() POV we just have a negative dentry - - * all is fine. Let's be bastards - you had / on the end, you've - * been asking for (non-existent) directory. -ENOENT for you. - */ - if (unlikely(!is_dir && nd.last.name[nd.last.len])) { - error = -ENOENT; - goto fail; - } - if (unlikely(err2)) { - error = err2; + if (unlikely(error)) goto fail; - } - *path = nd.path; + return dentry; fail: - dput(dentry); + end_dirop(dentry); dentry = ERR_PTR(error); -unlock: - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - if (!err2) - mnt_drop_write(nd.path.mnt); +out_drop_write: + if (!error) + mnt_drop_write(path->mnt); out: - path_put(&nd.path); + path_put(path); return dentry; } -EXPORT_SYMBOL(kern_path_create); -void done_path_create(struct path *path, struct dentry *dentry) +struct dentry *start_creating_path(int dfd, const char *pathname, + struct path *path, unsigned int lookup_flags) { - dput(dentry); - mutex_unlock(&path->dentry->d_inode->i_mutex); + struct filename *filename = getname_kernel(pathname); + struct dentry *res = filename_create(dfd, filename, path, lookup_flags); + + putname(filename); + return res; +} +EXPORT_SYMBOL(start_creating_path); + +/** + * end_creating_path - finish a code section started by start_creating_path() + * @path: the path instantiated by start_creating_path() + * @dentry: the dentry returned by start_creating_path() + * + * end_creating_path() will unlock and locks taken by start_creating_path() + * and drop an references that were taken. It should only be called + * if start_creating_path() returned a non-error. + * If vfs_mkdir() was called and it returned an error, that error *should* + * be passed to end_creating_path() together with the path. + */ +void end_creating_path(const struct path *path, struct dentry *dentry) +{ + end_creating(dentry); mnt_drop_write(path->mnt); path_put(path); } -EXPORT_SYMBOL(done_path_create); +EXPORT_SYMBOL(end_creating_path); -struct dentry *user_path_create(int dfd, const char __user *pathname, - struct path *path, unsigned int lookup_flags) +inline struct dentry *start_creating_user_path( + int dfd, const char __user *pathname, + struct path *path, unsigned int lookup_flags) { - struct filename *tmp = getname(pathname); - struct dentry *res; - if (IS_ERR(tmp)) - return ERR_CAST(tmp); - res = kern_path_create(dfd, tmp->name, path, lookup_flags); - putname(tmp); + struct filename *filename = getname(pathname); + struct dentry *res = filename_create(dfd, filename, path, lookup_flags); + + putname(filename); return res; } -EXPORT_SYMBOL(user_path_create); +EXPORT_SYMBOL(start_creating_user_path); + -int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) +/** + * vfs_mknod - create device node or file + * @idmap: idmap of the mount the inode was found from + * @dir: inode of the parent directory + * @dentry: dentry of the child device node + * @mode: mode of the child device node + * @dev: device number of device to create + * @delegated_inode: returns parent inode, if the inode is delegated. + * + * Create a device node or file. + * + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply pass @nop_mnt_idmap. + */ +int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev, + struct delegated_inode *delegated_inode) { - int error = may_create(dir, dentry); + bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV; + int error = may_create(idmap, dir, dentry); if (error) return error; - if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) + if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout && + !capable(CAP_MKNOD)) return -EPERM; if (!dir->i_op->mknod) return -EPERM; + mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = devcgroup_inode_mknod(mode, dev); if (error) return error; @@ -3183,11 +4981,16 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) if (error) return error; - error = dir->i_op->mknod(dir, dentry, mode, dev); + error = try_break_deleg(dir, delegated_inode); + if (error) + return error; + + error = dir->i_op->mknod(idmap, dir, dentry, mode, dev); if (!error) fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_mknod); static int may_mknod(umode_t mode) { @@ -3206,9 +5009,11 @@ static int may_mknod(umode_t mode) } } -SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, - unsigned, dev) +static int do_mknodat(int dfd, struct filename *name, umode_t mode, + unsigned int dev) { + struct delegated_inode di = { }; + struct mnt_idmap *idmap; struct dentry *dentry; struct path path; int error; @@ -3216,125 +5021,195 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, error = may_mknod(mode); if (error) - return error; + goto out1; retry: - dentry = user_path_create(dfd, filename, &path, lookup_flags); + dentry = filename_create(dfd, name, &path, lookup_flags); + error = PTR_ERR(dentry); if (IS_ERR(dentry)) - return PTR_ERR(dentry); + goto out1; - if (!IS_POSIXACL(path.dentry->d_inode)) - mode &= ~current_umask(); - error = security_path_mknod(&path, dentry, mode, dev); + error = security_path_mknod(&path, dentry, + mode_strip_umask(path.dentry->d_inode, mode), dev); if (error) - goto out; + goto out2; + + idmap = mnt_idmap(path.mnt); switch (mode & S_IFMT) { case 0: case S_IFREG: - error = vfs_create(path.dentry->d_inode,dentry,mode,true); + error = vfs_create(idmap, dentry, mode, &di); + if (!error) + security_path_post_mknod(idmap, dentry); break; case S_IFCHR: case S_IFBLK: - error = vfs_mknod(path.dentry->d_inode,dentry,mode, - new_decode_dev(dev)); + error = vfs_mknod(idmap, path.dentry->d_inode, + dentry, mode, new_decode_dev(dev), &di); break; case S_IFIFO: case S_IFSOCK: - error = vfs_mknod(path.dentry->d_inode,dentry,mode,0); + error = vfs_mknod(idmap, path.dentry->d_inode, + dentry, mode, 0, &di); break; } -out: - done_path_create(&path, dentry); +out2: + end_creating_path(&path, dentry); + if (is_delegated(&di)) { + error = break_deleg_wait(&di); + if (!error) + goto retry; + } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } +out1: + putname(name); return error; } +SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, + unsigned int, dev) +{ + return do_mknodat(dfd, getname(filename), mode, dev); +} + SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev) { - return sys_mknodat(AT_FDCWD, filename, mode, dev); + return do_mknodat(AT_FDCWD, getname(filename), mode, dev); } -int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +/** + * vfs_mkdir - create directory returning correct dentry if possible + * @idmap: idmap of the mount the inode was found from + * @dir: inode of the parent directory + * @dentry: dentry of the child directory + * @mode: mode of the child directory + * @delegated_inode: returns parent inode, if the inode is delegated. + * + * Create a directory. + * + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply pass @nop_mnt_idmap. + * + * In the event that the filesystem does not use the *@dentry but leaves it + * negative or unhashes it and possibly splices a different one returning it, + * the original dentry is dput() and the alternate is returned. + * + * In case of an error the dentry is dput() and an ERR_PTR() is returned. + */ +struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode, + struct delegated_inode *delegated_inode) { - int error = may_create(dir, dentry); + int error; unsigned max_links = dir->i_sb->s_max_links; + struct dentry *de; + error = may_create(idmap, dir, dentry); if (error) - return error; + goto err; + error = -EPERM; if (!dir->i_op->mkdir) - return -EPERM; + goto err; - mode &= (S_IRWXUGO|S_ISVTX); + mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0); error = security_inode_mkdir(dir, dentry, mode); if (error) - return error; + goto err; + error = -EMLINK; if (max_links && dir->i_nlink >= max_links) - return -EMLINK; + goto err; - error = dir->i_op->mkdir(dir, dentry, mode); - if (!error) - fsnotify_mkdir(dir, dentry); - return error; + error = try_break_deleg(dir, delegated_inode); + if (error) + goto err; + + de = dir->i_op->mkdir(idmap, dir, dentry, mode); + error = PTR_ERR(de); + if (IS_ERR(de)) + goto err; + if (de) { + dput(dentry); + dentry = de; + } + fsnotify_mkdir(dir, dentry); + return dentry; + +err: + end_creating(dentry); + return ERR_PTR(error); } +EXPORT_SYMBOL(vfs_mkdir); -SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) +int do_mkdirat(int dfd, struct filename *name, umode_t mode) { struct dentry *dentry; struct path path; int error; unsigned int lookup_flags = LOOKUP_DIRECTORY; + struct delegated_inode delegated_inode = { }; retry: - dentry = user_path_create(dfd, pathname, &path, lookup_flags); + dentry = filename_create(dfd, name, &path, lookup_flags); + error = PTR_ERR(dentry); if (IS_ERR(dentry)) - return PTR_ERR(dentry); + goto out_putname; - if (!IS_POSIXACL(path.dentry->d_inode)) - mode &= ~current_umask(); - error = security_path_mkdir(&path, dentry, mode); - if (!error) - error = vfs_mkdir(path.dentry->d_inode, dentry, mode); - done_path_create(&path, dentry); + error = security_path_mkdir(&path, dentry, + mode_strip_umask(path.dentry->d_inode, mode)); + if (!error) { + dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, + dentry, mode, &delegated_inode); + if (IS_ERR(dentry)) + error = PTR_ERR(dentry); + } + end_creating_path(&path, dentry); + if (is_delegated(&delegated_inode)) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry; + } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } +out_putname: + putname(name); return error; } +SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) +{ + return do_mkdirat(dfd, getname(pathname), mode); +} + SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) { - return sys_mkdirat(AT_FDCWD, pathname, mode); + return do_mkdirat(AT_FDCWD, getname(pathname), mode); } -/* - * The dentry_unhash() helper will try to drop the dentry early: we - * should have a usage count of 1 if we're the only user of this - * dentry, and if that is true (possibly after pruning the dcache), - * then we drop the dentry now. - * - * A low-level filesystem can, if it choses, legally - * do a +/** + * vfs_rmdir - remove directory + * @idmap: idmap of the mount the inode was found from + * @dir: inode of the parent directory + * @dentry: dentry of the child directory + * @delegated_inode: returns parent inode, if it's delegated. * - * if (!d_unhashed(dentry)) - * return -EBUSY; + * Remove a directory. * - * if it cannot handle the case of removing a directory - * that is still in use by something else.. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply pass @nop_mnt_idmap. */ -void dentry_unhash(struct dentry *dentry) +int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, struct delegated_inode *delegated_inode) { - shrink_dcache_parent(dentry); - spin_lock(&dentry->d_lock); - if (dentry->d_count == 1) - __d_drop(dentry); - spin_unlock(&dentry->d_lock); -} - -int vfs_rmdir(struct inode *dir, struct dentry *dentry) -{ - int error = may_delete(dir, dentry, 1); + int error = may_delete(idmap, dir, dentry, 1); if (error) return error; @@ -3343,97 +5218,133 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) return -EPERM; dget(dentry); - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock(dentry->d_inode); error = -EBUSY; - if (d_mountpoint(dentry)) + if (is_local_mountpoint(dentry) || + (dentry->d_inode->i_flags & S_KERNEL_FILE)) goto out; error = security_inode_rmdir(dir, dentry); if (error) goto out; - shrink_dcache_parent(dentry); + error = try_break_deleg(dir, delegated_inode); + if (error) + goto out; + error = dir->i_op->rmdir(dir, dentry); if (error) goto out; + shrink_dcache_parent(dentry); dentry->d_inode->i_flags |= S_DEAD; dont_mount(dentry); + detach_mounts(dentry); out: - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); dput(dentry); if (!error) - d_delete(dentry); + d_delete_notify(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_rmdir); -static long do_rmdir(int dfd, const char __user *pathname) +int do_rmdir(int dfd, struct filename *name) { - int error = 0; - struct filename *name; + int error; struct dentry *dentry; - struct nameidata nd; + struct path path; + struct qstr last; + int type; unsigned int lookup_flags = 0; + struct delegated_inode delegated_inode = { }; retry: - name = user_path_parent(dfd, pathname, &nd, lookup_flags); - if (IS_ERR(name)) - return PTR_ERR(name); + error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); + if (error) + goto exit1; - switch(nd.last_type) { + switch (type) { case LAST_DOTDOT: error = -ENOTEMPTY; - goto exit1; + goto exit2; case LAST_DOT: error = -EINVAL; - goto exit1; + goto exit2; case LAST_ROOT: error = -EBUSY; - goto exit1; + goto exit2; } - nd.flags &= ~LOOKUP_PARENT; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (error) - goto exit1; + goto exit2; - mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); - dentry = lookup_hash(&nd); + dentry = start_dirop(path.dentry, &last, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) - goto exit2; - if (!dentry->d_inode) { - error = -ENOENT; goto exit3; - } - error = security_path_rmdir(&nd.path, dentry); + error = security_path_rmdir(&path, dentry); if (error) - goto exit3; - error = vfs_rmdir(nd.path.dentry->d_inode, dentry); + goto exit4; + error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, + dentry, &delegated_inode); +exit4: + end_dirop(dentry); exit3: - dput(dentry); + mnt_drop_write(path.mnt); exit2: - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - mnt_drop_write(nd.path.mnt); -exit1: - path_put(&nd.path); - putname(name); + path_put(&path); + if (is_delegated(&delegated_inode)) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry; + } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } +exit1: + putname(name); return error; } SYSCALL_DEFINE1(rmdir, const char __user *, pathname) { - return do_rmdir(AT_FDCWD, pathname); + return do_rmdir(AT_FDCWD, getname(pathname)); } -int vfs_unlink(struct inode *dir, struct dentry *dentry) +/** + * vfs_unlink - unlink a filesystem object + * @idmap: idmap of the mount the inode was found from + * @dir: parent directory + * @dentry: victim + * @delegated_inode: returns victim inode, if the inode is delegated. + * + * The caller must hold dir->i_rwsem exclusively. + * + * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and + * return a reference to the inode in delegated_inode. The caller + * should then break the delegation on that inode and retry. Because + * breaking a delegation may take a long time, the caller should drop + * dir->i_rwsem before doing so. + * + * Alternatively, a caller may pass NULL for delegated_inode. This may + * be appropriate for callers that expect the underlying filesystem not + * to be NFS exported. + * + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply pass @nop_mnt_idmap. + */ +int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, struct delegated_inode *delegated_inode) { - int error = may_delete(dir, dentry, 0); + struct inode *target = dentry->d_inode; + int error = may_delete(idmap, dir, dentry, 0); if (error) return error; @@ -3441,92 +5352,111 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) if (!dir->i_op->unlink) return -EPERM; - mutex_lock(&dentry->d_inode->i_mutex); - if (d_mountpoint(dentry)) + inode_lock(target); + if (IS_SWAPFILE(target)) + error = -EPERM; + else if (is_local_mountpoint(dentry)) error = -EBUSY; else { error = security_inode_unlink(dir, dentry); if (!error) { + error = try_break_deleg(dir, delegated_inode); + if (error) + goto out; + error = try_break_deleg(target, delegated_inode); + if (error) + goto out; error = dir->i_op->unlink(dir, dentry); - if (!error) + if (!error) { dont_mount(dentry); + detach_mounts(dentry); + } } } - mutex_unlock(&dentry->d_inode->i_mutex); +out: + inode_unlock(target); /* We don't d_delete() NFS sillyrenamed files--they still exist. */ - if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) { - fsnotify_link_count(dentry->d_inode); - d_delete(dentry); + if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) { + fsnotify_unlink(dir, dentry); + } else if (!error) { + fsnotify_link_count(target); + d_delete_notify(dir, dentry); } return error; } +EXPORT_SYMBOL(vfs_unlink); /* * Make sure that the actual truncation of the file will occur outside its - * directory's i_mutex. Truncate can take a long time if there is a lot of + * directory's i_rwsem. Truncate can take a long time if there is a lot of * writeout happening, and we don't want to prevent access to the directory * while waiting on the I/O. */ -static long do_unlinkat(int dfd, const char __user *pathname) +int do_unlinkat(int dfd, struct filename *name) { int error; - struct filename *name; struct dentry *dentry; - struct nameidata nd; - struct inode *inode = NULL; + struct path path; + struct qstr last; + int type; + struct inode *inode; + struct delegated_inode delegated_inode = { }; unsigned int lookup_flags = 0; retry: - name = user_path_parent(dfd, pathname, &nd, lookup_flags); - if (IS_ERR(name)) - return PTR_ERR(name); + error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); + if (error) + goto exit_putname; error = -EISDIR; - if (nd.last_type != LAST_NORM) - goto exit1; + if (type != LAST_NORM) + goto exit_path_put; - nd.flags &= ~LOOKUP_PARENT; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (error) - goto exit1; - - mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); - dentry = lookup_hash(&nd); + goto exit_path_put; +retry_deleg: + dentry = start_dirop(path.dentry, &last, lookup_flags); error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - /* Why not before? Because we want correct error value */ - if (nd.last.name[nd.last.len]) - goto slashes; - inode = dentry->d_inode; - if (!inode) - goto slashes; - ihold(inode); - error = security_path_unlink(&nd.path, dentry); - if (error) - goto exit2; - error = vfs_unlink(nd.path.dentry->d_inode, dentry); -exit2: - dput(dentry); + if (IS_ERR(dentry)) + goto exit_drop_write; + + /* Why not before? Because we want correct error value */ + if (unlikely(last.name[last.len])) { + if (d_is_dir(dentry)) + error = -EISDIR; + else + error = -ENOTDIR; + end_dirop(dentry); + goto exit_drop_write; } - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - if (inode) - iput(inode); /* truncate the inode here */ - mnt_drop_write(nd.path.mnt); -exit1: - path_put(&nd.path); - putname(name); + inode = dentry->d_inode; + ihold(inode); + error = security_path_unlink(&path, dentry); + if (error) + goto exit_end_dirop; + error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode, + dentry, &delegated_inode); +exit_end_dirop: + end_dirop(dentry); + iput(inode); /* truncate the inode here */ + if (is_delegated(&delegated_inode)) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } +exit_drop_write: + mnt_drop_write(path.mnt); +exit_path_put: + path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; - inode = NULL; goto retry; } +exit_putname: + putname(name); return error; - -slashes: - error = !dentry->d_inode ? -ENOENT : - S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR; - goto exit2; } SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag) @@ -3535,20 +5465,38 @@ SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag) return -EINVAL; if (flag & AT_REMOVEDIR) - return do_rmdir(dfd, pathname); - - return do_unlinkat(dfd, pathname); + return do_rmdir(dfd, getname(pathname)); + return do_unlinkat(dfd, getname(pathname)); } SYSCALL_DEFINE1(unlink, const char __user *, pathname) { - return do_unlinkat(AT_FDCWD, pathname); + return do_unlinkat(AT_FDCWD, getname(pathname)); } -int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) +/** + * vfs_symlink - create symlink + * @idmap: idmap of the mount the inode was found from + * @dir: inode of the parent directory + * @dentry: dentry of the child symlink file + * @oldname: name of the file to link to + * @delegated_inode: returns victim inode, if the inode is delegated. + * + * Create a symlink. + * + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply pass @nop_mnt_idmap. + */ +int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, const char *oldname, + struct delegated_inode *delegated_inode) { - int error = may_create(dir, dentry); + int error; + error = may_create(idmap, dir, dentry); if (error) return error; @@ -3559,49 +5507,95 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) if (error) return error; - error = dir->i_op->symlink(dir, dentry, oldname); + error = try_break_deleg(dir, delegated_inode); + if (error) + return error; + + error = dir->i_op->symlink(idmap, dir, dentry, oldname); if (!error) fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_symlink); -SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, - int, newdfd, const char __user *, newname) +int do_symlinkat(struct filename *from, int newdfd, struct filename *to) { int error; - struct filename *from; struct dentry *dentry; struct path path; unsigned int lookup_flags = 0; + struct delegated_inode delegated_inode = { }; - from = getname(oldname); - if (IS_ERR(from)) - return PTR_ERR(from); + if (IS_ERR(from)) { + error = PTR_ERR(from); + goto out_putnames; + } retry: - dentry = user_path_create(newdfd, newname, &path, lookup_flags); + dentry = filename_create(newdfd, to, &path, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) - goto out_putname; + goto out_putnames; error = security_path_symlink(&path, dentry, from->name); if (!error) - error = vfs_symlink(path.dentry->d_inode, dentry, from->name); - done_path_create(&path, dentry); + error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, + dentry, from->name, &delegated_inode); + end_creating_path(&path, dentry); + if (is_delegated(&delegated_inode)) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry; + } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } -out_putname: +out_putnames: + putname(to); putname(from); return error; } +SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, + int, newdfd, const char __user *, newname) +{ + return do_symlinkat(getname(oldname), newdfd, getname(newname)); +} + SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname) { - return sys_symlinkat(oldname, AT_FDCWD, newname); + return do_symlinkat(getname(oldname), AT_FDCWD, getname(newname)); } -int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) +/** + * vfs_link - create a new link + * @old_dentry: object to be linked + * @idmap: idmap of the mount + * @dir: new parent + * @new_dentry: where to create the new link + * @delegated_inode: returns inode needing a delegation break + * + * The caller must hold dir->i_rwsem exclusively. + * + * If vfs_link discovers a delegation on the to-be-linked file in need + * of breaking, it will return -EWOULDBLOCK and return a reference to the + * inode in delegated_inode. The caller should then break the delegation + * and retry. Because breaking a delegation may take a long time, the + * caller should drop the i_rwsem before doing so. + * + * Alternatively, a caller may pass NULL for delegated_inode. This may + * be appropriate for callers that expect the underlying filesystem not + * to be NFS exported. + * + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply pass @nop_mnt_idmap. + */ +int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, + struct inode *dir, struct dentry *new_dentry, + struct delegated_inode *delegated_inode) { struct inode *inode = old_dentry->d_inode; unsigned max_links = dir->i_sb->s_max_links; @@ -3610,7 +5604,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de if (!inode) return -ENOENT; - error = may_create(dir, new_dentry); + error = may_create(idmap, dir, new_dentry); if (error) return error; @@ -3622,6 +5616,13 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return -EPERM; + /* + * Updating the link count will likely cause i_uid and i_gid to + * be written back improperly if their true value is unknown to + * the vfs. + */ + if (HAS_UNMAPPED_ID(idmap, inode)) + return -EPERM; if (!dir->i_op->link) return -EPERM; if (S_ISDIR(inode->i_mode)) @@ -3631,25 +5632,31 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de if (error) return error; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Make sure we don't allow creating hardlink to an unlinked file */ - if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) + if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE)) error = -ENOENT; else if (max_links && inode->i_nlink >= max_links) error = -EMLINK; - else - error = dir->i_op->link(old_dentry, dir, new_dentry); + else { + error = try_break_deleg(dir, delegated_inode); + if (!error) + error = try_break_deleg(inode, delegated_inode); + if (!error) + error = dir->i_op->link(old_dentry, dir, new_dentry); + } - if (!error && (inode->i_state & I_LINKABLE)) { + if (!error && (inode_state_read_once(inode) & I_LINKABLE)) { spin_lock(&inode->i_lock); - inode->i_state &= ~I_LINKABLE; + inode_state_clear(inode, I_LINKABLE); spin_unlock(&inode->i_lock); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!error) fsnotify_link(dir, inode, new_dentry); return error; } +EXPORT_SYMBOL(vfs_link); /* * Hardlinks are often used in delicate situations. We avoid @@ -3660,78 +5667,119 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de * with linux 2.0, and to avoid hard-linking to directories * and other special files. --ADM */ -SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, - int, newdfd, const char __user *, newname, int, flags) +int do_linkat(int olddfd, struct filename *old, int newdfd, + struct filename *new, int flags) { + struct mnt_idmap *idmap; struct dentry *new_dentry; struct path old_path, new_path; + struct delegated_inode delegated_inode = { }; int how = 0; int error; - if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) - return -EINVAL; + if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) { + error = -EINVAL; + goto out_putnames; + } /* - * To use null names we require CAP_DAC_READ_SEARCH + * To use null names we require CAP_DAC_READ_SEARCH or + * that the open-time creds of the dfd matches current. * This ensures that not everyone will be able to create - * handlink using the passed filedescriptor. + * a hardlink using the passed file descriptor. */ - if (flags & AT_EMPTY_PATH) { - if (!capable(CAP_DAC_READ_SEARCH)) - return -ENOENT; - how = LOOKUP_EMPTY; - } + if (flags & AT_EMPTY_PATH) + how |= LOOKUP_LINKAT_EMPTY; if (flags & AT_SYMLINK_FOLLOW) how |= LOOKUP_FOLLOW; retry: - error = user_path_at(olddfd, oldname, how, &old_path); + error = filename_lookup(olddfd, old, how, &old_path, NULL); if (error) - return error; + goto out_putnames; - new_dentry = user_path_create(newdfd, newname, &new_path, + new_dentry = filename_create(newdfd, new, &new_path, (how & LOOKUP_REVAL)); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) - goto out; + goto out_putpath; error = -EXDEV; if (old_path.mnt != new_path.mnt) goto out_dput; - error = may_linkat(&old_path); + idmap = mnt_idmap(new_path.mnt); + error = may_linkat(idmap, &old_path); if (unlikely(error)) goto out_dput; error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; - error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry); + error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, + new_dentry, &delegated_inode); out_dput: - done_path_create(&new_path, new_dentry); + end_creating_path(&new_path, new_dentry); + if (is_delegated(&delegated_inode)) { + error = break_deleg_wait(&delegated_inode); + if (!error) { + path_put(&old_path); + goto retry; + } + } if (retry_estale(error, how)) { + path_put(&old_path); how |= LOOKUP_REVAL; goto retry; } -out: +out_putpath: path_put(&old_path); +out_putnames: + putname(old); + putname(new); return error; } +SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, + int, newdfd, const char __user *, newname, int, flags) +{ + return do_linkat(olddfd, getname_uflags(oldname, flags), + newdfd, getname(newname), flags); +} + SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname) { - return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); + return do_linkat(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0); } -/* +/** + * vfs_rename - rename a filesystem object + * @rd: pointer to &struct renamedata info + * + * The caller must hold multiple mutexes--see lock_rename()). + * + * If vfs_rename discovers a delegation in need of breaking at either + * the source or destination, it will return -EWOULDBLOCK and return a + * reference to the inode in delegated_inode. The caller should then + * break the delegation and retry. Because breaking a delegation may + * take a long time, the caller should drop all locks before doing + * so. + * + * Alternatively, a caller may pass NULL for delegated_inode. This may + * be appropriate for callers that expect the underlying filesystem not + * to be NFS exported. + * * The worst of all namespace operations - renaming directory. "Perverted" * doesn't even start to describe it. Somebody in UCB had a heck of a trip... * Problems: - * a) we can get into loop creation. Check is done in is_subdir(). + * + * a) we can get into loop creation. * b) race potential - two innocent renames can create a loop together. - * That's where 4.4 screws up. Current fix: serialization on + * That's where 4.4BSD screws up. Current fix: serialization on * sb->s_vfs_rename_mutex. We might be more accurate, but that's another * story. - * c) we have to lock _three_ objects - parents and victim (if it exists). - * And that - after we got ->i_mutex on parents (until then we don't know + * c) we may have to lock up to _four_ objects - parents and victim (if it exists), + * and source (if it's a non-directory or a subdirectory that moves to + * different parent). + * And that - after we got ->i_rwsem on parents (until then we don't know * whether the target exists). Solution: try to be smart with locking * order for inodes. We rely on the fact that tree topology may change * only under ->s_vfs_rename_mutex _and_ that parent of the object we @@ -3743,362 +5791,499 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * has no more than 1 dentry. If "hybrid" objects will ever appear, * we'd better make sure that there's no link(2) for them. * d) conversion from fhandle to dentry may come in the wrong moment - when - * we are removing the target. Solution: we will have to grab ->i_mutex + * we are removing the target. Solution: we will have to grab ->i_rwsem * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on - * ->i_mutex on parents, which works but leads to some truly excessive + * ->i_rwsem on parents, which works but leads to some truly excessive * locking]. */ -static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) +int vfs_rename(struct renamedata *rd) { - int error = 0; + int error; + struct inode *old_dir = d_inode(rd->old_parent); + struct inode *new_dir = d_inode(rd->new_parent); + struct dentry *old_dentry = rd->old_dentry; + struct dentry *new_dentry = rd->new_dentry; + struct delegated_inode *delegated_inode = rd->delegated_inode; + unsigned int flags = rd->flags; + bool is_dir = d_is_dir(old_dentry); + struct inode *source = old_dentry->d_inode; struct inode *target = new_dentry->d_inode; + bool new_is_dir = false; unsigned max_links = new_dir->i_sb->s_max_links; + struct name_snapshot old_name; + bool lock_old_subdir, lock_new_subdir; + + if (source == target) + return 0; + + error = may_delete(rd->mnt_idmap, old_dir, old_dentry, is_dir); + if (error) + return error; + + if (!target) { + error = may_create(rd->mnt_idmap, new_dir, new_dentry); + } else { + new_is_dir = d_is_dir(new_dentry); + + if (!(flags & RENAME_EXCHANGE)) + error = may_delete(rd->mnt_idmap, new_dir, + new_dentry, is_dir); + else + error = may_delete(rd->mnt_idmap, new_dir, + new_dentry, new_is_dir); + } + if (error) + return error; + + if (!old_dir->i_op->rename) + return -EPERM; /* * If we are going to change the parent - check write permissions, * we'll need to flip '..'. */ if (new_dir != old_dir) { - error = inode_permission(old_dentry->d_inode, MAY_WRITE); - if (error) - return error; + if (is_dir) { + error = inode_permission(rd->mnt_idmap, source, + MAY_WRITE); + if (error) + return error; + } + if ((flags & RENAME_EXCHANGE) && new_is_dir) { + error = inode_permission(rd->mnt_idmap, target, + MAY_WRITE); + if (error) + return error; + } } - error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); + error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry, + flags); if (error) return error; + take_dentry_name_snapshot(&old_name, old_dentry); dget(new_dentry); - if (target) - mutex_lock(&target->i_mutex); + /* + * Lock children. + * The source subdirectory needs to be locked on cross-directory + * rename or cross-directory exchange since its parent changes. + * The target subdirectory needs to be locked on cross-directory + * exchange due to parent change and on any rename due to becoming + * a victim. + * Non-directories need locking in all cases (for NFS reasons); + * they get locked after any subdirectories (in inode address order). + * + * NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE. + * NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex. + */ + lock_old_subdir = new_dir != old_dir; + lock_new_subdir = new_dir != old_dir || !(flags & RENAME_EXCHANGE); + if (is_dir) { + if (lock_old_subdir) + inode_lock_nested(source, I_MUTEX_CHILD); + if (target && (!new_is_dir || lock_new_subdir)) + inode_lock(target); + } else if (new_is_dir) { + if (lock_new_subdir) + inode_lock_nested(target, I_MUTEX_CHILD); + inode_lock(source); + } else { + lock_two_nondirectories(source, target); + } - error = -EBUSY; - if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) + error = -EPERM; + if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target))) goto out; - error = -EMLINK; - if (max_links && !target && new_dir != old_dir && - new_dir->i_nlink >= max_links) - goto out; - - if (target) - shrink_dcache_parent(new_dentry); - error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); - if (error) + error = -EBUSY; + if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry)) goto out; - if (target) { - target->i_flags |= S_DEAD; - dont_mount(new_dentry); + if (max_links && new_dir != old_dir) { + error = -EMLINK; + if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links) + goto out; + if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir && + old_dir->i_nlink >= max_links) + goto out; } -out: - if (target) - mutex_unlock(&target->i_mutex); - dput(new_dentry); - if (!error) - if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) - d_move(old_dentry,new_dentry); - return error; -} - -static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - struct inode *target = new_dentry->d_inode; - int error; - - error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); + error = try_break_deleg(old_dir, delegated_inode); if (error) - return error; - - dget(new_dentry); - if (target) - mutex_lock(&target->i_mutex); - - error = -EBUSY; - if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) goto out; - - error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); + if (new_dir != old_dir) { + error = try_break_deleg(new_dir, delegated_inode); + if (error) + goto out; + } + if (!is_dir) { + error = try_break_deleg(source, delegated_inode); + if (error) + goto out; + } + if (target && !new_is_dir) { + error = try_break_deleg(target, delegated_inode); + if (error) + goto out; + } + error = old_dir->i_op->rename(rd->mnt_idmap, old_dir, old_dentry, + new_dir, new_dentry, flags); if (error) goto out; - if (target) + if (!(flags & RENAME_EXCHANGE) && target) { + if (is_dir) { + shrink_dcache_parent(new_dentry); + target->i_flags |= S_DEAD; + } dont_mount(new_dentry); - if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) - d_move(old_dentry, new_dentry); + detach_mounts(new_dentry); + } + if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) { + if (!(flags & RENAME_EXCHANGE)) + d_move(old_dentry, new_dentry); + else + d_exchange(old_dentry, new_dentry); + } out: - if (target) - mutex_unlock(&target->i_mutex); + if (!is_dir || lock_old_subdir) + inode_unlock(source); + if (target && (!new_is_dir || lock_new_subdir)) + inode_unlock(target); dput(new_dentry); - return error; -} - -int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - int error; - int is_dir = S_ISDIR(old_dentry->d_inode->i_mode); - const unsigned char *old_name; - - if (old_dentry->d_inode == new_dentry->d_inode) - return 0; - - error = may_delete(old_dir, old_dentry, is_dir); - if (error) - return error; - - if (!new_dentry->d_inode) - error = may_create(new_dir, new_dentry); - else - error = may_delete(new_dir, new_dentry, is_dir); - if (error) - return error; - - if (!old_dir->i_op->rename) - return -EPERM; - - old_name = fsnotify_oldname_init(old_dentry->d_name.name); - - if (is_dir) - error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry); - else - error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry); - if (!error) - fsnotify_move(old_dir, new_dir, old_name, is_dir, - new_dentry->d_inode, old_dentry); - fsnotify_oldname_free(old_name); + if (!error) { + fsnotify_move(old_dir, new_dir, &old_name.name, is_dir, + !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry); + if (flags & RENAME_EXCHANGE) { + fsnotify_move(new_dir, old_dir, &old_dentry->d_name, + new_is_dir, NULL, new_dentry); + } + } + release_dentry_name_snapshot(&old_name); return error; } +EXPORT_SYMBOL(vfs_rename); -SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, - int, newdfd, const char __user *, newname) +int do_renameat2(int olddfd, struct filename *from, int newdfd, + struct filename *to, unsigned int flags) { - struct dentry *old_dir, *new_dir; - struct dentry *old_dentry, *new_dentry; - struct dentry *trap; - struct nameidata oldnd, newnd; - struct filename *from; - struct filename *to; + struct renamedata rd; + struct path old_path, new_path; + struct qstr old_last, new_last; + int old_type, new_type; + struct delegated_inode delegated_inode = { }; unsigned int lookup_flags = 0; bool should_retry = false; - int error; + int error = -EINVAL; + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) + goto put_names; + + if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && + (flags & RENAME_EXCHANGE)) + goto put_names; + retry: - from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags); - if (IS_ERR(from)) { - error = PTR_ERR(from); - goto exit; - } + error = filename_parentat(olddfd, from, lookup_flags, &old_path, + &old_last, &old_type); + if (error) + goto put_names; - to = user_path_parent(newdfd, newname, &newnd, lookup_flags); - if (IS_ERR(to)) { - error = PTR_ERR(to); + error = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last, + &new_type); + if (error) goto exit1; - } error = -EXDEV; - if (oldnd.path.mnt != newnd.path.mnt) + if (old_path.mnt != new_path.mnt) goto exit2; - old_dir = oldnd.path.dentry; error = -EBUSY; - if (oldnd.last_type != LAST_NORM) + if (old_type != LAST_NORM) goto exit2; - new_dir = newnd.path.dentry; - if (newnd.last_type != LAST_NORM) + if (flags & RENAME_NOREPLACE) + error = -EEXIST; + if (new_type != LAST_NORM) goto exit2; - error = mnt_want_write(oldnd.path.mnt); + error = mnt_want_write(old_path.mnt); if (error) goto exit2; - oldnd.flags &= ~LOOKUP_PARENT; - newnd.flags &= ~LOOKUP_PARENT; - newnd.flags |= LOOKUP_RENAME_TARGET; +retry_deleg: + rd.old_parent = old_path.dentry; + rd.mnt_idmap = mnt_idmap(old_path.mnt); + rd.new_parent = new_path.dentry; + rd.delegated_inode = &delegated_inode; + rd.flags = flags; - trap = lock_rename(new_dir, old_dir); + error = __start_renaming(&rd, lookup_flags, &old_last, &new_last); + if (error) + goto exit_lock_rename; - old_dentry = lookup_hash(&oldnd); - error = PTR_ERR(old_dentry); - if (IS_ERR(old_dentry)) - goto exit3; - /* source must exist */ - error = -ENOENT; - if (!old_dentry->d_inode) - goto exit4; + if (flags & RENAME_EXCHANGE) { + if (!d_is_dir(rd.new_dentry)) { + error = -ENOTDIR; + if (new_last.name[new_last.len]) + goto exit_unlock; + } + } /* unless the source is a directory trailing slashes give -ENOTDIR */ - if (!S_ISDIR(old_dentry->d_inode->i_mode)) { + if (!d_is_dir(rd.old_dentry)) { error = -ENOTDIR; - if (oldnd.last.name[oldnd.last.len]) - goto exit4; - if (newnd.last.name[newnd.last.len]) - goto exit4; + if (old_last.name[old_last.len]) + goto exit_unlock; + if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len]) + goto exit_unlock; } - /* source should not be ancestor of target */ - error = -EINVAL; - if (old_dentry == trap) - goto exit4; - new_dentry = lookup_hash(&newnd); - error = PTR_ERR(new_dentry); - if (IS_ERR(new_dentry)) - goto exit4; - /* target should not be an ancestor of source */ - error = -ENOTEMPTY; - if (new_dentry == trap) - goto exit5; - error = security_path_rename(&oldnd.path, old_dentry, - &newnd.path, new_dentry); + error = security_path_rename(&old_path, rd.old_dentry, + &new_path, rd.new_dentry, flags); if (error) - goto exit5; - error = vfs_rename(old_dir->d_inode, old_dentry, - new_dir->d_inode, new_dentry); -exit5: - dput(new_dentry); -exit4: - dput(old_dentry); -exit3: - unlock_rename(new_dir, old_dir); - mnt_drop_write(oldnd.path.mnt); + goto exit_unlock; + + error = vfs_rename(&rd); +exit_unlock: + end_renaming(&rd); +exit_lock_rename: + if (is_delegated(&delegated_inode)) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } + mnt_drop_write(old_path.mnt); exit2: if (retry_estale(error, lookup_flags)) should_retry = true; - path_put(&newnd.path); - putname(to); + path_put(&new_path); exit1: - path_put(&oldnd.path); - putname(from); + path_put(&old_path); if (should_retry) { should_retry = false; lookup_flags |= LOOKUP_REVAL; goto retry; } -exit: +put_names: + putname(from); + putname(to); return error; } -SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) +SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, + int, newdfd, const char __user *, newname, unsigned int, flags) { - return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname); + return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), + flags); } -int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link) +SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, + int, newdfd, const char __user *, newname) { - int len; + return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), + 0); +} - len = PTR_ERR(link); - if (IS_ERR(link)) - goto out; +SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) +{ + return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD, + getname(newname), 0); +} - len = strlen(link); - if (len > (unsigned) buflen) - len = buflen; - if (copy_to_user(buffer, link, len)) - len = -EFAULT; -out: - return len; +int readlink_copy(char __user *buffer, int buflen, const char *link, int linklen) +{ + int copylen; + + copylen = linklen; + if (unlikely(copylen > (unsigned) buflen)) + copylen = buflen; + if (copy_to_user(buffer, link, copylen)) + copylen = -EFAULT; + return copylen; } -/* - * A helper for ->readlink(). This should be used *ONLY* for symlinks that - * have ->follow_link() touching nd only in nd_set_link(). Using (or not - * using) it for any given inode is up to filesystem. +/** + * vfs_readlink - copy symlink body into userspace buffer + * @dentry: dentry on which to get symbolic link + * @buffer: user memory pointer + * @buflen: size of buffer + * + * Does not touch atime. That's up to the caller if necessary + * + * Does not call security hook. */ -int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) +int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) { - struct nameidata nd; - void *cookie; + struct inode *inode = d_inode(dentry); + DEFINE_DELAYED_CALL(done); + const char *link; int res; - nd.depth = 0; - cookie = dentry->d_inode->i_op->follow_link(dentry, &nd); - if (IS_ERR(cookie)) - return PTR_ERR(cookie); + if (inode->i_opflags & IOP_CACHED_LINK) + return readlink_copy(buffer, buflen, inode->i_link, inode->i_linklen); + + if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) { + if (unlikely(inode->i_op->readlink)) + return inode->i_op->readlink(dentry, buffer, buflen); - res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd)); - if (dentry->d_inode->i_op->put_link) - dentry->d_inode->i_op->put_link(dentry, &nd, cookie); + if (!d_is_symlink(dentry)) + return -EINVAL; + + spin_lock(&inode->i_lock); + inode->i_opflags |= IOP_DEFAULT_READLINK; + spin_unlock(&inode->i_lock); + } + + link = READ_ONCE(inode->i_link); + if (!link) { + link = inode->i_op->get_link(dentry, inode, &done); + if (IS_ERR(link)) + return PTR_ERR(link); + } + res = readlink_copy(buffer, buflen, link, strlen(link)); + do_delayed_call(&done); return res; } +EXPORT_SYMBOL(vfs_readlink); -int vfs_follow_link(struct nameidata *nd, const char *link) +/** + * vfs_get_link - get symlink body + * @dentry: dentry on which to get symbolic link + * @done: caller needs to free returned data with this + * + * Calls security hook and i_op->get_link() on the supplied inode. + * + * It does not touch atime. That's up to the caller if necessary. + * + * Does not work on "special" symlinks like /proc/$$/fd/N + */ +const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done) { - return __vfs_follow_link(nd, link); -} + const char *res = ERR_PTR(-EINVAL); + struct inode *inode = d_inode(dentry); -/* get the link contents into pagecache */ -static char *page_getlink(struct dentry * dentry, struct page **ppage) -{ - char *kaddr; - struct page *page; - struct address_space *mapping = dentry->d_inode->i_mapping; - page = read_mapping_page(mapping, 0, NULL); - if (IS_ERR(page)) - return (char*)page; - *ppage = page; - kaddr = kmap(page); - nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1); - return kaddr; + if (d_is_symlink(dentry)) { + res = ERR_PTR(security_inode_readlink(dentry)); + if (!res) + res = inode->i_op->get_link(dentry, inode, done); + } + return res; } +EXPORT_SYMBOL(vfs_get_link); -int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) +/* get the link contents into pagecache */ +static char *__page_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) { - struct page *page = NULL; - char *s = page_getlink(dentry, &page); - int res = vfs_readlink(dentry,buffer,buflen,s); - if (page) { - kunmap(page); - page_cache_release(page); + struct folio *folio; + struct address_space *mapping = inode->i_mapping; + + if (!dentry) { + folio = filemap_get_folio(mapping, 0); + if (IS_ERR(folio)) + return ERR_PTR(-ECHILD); + if (!folio_test_uptodate(folio)) { + folio_put(folio); + return ERR_PTR(-ECHILD); + } + } else { + folio = read_mapping_folio(mapping, 0, NULL); + if (IS_ERR(folio)) + return ERR_CAST(folio); } - return res; + set_delayed_call(callback, page_put_link, folio); + BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM); + return folio_address(folio); } -void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) +const char *page_get_link_raw(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) { - struct page *page = NULL; - nd_set_link(nd, page_getlink(dentry, &page)); - return page; + return __page_get_link(dentry, inode, callback); } +EXPORT_SYMBOL_GPL(page_get_link_raw); -void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +/** + * page_get_link() - An implementation of the get_link inode_operation. + * @dentry: The directory entry which is the symlink. + * @inode: The inode for the symlink. + * @callback: Used to drop the reference to the symlink. + * + * Filesystems which store their symlinks in the page cache should use + * this to implement the get_link() member of their inode_operations. + * + * Return: A pointer to the NUL-terminated symlink. + */ +const char *page_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) { - struct page *page = cookie; + char *kaddr = __page_get_link(dentry, inode, callback); - if (page) { - kunmap(page); - page_cache_release(page); - } + if (!IS_ERR(kaddr)) + nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); + return kaddr; } +EXPORT_SYMBOL(page_get_link); -/* - * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS +/** + * page_put_link() - Drop the reference to the symlink. + * @arg: The folio which contains the symlink. + * + * This is used internally by page_get_link(). It is exported for use + * by filesystems which need to implement a variant of page_get_link() + * themselves. Despite the apparent symmetry, filesystems which use + * page_get_link() do not need to call page_put_link(). + * + * The argument, while it has a void pointer type, must be a pointer to + * the folio which was retrieved from the page cache. The delayed_call + * infrastructure is used to drop the reference count once the caller + * is done with the symlink. */ -int __page_symlink(struct inode *inode, const char *symname, int len, int nofs) +void page_put_link(void *arg) +{ + folio_put(arg); +} +EXPORT_SYMBOL(page_put_link); + +int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) +{ + const char *link; + int res; + + DEFINE_DELAYED_CALL(done); + link = page_get_link(dentry, d_inode(dentry), &done); + res = PTR_ERR(link); + if (!IS_ERR(link)) + res = readlink_copy(buffer, buflen, link, strlen(link)); + do_delayed_call(&done); + return res; +} +EXPORT_SYMBOL(page_readlink); + +int page_symlink(struct inode *inode, const char *symname, int len) { struct address_space *mapping = inode->i_mapping; - struct page *page; - void *fsdata; + const struct address_space_operations *aops = mapping->a_ops; + bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS); + struct folio *folio; + void *fsdata = NULL; int err; - char *kaddr; - unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE; - if (nofs) - flags |= AOP_FLAG_NOFS; + unsigned int flags; retry: - err = pagecache_write_begin(NULL, mapping, 0, len-1, - flags, &page, &fsdata); + if (nofs) + flags = memalloc_nofs_save(); + err = aops->write_begin(NULL, mapping, 0, len-1, &folio, &fsdata); + if (nofs) + memalloc_nofs_restore(flags); if (err) goto fail; - kaddr = kmap_atomic(page); - memcpy(kaddr, symname, len-1); - kunmap_atomic(kaddr); + memcpy(folio_address(folio), symname, len - 1); - err = pagecache_write_end(NULL, mapping, 0, len-1, len-1, - page, fsdata); + err = aops->write_end(NULL, mapping, 0, len - 1, len - 1, + folio, fsdata); if (err < 0) goto fail; if (err < len-1) @@ -4109,46 +6294,9 @@ retry: fail: return err; } - -int page_symlink(struct inode *inode, const char *symname, int len) -{ - return __page_symlink(inode, symname, len, - !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS)); -} +EXPORT_SYMBOL(page_symlink); const struct inode_operations page_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, }; - -EXPORT_SYMBOL(user_path_at); -EXPORT_SYMBOL(follow_down_one); -EXPORT_SYMBOL(follow_down); -EXPORT_SYMBOL(follow_up); -EXPORT_SYMBOL(get_write_access); /* nfsd */ -EXPORT_SYMBOL(lock_rename); -EXPORT_SYMBOL(lookup_one_len); -EXPORT_SYMBOL(page_follow_link_light); -EXPORT_SYMBOL(page_put_link); -EXPORT_SYMBOL(page_readlink); -EXPORT_SYMBOL(__page_symlink); -EXPORT_SYMBOL(page_symlink); EXPORT_SYMBOL(page_symlink_inode_operations); -EXPORT_SYMBOL(kern_path); -EXPORT_SYMBOL(vfs_path_lookup); -EXPORT_SYMBOL(inode_permission); -EXPORT_SYMBOL(unlock_rename); -EXPORT_SYMBOL(vfs_create); -EXPORT_SYMBOL(vfs_follow_link); -EXPORT_SYMBOL(vfs_link); -EXPORT_SYMBOL(vfs_mkdir); -EXPORT_SYMBOL(vfs_mknod); -EXPORT_SYMBOL(generic_permission); -EXPORT_SYMBOL(vfs_readlink); -EXPORT_SYMBOL(vfs_rename); -EXPORT_SYMBOL(vfs_rmdir); -EXPORT_SYMBOL(vfs_symlink); -EXPORT_SYMBOL(vfs_unlink); -EXPORT_SYMBOL(dentry_unhash); -EXPORT_SYMBOL(generic_readlink); |
