diff options
Diffstat (limited to 'fs/open.c')
| -rw-r--r-- | fs/open.c | 1254 |
1 files changed, 909 insertions, 345 deletions
diff --git a/fs/open.c b/fs/open.c index 9156cb050d08..f328622061c5 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/open.c * @@ -19,7 +20,7 @@ #include <linux/mount.h> #include <linux/fcntl.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/fs.h> #include <linux/personality.h> #include <linux/pagemap.h> @@ -28,14 +29,15 @@ #include <linux/audit.h> #include <linux/falloc.h> #include <linux/fs_struct.h> -#include <linux/ima.h> #include <linux/dnotify.h> #include <linux/compat.h> +#include <linux/mnt_idmapping.h> +#include <linux/filelock.h> #include "internal.h" -int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, - struct file *filp) +int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry, + loff_t length, unsigned int time_attrs, struct file *filp) { int ret; struct iattr newattrs; @@ -51,21 +53,28 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, newattrs.ia_valid |= ATTR_FILE; } - /* Remove suid/sgid on truncate too */ - ret = should_remove_suid(dentry); + /* Remove suid, sgid, and file capabilities on truncate too */ + ret = dentry_needs_remove_privs(idmap, dentry); + if (ret < 0) + return ret; if (ret) newattrs.ia_valid |= ret | ATTR_FORCE; - mutex_lock(&dentry->d_inode->i_mutex); - ret = notify_change(dentry, &newattrs); - mutex_unlock(&dentry->d_inode->i_mutex); + ret = inode_lock_killable(dentry->d_inode); + if (ret) + return ret; + + /* Note any delegations or leases have already been broken: */ + ret = notify_change(idmap, dentry, &newattrs, NULL); + inode_unlock(dentry->d_inode); return ret; } -long vfs_truncate(struct path *path, loff_t length) +int vfs_truncate(const struct path *path, loff_t length) { + struct mnt_idmap *idmap; struct inode *inode; - long error; + int error; inode = path->dentry->d_inode; @@ -75,13 +84,18 @@ long vfs_truncate(struct path *path, loff_t length) if (!S_ISREG(inode->i_mode)) return -EINVAL; - error = mnt_want_write(path->mnt); + idmap = mnt_idmap(path->mnt); + error = inode_permission(idmap, inode, MAY_WRITE); if (error) - goto out; + return error; - error = inode_permission(inode, MAY_WRITE); + error = fsnotify_truncate_perm(path, length); if (error) - goto mnt_drop_write_and_out; + return error; + + error = mnt_want_write(path->mnt); + if (error) + return error; error = -EPERM; if (IS_APPEND(inode)) @@ -99,22 +113,20 @@ long vfs_truncate(struct path *path, loff_t length) if (error) goto put_write_and_out; - error = locks_verify_truncate(inode, NULL, length); + error = security_path_truncate(path); if (!error) - error = security_path_truncate(path); - if (!error) - error = do_truncate(path->dentry, length, 0, NULL); + error = do_truncate(idmap, path->dentry, length, 0, NULL); put_write_and_out: put_write_access(inode); mnt_drop_write_and_out: mnt_drop_write(path->mnt); -out: + return error; } EXPORT_SYMBOL_GPL(vfs_truncate); -static long do_sys_truncate(const char __user *pathname, loff_t length) +int do_sys_truncate(const char __user *pathname, loff_t length) { unsigned int lookup_flags = LOOKUP_FOLLOW; struct path path; @@ -148,60 +160,60 @@ COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length } #endif -static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) +int do_ftruncate(struct file *file, loff_t length, int small) { struct inode *inode; struct dentry *dentry; - struct fd f; int error; - error = -EINVAL; - if (length < 0) - goto out; - error = -EBADF; - f = fdget(fd); - if (!f.file) - goto out; - /* explicitly opened as large or we are on 64-bit box */ - if (f.file->f_flags & O_LARGEFILE) + if (file->f_flags & O_LARGEFILE) small = 0; - dentry = f.file->f_path.dentry; + dentry = file->f_path.dentry; inode = dentry->d_inode; - error = -EINVAL; - if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE)) - goto out_putf; + if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE)) + return -EINVAL; - error = -EINVAL; /* Cannot ftruncate over 2^31 bytes without large file support */ if (small && length > MAX_NON_LFS) - goto out_putf; + return -EINVAL; - error = -EPERM; - if (IS_APPEND(inode)) - goto out_putf; + /* Check IS_APPEND on real upper inode */ + if (IS_APPEND(file_inode(file))) + return -EPERM; - sb_start_write(inode->i_sb); - error = locks_verify_truncate(inode, f.file, length); - if (!error) - error = security_path_truncate(&f.file->f_path); - if (!error) - error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file); - sb_end_write(inode->i_sb); -out_putf: - fdput(f); -out: - return error; + error = security_file_truncate(file); + if (error) + return error; + + error = fsnotify_truncate_perm(&file->f_path, length); + if (error) + return error; + + scoped_guard(super_write, inode->i_sb) + return do_truncate(file_mnt_idmap(file), dentry, length, + ATTR_MTIME | ATTR_CTIME, file); } -SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length) +int do_sys_ftruncate(unsigned int fd, loff_t length, int small) +{ + if (length < 0) + return -EINVAL; + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EBADF; + + return do_ftruncate(fd_file(f), length, small); +} + +SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length) { return do_sys_ftruncate(fd, length, 1); } #ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length) +COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_off_t, length) { return do_sys_ftruncate(fd, length, 1); } @@ -220,35 +232,79 @@ SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length) } #endif /* BITS_PER_LONG == 32 */ +#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_TRUNCATE64) +COMPAT_SYSCALL_DEFINE3(truncate64, const char __user *, pathname, + compat_arg_u64_dual(length)) +{ + return ksys_truncate(pathname, compat_arg_u64_glue(length)); +} +#endif -int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) +#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FTRUNCATE64) +COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd, + compat_arg_u64_dual(length)) +{ + return ksys_ftruncate(fd, compat_arg_u64_glue(length)); +} +#endif + +int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - long ret; + int ret; + loff_t sum; if (offset < 0 || len <= 0) return -EINVAL; - /* Return error if mode is not supported */ - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (mode & ~(FALLOC_FL_MODE_MASK | FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; - /* Punch hole must have keep size set */ - if ((mode & FALLOC_FL_PUNCH_HOLE) && - !(mode & FALLOC_FL_KEEP_SIZE)) + /* + * Modes are exclusive, even if that is not obvious from the encoding + * as bit masks and the mix with the flag in the same namespace. + * + * To make things even more complicated, FALLOC_FL_ALLOCATE_RANGE is + * encoded as no bit set. + */ + switch (mode & FALLOC_FL_MODE_MASK) { + case FALLOC_FL_ALLOCATE_RANGE: + case FALLOC_FL_UNSHARE_RANGE: + case FALLOC_FL_ZERO_RANGE: + break; + case FALLOC_FL_PUNCH_HOLE: + if (!(mode & FALLOC_FL_KEEP_SIZE)) + return -EOPNOTSUPP; + break; + case FALLOC_FL_COLLAPSE_RANGE: + case FALLOC_FL_INSERT_RANGE: + case FALLOC_FL_WRITE_ZEROES: + if (mode & FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + break; + default: return -EOPNOTSUPP; + } if (!(file->f_mode & FMODE_WRITE)) return -EBADF; - /* It's not possible punch hole on append only file */ - if (mode & FALLOC_FL_PUNCH_HOLE && IS_APPEND(inode)) + /* + * On append-only files only space preallocation is supported. + */ + if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode)) return -EPERM; if (IS_IMMUTABLE(inode)) return -EPERM; /* + * We cannot allow any fallocate operation on an active swapfile + */ + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + + /* * Revalidate the write permissions, in case security policy has * changed since the files were opened. */ @@ -256,61 +312,119 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (ret) return ret; + ret = fsnotify_file_area_perm(file, MAY_WRITE, &offset, len); + if (ret) + return ret; + if (S_ISFIFO(inode->i_mode)) return -ESPIPE; - /* - * Let individual file system decide if it supports preallocation - * for directories or not. - */ - if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + if (S_ISDIR(inode->i_mode)) + return -EISDIR; + + if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) return -ENODEV; - /* Check for wrap through zero too */ - if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) + /* Check for wraparound */ + if (check_add_overflow(offset, len, &sum)) + return -EFBIG; + + if (sum > inode->i_sb->s_maxbytes) return -EFBIG; if (!file->f_op->fallocate) return -EOPNOTSUPP; - sb_start_write(inode->i_sb); + file_start_write(file); ret = file->f_op->fallocate(file, mode, offset, len); - sb_end_write(inode->i_sb); + + /* + * Create inotify and fanotify events. + * + * To keep the logic simple always create events if fallocate succeeds. + * This implies that events are even created if the file size remains + * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE. + */ + if (ret == 0) + fsnotify_modify(file); + + file_end_write(file); return ret; } +EXPORT_SYMBOL_GPL(vfs_fallocate); + +int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len) +{ + CLASS(fd, f)(fd); + + if (fd_empty(f)) + return -EBADF; + + return vfs_fallocate(fd_file(f), mode, offset, len); +} SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) { - struct fd f = fdget(fd); - int error = -EBADF; + return ksys_fallocate(fd, mode, offset, len); +} - if (f.file) { - error = do_fallocate(f.file, mode, offset, len); - fdput(f); - } - return error; +#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FALLOCATE) +COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, compat_arg_u64_dual(offset), + compat_arg_u64_dual(len)) +{ + return ksys_fallocate(fd, mode, compat_arg_u64_glue(offset), + compat_arg_u64_glue(len)); } +#endif /* * access() needs to use the real uid/gid, not the effective uid/gid. * We do this by temporarily clearing all FS-related capabilities and * switching the fsuid/fsgid around to the real ones. + * + * Creating new credentials is expensive, so we try to skip doing it, + * which we can if the result would match what we already got. */ -SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) +static bool access_need_override_creds(int flags) { - const struct cred *old_cred; - struct cred *override_cred; - struct path path; - struct inode *inode; - int res; - unsigned int lookup_flags = LOOKUP_FOLLOW; + const struct cred *cred; - if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ - return -EINVAL; + if (flags & AT_EACCESS) + return false; + + cred = current_cred(); + if (!uid_eq(cred->fsuid, cred->uid) || + !gid_eq(cred->fsgid, cred->gid)) + return true; + + if (!issecure(SECURE_NO_SETUID_FIXUP)) { + kuid_t root_uid = make_kuid(cred->user_ns, 0); + if (!uid_eq(cred->uid, root_uid)) { + if (!cap_isclear(cred->cap_effective)) + return true; + } else { + if (!cap_isidentical(cred->cap_effective, + cred->cap_permitted)) + return true; + } + } + + return false; +} + +static const struct cred *access_override_creds(void) +{ + struct cred *override_cred; override_cred = prepare_creds(); if (!override_cred) - return -ENOMEM; + return NULL; + + /* + * XXX access_need_override_creds performs checks in hopes of skipping + * this work. Make sure it stays in sync if making any changes in this + * routine. + */ override_cred->fsuid = override_cred->uid; override_cred->fsgid = override_cred->gid; @@ -325,13 +439,59 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) override_cred->cap_permitted; } - old_cred = override_creds(override_cred); + /* + * The new set of credentials can *only* be used in + * task-synchronous circumstances, and does not need + * RCU freeing, unless somebody then takes a separate + * reference to it. + * + * NOTE! This is _only_ true because this credential + * is used purely for override_creds() that installs + * it as the subjective cred. Other threads will be + * accessing ->real_cred, not the subjective cred. + * + * If somebody _does_ make a copy of this (using the + * 'get_current_cred()' function), that will clear the + * non_rcu field, because now that other user may be + * expecting RCU freeing. But normal thread-synchronous + * cred accesses will keep things non-racy to avoid RCU + * freeing. + */ + override_cred->non_rcu = 1; + return override_creds(override_cred); +} + +static int do_faccessat(int dfd, const char __user *filename, int mode, int flags) +{ + struct path path; + struct inode *inode; + int res; + unsigned int lookup_flags = LOOKUP_FOLLOW; + const struct cred *old_cred = NULL; + + if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ + return -EINVAL; + + if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) + return -EINVAL; + + if (flags & AT_SYMLINK_NOFOLLOW) + lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + + if (access_need_override_creds(flags)) { + old_cred = access_override_creds(); + if (!old_cred) + return -ENOMEM; + } + retry: res = user_path_at(dfd, filename, lookup_flags, &path); if (res) goto out; - inode = path.dentry->d_inode; + inode = d_backing_inode(path.dentry); if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) { /* @@ -339,11 +499,11 @@ retry: * with the "noexec" flag. */ res = -EACCES; - if (path.mnt->mnt_flags & MNT_NOEXEC) + if (path_noexec(&path)) goto out_path_release; } - res = inode_permission(inode, mode | MAY_ACCESS); + res = inode_permission(mnt_idmap(path.mnt), inode, mode | MAY_ACCESS); /* SuS v2 requires we report a read only fs too */ if (res || !(mode & S_IWOTH) || special_file(inode->i_mode)) goto out_path_release; @@ -367,14 +527,26 @@ out_path_release: goto retry; } out: - revert_creds(old_cred); - put_cred(override_cred); + if (old_cred) + put_cred(revert_creds(old_cred)); + return res; } +SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) +{ + return do_faccessat(dfd, filename, mode, 0); +} + +SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename, int, mode, + int, flags) +{ + return do_faccessat(dfd, filename, mode, flags); +} + SYSCALL_DEFINE2(access, const char __user *, filename, int, mode) { - return sys_faccessat(AT_FDCWD, filename, mode); + return do_faccessat(AT_FDCWD, filename, mode, 0); } SYSCALL_DEFINE1(chdir, const char __user *, filename) @@ -387,7 +559,7 @@ retry: if (error) goto out; - error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); + error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; @@ -405,26 +577,18 @@ out: SYSCALL_DEFINE1(fchdir, unsigned int, fd) { - struct fd f = fdget_raw(fd); - struct inode *inode; - int error = -EBADF; - - error = -EBADF; - if (!f.file) - goto out; + CLASS(fd_raw, f)(fd); + int error; - inode = file_inode(f.file); + if (fd_empty(f)) + return -EBADF; - error = -ENOTDIR; - if (!S_ISDIR(inode->i_mode)) - goto out_putf; + if (!d_can_lookup(fd_file(f)->f_path.dentry)) + return -ENOTDIR; - error = inode_permission(inode, MAY_EXEC | MAY_CHDIR); + error = file_permission(fd_file(f), MAY_EXEC | MAY_CHDIR); if (!error) - set_fs_pwd(current->fs, &f.file->f_path); -out_putf: - fdput(f); -out: + set_fs_pwd(current->fs, &fd_file(f)->f_path); return error; } @@ -438,12 +602,12 @@ retry: if (error) goto out; - error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); + error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; error = -EPERM; - if (!nsown_capable(CAP_SYS_CHROOT)) + if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT)) goto dput_and_out; error = security_path_chroot(&path); if (error) @@ -461,47 +625,69 @@ out: return error; } -static int chmod_common(struct path *path, umode_t mode) +int chmod_common(const struct path *path, umode_t mode) { struct inode *inode = path->dentry->d_inode; + struct delegated_inode delegated_inode = { }; struct iattr newattrs; int error; error = mnt_want_write(path->mnt); if (error) return error; - mutex_lock(&inode->i_mutex); +retry_deleg: + error = inode_lock_killable(inode); + if (error) + goto out_mnt_unlock; error = security_path_chmod(path, mode); if (error) goto out_unlock; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - error = notify_change(path->dentry, &newattrs); + error = notify_change(mnt_idmap(path->mnt), path->dentry, + &newattrs, &delegated_inode); out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); + if (is_delegated(&delegated_inode)) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } +out_mnt_unlock: mnt_drop_write(path->mnt); return error; } +int vfs_fchmod(struct file *file, umode_t mode) +{ + audit_file(file); + return chmod_common(&file->f_path, mode); +} + SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) { - struct file * file; - int err = -EBADF; + CLASS(fd, f)(fd); - file = fget(fd); - if (file) { - audit_inode(NULL, file->f_path.dentry, 0); - err = chmod_common(&file->f_path, mode); - fput(file); - } - return err; + if (fd_empty(f)) + return -EBADF; + + return vfs_fchmod(fd_file(f), mode); } -SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode) +static int do_fchmodat(int dfd, const char __user *filename, umode_t mode, + unsigned int flags) { struct path path; int error; - unsigned int lookup_flags = LOOKUP_FOLLOW; + unsigned int lookup_flags; + + if (unlikely(flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))) + return -EINVAL; + + lookup_flags = (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (!error) { @@ -515,14 +701,59 @@ retry: return error; } +SYSCALL_DEFINE4(fchmodat2, int, dfd, const char __user *, filename, + umode_t, mode, unsigned int, flags) +{ + return do_fchmodat(dfd, filename, mode, flags); +} + +SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, + umode_t, mode) +{ + return do_fchmodat(dfd, filename, mode, 0); +} + SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) { - return sys_fchmodat(AT_FDCWD, filename, mode); + return do_fchmodat(AT_FDCWD, filename, mode, 0); +} + +/* + * Check whether @kuid is valid and if so generate and set vfsuid_t in + * ia_vfsuid. + * + * Return: true if @kuid is valid, false if not. + */ +static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid) +{ + if (!uid_valid(kuid)) + return false; + attr->ia_valid |= ATTR_UID; + attr->ia_vfsuid = VFSUIDT_INIT(kuid); + return true; } -static int chown_common(struct path *path, uid_t user, gid_t group) +/* + * Check whether @kgid is valid and if so generate and set vfsgid_t in + * ia_vfsgid. + * + * Return: true if @kgid is valid, false if not. + */ +static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid) { + if (!gid_valid(kgid)) + return false; + attr->ia_valid |= ATTR_GID; + attr->ia_vfsgid = VFSGIDT_INIT(kgid); + return true; +} + +int chown_common(const struct path *path, uid_t user, gid_t group) +{ + struct mnt_idmap *idmap; + struct user_namespace *fs_userns; struct inode *inode = path->dentry->d_inode; + struct delegated_inode delegated_inode = { }; int error; struct iattr newattrs; kuid_t uid; @@ -531,33 +762,42 @@ static int chown_common(struct path *path, uid_t user, gid_t group) uid = make_kuid(current_user_ns(), user); gid = make_kgid(current_user_ns(), group); + idmap = mnt_idmap(path->mnt); + fs_userns = i_user_ns(inode); + +retry_deleg: + newattrs.ia_vfsuid = INVALID_VFSUID; + newattrs.ia_vfsgid = INVALID_VFSGID; newattrs.ia_valid = ATTR_CTIME; - if (user != (uid_t) -1) { - if (!uid_valid(uid)) - return -EINVAL; - newattrs.ia_valid |= ATTR_UID; - newattrs.ia_uid = uid; - } - if (group != (gid_t) -1) { - if (!gid_valid(gid)) - return -EINVAL; - newattrs.ia_valid |= ATTR_GID; - newattrs.ia_gid = gid; - } + if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid)) + return -EINVAL; + if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid)) + return -EINVAL; + error = inode_lock_killable(inode); + if (error) + return error; if (!S_ISDIR(inode->i_mode)) - newattrs.ia_valid |= - ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; - mutex_lock(&inode->i_mutex); - error = security_path_chown(path, uid, gid); + newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV | + setattr_should_drop_sgid(idmap, inode); + /* Continue to send actual fs values, not the mount values. */ + error = security_path_chown( + path, + from_vfsuid(idmap, fs_userns, newattrs.ia_vfsuid), + from_vfsgid(idmap, fs_userns, newattrs.ia_vfsgid)); if (!error) - error = notify_change(path->dentry, &newattrs); - mutex_unlock(&inode->i_mutex); - + error = notify_change(idmap, path->dentry, &newattrs, + &delegated_inode); + inode_unlock(inode); + if (is_delegated(&delegated_inode)) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } return error; } -SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, - gid_t, group, int, flag) +int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, + int flag) { struct path path; int error = -EINVAL; @@ -588,164 +828,206 @@ out: return error; } +SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, + gid_t, group, int, flag) +{ + return do_fchownat(dfd, filename, user, group, flag); +} + SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group) { - return sys_fchownat(AT_FDCWD, filename, user, group, 0); + return do_fchownat(AT_FDCWD, filename, user, group, 0); } SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group) { - return sys_fchownat(AT_FDCWD, filename, user, group, - AT_SYMLINK_NOFOLLOW); + return do_fchownat(AT_FDCWD, filename, user, group, + AT_SYMLINK_NOFOLLOW); } -SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) +int vfs_fchown(struct file *file, uid_t user, gid_t group) { - struct fd f = fdget(fd); - int error = -EBADF; - - if (!f.file) - goto out; + int error; - error = mnt_want_write_file(f.file); + error = mnt_want_write_file(file); if (error) - goto out_fput; - audit_inode(NULL, f.file->f_path.dentry, 0); - error = chown_common(&f.file->f_path, user, group); - mnt_drop_write_file(f.file); -out_fput: - fdput(f); -out: + return error; + audit_file(file); + error = chown_common(&file->f_path, user, group); + mnt_drop_write_file(file); return error; } -/* - * You have to be very careful that these write - * counts get cleaned up in error cases and - * upon __fput(). This should probably never - * be called outside of __dentry_open(). - */ -static inline int __get_file_write_access(struct inode *inode, - struct vfsmount *mnt) +int ksys_fchown(unsigned int fd, uid_t user, gid_t group) { - int error; - error = get_write_access(inode); - if (error) - return error; - /* - * Do not take mount writer counts on - * special files since no writes to - * the mount itself will occur. - */ - if (!special_file(inode->i_mode)) { - /* - * Balanced in __fput() - */ - error = __mnt_want_write(mnt); - if (error) - put_write_access(inode); - } - return error; + CLASS(fd, f)(fd); + + if (fd_empty(f)) + return -EBADF; + + return vfs_fchown(fd_file(f), user, group); } -int open_check_o_direct(struct file *f) +SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) { - /* NB: we're sure to have correct a_ops only after f_op->open */ - if (f->f_flags & O_DIRECT) { - if (!f->f_mapping->a_ops || - ((!f->f_mapping->a_ops->direct_IO) && - (!f->f_mapping->a_ops->get_xip_mem))) { - return -EINVAL; - } + return ksys_fchown(fd, user, group); +} + +static inline int file_get_write_access(struct file *f) +{ + int error; + + error = get_write_access(f->f_inode); + if (unlikely(error)) + return error; + error = mnt_get_write_access(f->f_path.mnt); + if (unlikely(error)) + goto cleanup_inode; + if (unlikely(f->f_mode & FMODE_BACKING)) { + error = mnt_get_write_access(backing_file_user_path(f)->mnt); + if (unlikely(error)) + goto cleanup_mnt; } return 0; + +cleanup_mnt: + mnt_put_write_access(f->f_path.mnt); +cleanup_inode: + put_write_access(f->f_inode); + return error; } static int do_dentry_open(struct file *f, - int (*open)(struct inode *, struct file *), - const struct cred *cred) + int (*open)(struct inode *, struct file *)) { static const struct file_operations empty_fops = {}; - struct inode *inode; + struct inode *inode = f->f_path.dentry->d_inode; int error; - f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | - FMODE_PREAD | FMODE_PWRITE; - - if (unlikely(f->f_flags & O_PATH)) - f->f_mode = FMODE_PATH; - path_get(&f->f_path); - inode = f->f_inode = f->f_path.dentry->d_inode; - if (f->f_mode & FMODE_WRITE) { - error = __get_file_write_access(inode, f->f_path.mnt); - if (error) - goto cleanup_file; - if (!special_file(inode->i_mode)) - file_take_write(f); - } - + f->f_inode = inode; f->f_mapping = inode->i_mapping; - file_sb_list_add(f, inode->i_sb); + f->f_wb_err = filemap_sample_wb_err(f->f_mapping); + f->f_sb_err = file_sample_sb_err(f); - if (unlikely(f->f_mode & FMODE_PATH)) { + if (unlikely(f->f_flags & O_PATH)) { + f->f_mode = FMODE_PATH | FMODE_OPENED; + file_set_fsnotify_mode(f, FMODE_NONOTIFY); f->f_op = &empty_fops; return 0; } + if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { + i_readcount_inc(inode); + } else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { + error = file_get_write_access(f); + if (unlikely(error)) + goto cleanup_file; + f->f_mode |= FMODE_WRITER; + } + + /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) + f->f_mode |= FMODE_ATOMIC_POS; + f->f_op = fops_get(inode->i_fop); + if (WARN_ON(!f->f_op)) { + error = -ENODEV; + goto cleanup_all; + } - error = security_file_open(f, cred); - if (error) + error = security_file_open(f); + if (unlikely(error)) goto cleanup_all; - error = break_lease(inode, f->f_flags); - if (error) + /* + * Call fsnotify open permission hook and set FMODE_NONOTIFY_* bits + * according to existing permission watches. + * If FMODE_NONOTIFY mode was already set for an fanotify fd or for a + * pseudo file, this call will not change the mode. + */ + error = fsnotify_open_perm_and_set_mode(f); + if (unlikely(error)) goto cleanup_all; - if (!open && f->f_op) + error = break_lease(file_inode(f), f->f_flags); + if (unlikely(error)) + goto cleanup_all; + + /* normally all 3 are set; ->open() can clear them if needed */ + f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; + if (!open) open = f->f_op->open; if (open) { error = open(inode, f); if (error) goto cleanup_all; } - if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) - i_readcount_inc(inode); + f->f_mode |= FMODE_OPENED; + if ((f->f_mode & FMODE_READ) && + likely(f->f_op->read || f->f_op->read_iter)) + f->f_mode |= FMODE_CAN_READ; + if ((f->f_mode & FMODE_WRITE) && + likely(f->f_op->write || f->f_op->write_iter)) + f->f_mode |= FMODE_CAN_WRITE; + if ((f->f_mode & FMODE_LSEEK) && !f->f_op->llseek) + f->f_mode &= ~FMODE_LSEEK; + if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO) + f->f_mode |= FMODE_CAN_ODIRECT; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); + f->f_iocb_flags = iocb_flags(f); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); - return 0; + if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; -cleanup_all: - fops_put(f->f_op); - file_sb_list_del(f); + /* + * XXX: Huge page cache doesn't support writing yet. Drop all page + * cache for this file before processing writes. + */ if (f->f_mode & FMODE_WRITE) { - put_write_access(inode); - if (!special_file(inode->i_mode)) { + /* + * Depends on full fence from get_write_access() to synchronize + * against collapse_file() regarding i_writecount and nr_thps + * updates. Ensures subsequent insertion of THPs into the page + * cache will fail. + */ + if (filemap_nr_thps(inode->i_mapping)) { + struct address_space *mapping = inode->i_mapping; + + filemap_invalidate_lock(inode->i_mapping); /* - * We don't consider this a real - * mnt_want/drop_write() pair - * because it all happenend right - * here, so just reset the state. + * unmap_mapping_range just need to be called once + * here, because the private pages is not need to be + * unmapped mapping (e.g. data segment of dynamic + * shared libraries here). */ - file_reset_write(f); - __mnt_drop_write(f->f_path.mnt); + unmap_mapping_range(mapping, 0, 0, 0); + truncate_inode_pages(mapping, 0); + filemap_invalidate_unlock(inode->i_mapping); } } + + return 0; + +cleanup_all: + if (WARN_ON_ONCE(error > 0)) + error = -EINVAL; + fops_put(f->f_op); + put_file_access(f); cleanup_file: path_put(&f->f_path); - f->f_path.mnt = NULL; - f->f_path.dentry = NULL; + f->__f_path.mnt = NULL; + f->__f_path.dentry = NULL; f->f_inode = NULL; return error; } /** * finish_open - finish opening a file - * @od: opaque open data + * @file: file pointer * @dentry: pointer to dentry * @open: open callback * @@ -753,64 +1035,88 @@ cleanup_file: * * If the open callback is set to NULL, then the standard f_op->open() * filesystem callback is substituted. + * + * NB: the dentry reference is _not_ consumed. If, for example, the dentry is + * the return value of d_splice_alias(), then the caller needs to perform dput() + * on it after finish_open(). + * + * Returns zero on success or -errno if the open failed. */ int finish_open(struct file *file, struct dentry *dentry, - int (*open)(struct inode *, struct file *), - int *opened) + int (*open)(struct inode *, struct file *)) { - int error; - BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ - - file->f_path.dentry = dentry; - error = do_dentry_open(file, open, current_cred()); - if (!error) - *opened |= FILE_OPENED; + BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */ - return error; + file->__f_path.dentry = dentry; + return do_dentry_open(file, open); } EXPORT_SYMBOL(finish_open); /** * finish_no_open - finish ->atomic_open() without opening the file * - * @od: opaque open data - * @dentry: dentry or NULL (as returned from ->lookup()) + * @file: file pointer + * @dentry: dentry, ERR_PTR(-E...) or NULL (as returned from ->lookup()) + * + * This can be used to set the result of a lookup in ->atomic_open(). * - * This can be used to set the result of a successful lookup in ->atomic_open(). - * The filesystem's atomic_open() method shall return NULL after calling this. + * NB: unlike finish_open() this function does consume the dentry reference and + * the caller need not dput() it. + * + * Returns 0 or -E..., which must be the return value of ->atomic_open() after + * having called this function. */ int finish_no_open(struct file *file, struct dentry *dentry) { - file->f_path.dentry = dentry; - return 1; + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + file->__f_path.dentry = dentry; + return 0; } EXPORT_SYMBOL(finish_no_open); +char *file_path(struct file *filp, char *buf, int buflen) +{ + return d_path(&filp->f_path, buf, buflen); +} +EXPORT_SYMBOL(file_path); + +/** + * vfs_open - open the file at the given path + * @path: path to open + * @file: newly allocated file with f_flag initialized + */ +int vfs_open(const struct path *path, struct file *file) +{ + int ret; + + file->__f_path = *path; + ret = do_dentry_open(file, NULL); + if (!ret) { + /* + * Once we return a file with FMODE_OPENED, __fput() will call + * fsnotify_close(), so we need fsnotify_open() here for + * symmetry. + */ + fsnotify_open(file); + } + return ret; +} + struct file *dentry_open(const struct path *path, int flags, const struct cred *cred) { int error; struct file *f; - validate_creds(cred); - /* We must always pass in a valid mount pointer. */ BUG_ON(!path->mnt); - f = get_empty_filp(); + f = alloc_empty_file(flags, cred); if (!IS_ERR(f)) { - f->f_flags = flags; - f->f_path = *path; - error = do_dentry_open(f, NULL, cred); - if (!error) { - /* from now on we need fput() to dispose of f */ - error = open_check_o_direct(f); - if (error) { - fput(f); - f = ERR_PTR(error); - } - } else { - put_filp(f); + error = vfs_open(path, f); + if (error) { + fput(f); f = ERR_PTR(error); } } @@ -818,18 +1124,178 @@ struct file *dentry_open(const struct path *path, int flags, } EXPORT_SYMBOL(dentry_open); -static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) +struct file *dentry_open_nonotify(const struct path *path, int flags, + const struct cred *cred) +{ + struct file *f = alloc_empty_file(flags, cred); + if (!IS_ERR(f)) { + int error; + + file_set_fsnotify_mode(f, FMODE_NONOTIFY); + error = vfs_open(path, f); + if (error) { + fput(f); + f = ERR_PTR(error); + } + } + return f; +} + +/** + * dentry_create - Create and open a file + * @path: path to create + * @flags: O_ flags + * @mode: mode bits for new file + * @cred: credentials to use + * + * Caller must hold the parent directory's lock, and have prepared + * a negative dentry, placed in @path->dentry, for the new file. + * + * Caller sets @path->mnt to the vfsmount of the filesystem where + * the new file is to be created. The parent directory and the + * negative dentry must reside on the same filesystem instance. + * + * On success, returns a "struct file *". Otherwise a ERR_PTR + * is returned. + */ +struct file *dentry_create(const struct path *path, int flags, umode_t mode, + const struct cred *cred) +{ + struct file *f; + int error; + + f = alloc_empty_file(flags, cred); + if (IS_ERR(f)) + return f; + + error = vfs_create(mnt_idmap(path->mnt), path->dentry, mode, NULL); + if (!error) + error = vfs_open(path, f); + + if (unlikely(error)) { + fput(f); + return ERR_PTR(error); + } + return f; +} +EXPORT_SYMBOL(dentry_create); + +/** + * kernel_file_open - open a file for kernel internal use + * @path: path of the file to open + * @flags: open flags + * @cred: credentials for open + * + * Open a file for use by in-kernel consumers. The file is not accounted + * against nr_files and must not be installed into the file descriptor + * table. + * + * Return: Opened file on success, an error pointer on failure. + */ +struct file *kernel_file_open(const struct path *path, int flags, + const struct cred *cred) { + struct file *f; + int error; + + f = alloc_empty_file_noaccount(flags, cred); + if (IS_ERR(f)) + return f; + + error = vfs_open(path, f); + if (error) { + fput(f); + return ERR_PTR(error); + } + return f; +} +EXPORT_SYMBOL_GPL(kernel_file_open); + +#define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) +#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) + +inline struct open_how build_open_how(int flags, umode_t mode) +{ + struct open_how how = { + .flags = flags & VALID_OPEN_FLAGS, + .mode = mode & S_IALLUGO, + }; + + /* O_PATH beats everything else. */ + if (how.flags & O_PATH) + how.flags &= O_PATH_FLAGS; + /* Modes should only be set for create-like flags. */ + if (!WILL_CREATE(how.flags)) + how.mode = 0; + return how; +} + +inline int build_open_flags(const struct open_how *how, struct open_flags *op) +{ + u64 flags = how->flags; + u64 strip = O_CLOEXEC; int lookup_flags = 0; - int acc_mode; + int acc_mode = ACC_MODE(flags); + + BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS), + "struct open_flags doesn't yet handle flags > 32 bits"); + + /* + * Strip flags that aren't relevant in determining struct open_flags. + */ + flags &= ~strip; + + /* + * Older syscalls implicitly clear all of the invalid flags or argument + * values before calling build_open_flags(), but openat2(2) checks all + * of its arguments. + */ + if (flags & ~VALID_OPEN_FLAGS) + return -EINVAL; + if (how->resolve & ~VALID_RESOLVE_FLAGS) + return -EINVAL; - if (flags & O_CREAT) - op->mode = (mode & S_IALLUGO) | S_IFREG; - else + /* Scoping flags are mutually exclusive. */ + if ((how->resolve & RESOLVE_BENEATH) && (how->resolve & RESOLVE_IN_ROOT)) + return -EINVAL; + + /* Deal with the mode. */ + if (WILL_CREATE(flags)) { + if (how->mode & ~S_IALLUGO) + return -EINVAL; + op->mode = how->mode | S_IFREG; + } else { + if (how->mode != 0) + return -EINVAL; op->mode = 0; + } - /* Must never be set by userspace */ - flags &= ~FMODE_NONOTIFY & ~O_CLOEXEC; + /* + * Block bugs where O_DIRECTORY | O_CREAT created regular files. + * Note, that blocking O_DIRECTORY | O_CREAT here also protects + * O_TMPFILE below which requires O_DIRECTORY being raised. + */ + if ((flags & (O_DIRECTORY | O_CREAT)) == (O_DIRECTORY | O_CREAT)) + return -EINVAL; + + /* Now handle the creative implementation of O_TMPFILE. */ + if (flags & __O_TMPFILE) { + /* + * In order to ensure programs get explicit errors when trying + * to use O_TMPFILE on old kernels we enforce that O_DIRECTORY + * is raised alongside __O_TMPFILE. + */ + if (!(flags & O_DIRECTORY)) + return -EINVAL; + if (!(acc_mode & MAY_WRITE)) + return -EINVAL; + } + if (flags & O_PATH) { + /* O_PATH only permits certain other flags to be set. */ + if (flags & ~O_PATH_FLAGS) + return -EINVAL; + acc_mode = 0; + } /* * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only @@ -840,21 +1306,6 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o if (flags & __O_SYNC) flags |= O_DSYNC; - if (flags & __O_TMPFILE) { - if ((flags & O_TMPFILE_MASK) != O_TMPFILE) - return -EINVAL; - acc_mode = MAY_OPEN | ACC_MODE(flags); - } else if (flags & O_PATH) { - /* - * If we have O_PATH in the open flag. Then we - * cannot have anything other than the below set of flags - */ - flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH; - acc_mode = 0; - } else { - acc_mode = MAY_OPEN | ACC_MODE(flags); - } - op->open_flag = flags; /* O_TRUNC implies we need access checks for write permissions */ @@ -872,14 +1323,34 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o if (flags & O_CREAT) { op->intent |= LOOKUP_CREATE; - if (flags & O_EXCL) + if (flags & O_EXCL) { op->intent |= LOOKUP_EXCL; + flags |= O_NOFOLLOW; + } } if (flags & O_DIRECTORY) lookup_flags |= LOOKUP_DIRECTORY; if (!(flags & O_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; + + if (how->resolve & RESOLVE_NO_XDEV) + lookup_flags |= LOOKUP_NO_XDEV; + if (how->resolve & RESOLVE_NO_MAGICLINKS) + lookup_flags |= LOOKUP_NO_MAGICLINKS; + if (how->resolve & RESOLVE_NO_SYMLINKS) + lookup_flags |= LOOKUP_NO_SYMLINKS; + if (how->resolve & RESOLVE_BENEATH) + lookup_flags |= LOOKUP_BENEATH; + if (how->resolve & RESOLVE_IN_ROOT) + lookup_flags |= LOOKUP_IN_ROOT; + if (how->resolve & RESOLVE_CACHED) { + /* Don't bother even trying for create/truncate/tmpfile open */ + if (flags & (O_TRUNC | O_CREAT | __O_TMPFILE)) + return -EAGAIN; + lookup_flags |= LOOKUP_CACHED; + } + op->lookup_flags = lookup_flags; return 0; } @@ -898,8 +1369,11 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o struct file *file_open_name(struct filename *name, int flags, umode_t mode) { struct open_flags op; - int err = build_open_flags(flags, mode, &op); - return err ? ERR_PTR(err) : do_filp_open(AT_FDCWD, name, &op); + struct open_how how = build_open_how(flags, mode); + int err = build_open_flags(&how, &op); + if (err) + return ERR_PTR(err); + return do_filp_open(AT_FDCWD, name, &op); } /** @@ -915,60 +1389,58 @@ struct file *file_open_name(struct filename *name, int flags, umode_t mode) */ struct file *filp_open(const char *filename, int flags, umode_t mode) { - struct filename name = {.name = filename}; - return file_open_name(&name, flags, mode); + struct filename *name = getname_kernel(filename); + struct file *file = ERR_CAST(name); + + if (!IS_ERR(name)) { + file = file_open_name(name, flags, mode); + putname(name); + } + return file; } EXPORT_SYMBOL(filp_open); -struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, - const char *filename, int flags) +struct file *file_open_root(const struct path *root, + const char *filename, int flags, umode_t mode) { struct open_flags op; - int err = build_open_flags(flags, 0, &op); + struct open_how how = build_open_how(flags, mode); + int err = build_open_flags(&how, &op); if (err) return ERR_PTR(err); - if (flags & O_CREAT) - return ERR_PTR(-EINVAL); - if (!filename && (flags & O_DIRECTORY)) - if (!dentry->d_inode->i_op->lookup) - return ERR_PTR(-ENOTDIR); - return do_file_open_root(dentry, mnt, filename, &op); + return do_file_open_root(root, filename, &op); } EXPORT_SYMBOL(file_open_root); -long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) +static int do_sys_openat2(int dfd, const char __user *filename, + struct open_how *how) { struct open_flags op; - int fd = build_open_flags(flags, mode, &op); - struct filename *tmp; + struct filename *tmp __free(putname) = NULL; + int err; - if (fd) - return fd; + err = build_open_flags(how, &op); + if (unlikely(err)) + return err; tmp = getname(filename); if (IS_ERR(tmp)) return PTR_ERR(tmp); - fd = get_unused_fd_flags(flags); - if (fd >= 0) { - struct file *f = do_filp_open(dfd, tmp, &op); - if (IS_ERR(f)) { - put_unused_fd(fd); - fd = PTR_ERR(f); - } else { - fsnotify_open(f); - fd_install(fd, f); - } - } - putname(tmp); - return fd; + return FD_ADD(how->flags, do_filp_open(dfd, tmp, &op)); +} + +int do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) +{ + struct open_how how = build_open_how(flags, mode); + return do_sys_openat2(dfd, filename, &how); } + SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { if (force_o_largefile()) flags |= O_LARGEFILE; - return do_sys_open(AT_FDCWD, filename, flags, mode); } @@ -977,9 +1449,55 @@ SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, { if (force_o_largefile()) flags |= O_LARGEFILE; + return do_sys_open(dfd, filename, flags, mode); +} + +SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename, + struct open_how __user *, how, size_t, usize) +{ + int err; + struct open_how tmp; + + BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0); + BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST); + + if (unlikely(usize < OPEN_HOW_SIZE_VER0)) + return -EINVAL; + if (unlikely(usize > PAGE_SIZE)) + return -E2BIG; + + err = copy_struct_from_user(&tmp, sizeof(tmp), how, usize); + if (err) + return err; + + audit_openat2_how(&tmp); + + /* O_LARGEFILE is only allowed for non-O_PATH. */ + if (!(tmp.flags & O_PATH) && force_o_largefile()) + tmp.flags |= O_LARGEFILE; + return do_sys_openat2(dfd, filename, &tmp); +} + +#ifdef CONFIG_COMPAT +/* + * Exactly like sys_open(), except that it doesn't set the + * O_LARGEFILE flag. + */ +COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) +{ + return do_sys_open(AT_FDCWD, filename, flags, mode); +} + +/* + * Exactly like sys_openat(), except that it doesn't set the + * O_LARGEFILE flag. + */ +COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) +{ return do_sys_open(dfd, filename, flags, mode); } +#endif #ifndef __alpha__ @@ -989,35 +1507,47 @@ SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, */ SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode) { - return sys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode); -} + int flags = O_CREAT | O_WRONLY | O_TRUNC; + if (force_o_largefile()) + flags |= O_LARGEFILE; + return do_sys_open(AT_FDCWD, pathname, flags, mode); +} #endif /* * "id" is the POSIX thread ID. We use the * files pointer for this.. */ -int filp_close(struct file *filp, fl_owner_t id) +static int filp_flush(struct file *filp, fl_owner_t id) { int retval = 0; - if (!file_count(filp)) { - printk(KERN_ERR "VFS: Close: file count is 0\n"); + if (CHECK_DATA_CORRUPTION(file_count(filp) == 0, filp, + "VFS: Close: file count is 0 (f_op=%ps)", + filp->f_op)) { return 0; } - if (filp->f_op && filp->f_op->flush) + if (filp->f_op->flush) retval = filp->f_op->flush(filp, id); if (likely(!(filp->f_mode & FMODE_PATH))) { dnotify_flush(filp, id); locks_remove_posix(filp, id); } - fput(filp); return retval; } +int filp_close(struct file *filp, fl_owner_t id) +{ + int retval; + + retval = filp_flush(filp, id); + fput_close(filp); + + return retval; +} EXPORT_SYMBOL(filp_close); /* @@ -1027,18 +1557,33 @@ EXPORT_SYMBOL(filp_close); */ SYSCALL_DEFINE1(close, unsigned int, fd) { - int retval = __close_fd(current->files, fd); + int retval; + struct file *file; + + file = file_close_fd(fd); + if (!file) + return -EBADF; + + retval = filp_flush(file, current->files); + + /* + * We're returning to user space. Don't bother + * with any delayed fput() cases. + */ + fput_close_sync(file); + + if (likely(retval == 0)) + return 0; /* can't restart close syscall because file table entry was cleared */ - if (unlikely(retval == -ERESTARTSYS || - retval == -ERESTARTNOINTR || - retval == -ERESTARTNOHAND || - retval == -ERESTART_RESTARTBLOCK)) + if (retval == -ERESTARTSYS || + retval == -ERESTARTNOINTR || + retval == -ERESTARTNOHAND || + retval == -ERESTART_RESTARTBLOCK) retval = -EINTR; return retval; } -EXPORT_SYMBOL(sys_close); /* * This routine simulates a hangup on the tty, to arrange that users @@ -1081,3 +1626,22 @@ int nonseekable_open(struct inode *inode, struct file *filp) } EXPORT_SYMBOL(nonseekable_open); + +/* + * stream_open is used by subsystems that want stream-like file descriptors. + * Such file descriptors are not seekable and don't have notion of position + * (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL). + * Contrary to file descriptors of other regular files, .read() and .write() + * can run simultaneously. + * + * stream_open never fails and is marked to return int so that it could be + * directly used as file_operations.open . + */ +int stream_open(struct inode *inode, struct file *filp) +{ + filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS); + filp->f_mode |= FMODE_STREAM; + return 0; +} + +EXPORT_SYMBOL(stream_open); |
