diff options
Diffstat (limited to 'fs/open.c')
| -rw-r--r-- | fs/open.c | 622 |
1 files changed, 371 insertions, 251 deletions
diff --git a/fs/open.c b/fs/open.c index 82c1a28b3308..f328622061c5 100644 --- a/fs/open.c +++ b/fs/open.c @@ -29,14 +29,14 @@ #include <linux/audit.h> #include <linux/falloc.h> #include <linux/fs_struct.h> -#include <linux/ima.h> #include <linux/dnotify.h> #include <linux/compat.h> #include <linux/mnt_idmapping.h> +#include <linux/filelock.h> #include "internal.h" -int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry, +int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) { int ret; @@ -54,24 +54,27 @@ int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry, } /* Remove suid, sgid, and file capabilities on truncate too */ - ret = dentry_needs_remove_privs(mnt_userns, dentry); + ret = dentry_needs_remove_privs(idmap, dentry); if (ret < 0) return ret; if (ret) newattrs.ia_valid |= ret | ATTR_FORCE; - inode_lock(dentry->d_inode); + ret = inode_lock_killable(dentry->d_inode); + if (ret) + return ret; + /* Note any delegations or leases have already been broken: */ - ret = notify_change(mnt_userns, dentry, &newattrs, NULL); + ret = notify_change(idmap, dentry, &newattrs, NULL); inode_unlock(dentry->d_inode); return ret; } -long vfs_truncate(const struct path *path, loff_t length) +int vfs_truncate(const struct path *path, loff_t length) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; struct inode *inode; - long error; + int error; inode = path->dentry->d_inode; @@ -81,14 +84,18 @@ long vfs_truncate(const struct path *path, loff_t length) if (!S_ISREG(inode->i_mode)) return -EINVAL; - error = mnt_want_write(path->mnt); + idmap = mnt_idmap(path->mnt); + error = inode_permission(idmap, inode, MAY_WRITE); if (error) - goto out; + return error; - mnt_userns = mnt_user_ns(path->mnt); - error = inode_permission(mnt_userns, inode, MAY_WRITE); + error = fsnotify_truncate_perm(path, length); if (error) - goto mnt_drop_write_and_out; + return error; + + error = mnt_want_write(path->mnt); + if (error) + return error; error = -EPERM; if (IS_APPEND(inode)) @@ -108,18 +115,18 @@ long vfs_truncate(const struct path *path, loff_t length) error = security_path_truncate(path); if (!error) - error = do_truncate(mnt_userns, path->dentry, length, 0, NULL); + error = do_truncate(idmap, path->dentry, length, 0, NULL); put_write_and_out: put_write_access(inode); mnt_drop_write_and_out: mnt_drop_write(path->mnt); -out: + return error; } EXPORT_SYMBOL_GPL(vfs_truncate); -long do_sys_truncate(const char __user *pathname, loff_t length) +int do_sys_truncate(const char __user *pathname, loff_t length) { unsigned int lookup_flags = LOOKUP_FOLLOW; struct path path; @@ -153,59 +160,60 @@ COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length } #endif -long do_sys_ftruncate(unsigned int fd, loff_t length, int small) +int do_ftruncate(struct file *file, loff_t length, int small) { struct inode *inode; struct dentry *dentry; - struct fd f; int error; - error = -EINVAL; - if (length < 0) - goto out; - error = -EBADF; - f = fdget(fd); - if (!f.file) - goto out; - /* explicitly opened as large or we are on 64-bit box */ - if (f.file->f_flags & O_LARGEFILE) + if (file->f_flags & O_LARGEFILE) small = 0; - dentry = f.file->f_path.dentry; + dentry = file->f_path.dentry; inode = dentry->d_inode; - error = -EINVAL; - if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE)) - goto out_putf; + if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE)) + return -EINVAL; - error = -EINVAL; /* Cannot ftruncate over 2^31 bytes without large file support */ if (small && length > MAX_NON_LFS) - goto out_putf; + return -EINVAL; - error = -EPERM; /* Check IS_APPEND on real upper inode */ - if (IS_APPEND(file_inode(f.file))) - goto out_putf; - sb_start_write(inode->i_sb); - error = security_file_truncate(f.file); - if (!error) - error = do_truncate(file_mnt_user_ns(f.file), dentry, length, - ATTR_MTIME | ATTR_CTIME, f.file); - sb_end_write(inode->i_sb); -out_putf: - fdput(f); -out: - return error; + if (IS_APPEND(file_inode(file))) + return -EPERM; + + error = security_file_truncate(file); + if (error) + return error; + + error = fsnotify_truncate_perm(&file->f_path, length); + if (error) + return error; + + scoped_guard(super_write, inode->i_sb) + return do_truncate(file_mnt_idmap(file), dentry, length, + ATTR_MTIME | ATTR_CTIME, file); } -SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length) +int do_sys_ftruncate(unsigned int fd, loff_t length, int small) +{ + if (length < 0) + return -EINVAL; + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EBADF; + + return do_ftruncate(fd_file(f), length, small); +} + +SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length) { return do_sys_ftruncate(fd, length, 1); } #ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length) +COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_off_t, length) { return do_sys_ftruncate(fd, length, 1); } @@ -243,45 +251,46 @@ COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd, int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - long ret; + int ret; + loff_t sum; if (offset < 0 || len <= 0) return -EINVAL; - /* Return error if mode is not supported */ - if (mode & ~FALLOC_FL_SUPPORTED_MASK) + if (mode & ~(FALLOC_FL_MODE_MASK | FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; - /* Punch hole and zero range are mutually exclusive */ - if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) == - (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) - return -EOPNOTSUPP; - - /* Punch hole must have keep size set */ - if ((mode & FALLOC_FL_PUNCH_HOLE) && - !(mode & FALLOC_FL_KEEP_SIZE)) + /* + * Modes are exclusive, even if that is not obvious from the encoding + * as bit masks and the mix with the flag in the same namespace. + * + * To make things even more complicated, FALLOC_FL_ALLOCATE_RANGE is + * encoded as no bit set. + */ + switch (mode & FALLOC_FL_MODE_MASK) { + case FALLOC_FL_ALLOCATE_RANGE: + case FALLOC_FL_UNSHARE_RANGE: + case FALLOC_FL_ZERO_RANGE: + break; + case FALLOC_FL_PUNCH_HOLE: + if (!(mode & FALLOC_FL_KEEP_SIZE)) + return -EOPNOTSUPP; + break; + case FALLOC_FL_COLLAPSE_RANGE: + case FALLOC_FL_INSERT_RANGE: + case FALLOC_FL_WRITE_ZEROES: + if (mode & FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + break; + default: return -EOPNOTSUPP; - - /* Collapse range should only be used exclusively. */ - if ((mode & FALLOC_FL_COLLAPSE_RANGE) && - (mode & ~FALLOC_FL_COLLAPSE_RANGE)) - return -EINVAL; - - /* Insert range should only be used exclusively. */ - if ((mode & FALLOC_FL_INSERT_RANGE) && - (mode & ~FALLOC_FL_INSERT_RANGE)) - return -EINVAL; - - /* Unshare range should only be used with allocate mode. */ - if ((mode & FALLOC_FL_UNSHARE_RANGE) && - (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE))) - return -EINVAL; + } if (!(file->f_mode & FMODE_WRITE)) return -EBADF; /* - * We can only allow pure fallocate on append only files + * On append-only files only space preallocation is supported. */ if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode)) return -EPERM; @@ -303,6 +312,10 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (ret) return ret; + ret = fsnotify_file_area_perm(file, MAY_WRITE, &offset, len); + if (ret) + return ret; + if (S_ISFIFO(inode->i_mode)) return -ESPIPE; @@ -312,8 +325,11 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) return -ENODEV; - /* Check for wrap through zero too */ - if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) + /* Check for wraparound */ + if (check_add_overflow(offset, len, &sum)) + return -EFBIG; + + if (sum > inode->i_sb->s_maxbytes) return -EFBIG; if (!file->f_op->fallocate) @@ -339,14 +355,12 @@ EXPORT_SYMBOL_GPL(vfs_fallocate); int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len) { - struct fd f = fdget(fd); - int error = -EBADF; + CLASS(fd, f)(fd); - if (f.file) { - error = vfs_fallocate(f.file, mode, offset, len); - fdput(f); - } - return error; + if (fd_empty(f)) + return -EBADF; + + return vfs_fallocate(fd_file(f), mode, offset, len); } SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) @@ -367,16 +381,51 @@ COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, compat_arg_u64_dual(offset * access() needs to use the real uid/gid, not the effective uid/gid. * We do this by temporarily clearing all FS-related capabilities and * switching the fsuid/fsgid around to the real ones. + * + * Creating new credentials is expensive, so we try to skip doing it, + * which we can if the result would match what we already got. */ +static bool access_need_override_creds(int flags) +{ + const struct cred *cred; + + if (flags & AT_EACCESS) + return false; + + cred = current_cred(); + if (!uid_eq(cred->fsuid, cred->uid) || + !gid_eq(cred->fsgid, cred->gid)) + return true; + + if (!issecure(SECURE_NO_SETUID_FIXUP)) { + kuid_t root_uid = make_kuid(cred->user_ns, 0); + if (!uid_eq(cred->uid, root_uid)) { + if (!cap_isclear(cred->cap_effective)) + return true; + } else { + if (!cap_isidentical(cred->cap_effective, + cred->cap_permitted)) + return true; + } + } + + return false; +} + static const struct cred *access_override_creds(void) { - const struct cred *old_cred; struct cred *override_cred; override_cred = prepare_creds(); if (!override_cred) return NULL; + /* + * XXX access_need_override_creds performs checks in hopes of skipping + * this work. Make sure it stays in sync if making any changes in this + * routine. + */ + override_cred->fsuid = override_cred->uid; override_cred->fsgid = override_cred->gid; @@ -405,19 +454,14 @@ static const struct cred *access_override_creds(void) * 'get_current_cred()' function), that will clear the * non_rcu field, because now that other user may be * expecting RCU freeing. But normal thread-synchronous - * cred accesses will keep things non-RCY. + * cred accesses will keep things non-racy to avoid RCU + * freeing. */ override_cred->non_rcu = 1; - - old_cred = override_creds(override_cred); - - /* override_cred() gets its own ref */ - put_cred(override_cred); - - return old_cred; + return override_creds(override_cred); } -static long do_faccessat(int dfd, const char __user *filename, int mode, int flags) +static int do_faccessat(int dfd, const char __user *filename, int mode, int flags) { struct path path; struct inode *inode; @@ -436,7 +480,7 @@ static long do_faccessat(int dfd, const char __user *filename, int mode, int fla if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; - if (!(flags & AT_EACCESS)) { + if (access_need_override_creds(flags)) { old_cred = access_override_creds(); if (!old_cred) return -ENOMEM; @@ -459,7 +503,7 @@ retry: goto out_path_release; } - res = inode_permission(mnt_user_ns(path.mnt), inode, mode | MAY_ACCESS); + res = inode_permission(mnt_idmap(path.mnt), inode, mode | MAY_ACCESS); /* SuS v2 requires we report a read only fs too */ if (res || !(mode & S_IWOTH) || special_file(inode->i_mode)) goto out_path_release; @@ -484,7 +528,7 @@ out_path_release: } out: if (old_cred) - revert_creds(old_cred); + put_cred(revert_creds(old_cred)); return res; } @@ -533,23 +577,18 @@ out: SYSCALL_DEFINE1(fchdir, unsigned int, fd) { - struct fd f = fdget_raw(fd); + CLASS(fd_raw, f)(fd); int error; - error = -EBADF; - if (!f.file) - goto out; + if (fd_empty(f)) + return -EBADF; - error = -ENOTDIR; - if (!d_can_lookup(f.file->f_path.dentry)) - goto out_putf; + if (!d_can_lookup(fd_file(f)->f_path.dentry)) + return -ENOTDIR; - error = file_permission(f.file, MAY_EXEC | MAY_CHDIR); + error = file_permission(fd_file(f), MAY_EXEC | MAY_CHDIR); if (!error) - set_fs_pwd(current->fs, &f.file->f_path); -out_putf: - fdput(f); -out: + set_fs_pwd(current->fs, &fd_file(f)->f_path); return error; } @@ -589,7 +628,7 @@ out: int chmod_common(const struct path *path, umode_t mode) { struct inode *inode = path->dentry->d_inode; - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; struct iattr newattrs; int error; @@ -597,21 +636,24 @@ int chmod_common(const struct path *path, umode_t mode) if (error) return error; retry_deleg: - inode_lock(inode); + error = inode_lock_killable(inode); + if (error) + goto out_mnt_unlock; error = security_path_chmod(path, mode); if (error) goto out_unlock; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - error = notify_change(mnt_user_ns(path->mnt), path->dentry, + error = notify_change(mnt_idmap(path->mnt), path->dentry, &newattrs, &delegated_inode); out_unlock: inode_unlock(inode); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } +out_mnt_unlock: mnt_drop_write(path->mnt); return error; } @@ -624,21 +666,28 @@ int vfs_fchmod(struct file *file, umode_t mode) SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) { - struct fd f = fdget(fd); - int err = -EBADF; + CLASS(fd, f)(fd); - if (f.file) { - err = vfs_fchmod(f.file, mode); - fdput(f); - } - return err; + if (fd_empty(f)) + return -EBADF; + + return vfs_fchmod(fd_file(f), mode); } -static int do_fchmodat(int dfd, const char __user *filename, umode_t mode) +static int do_fchmodat(int dfd, const char __user *filename, umode_t mode, + unsigned int flags) { struct path path; int error; - unsigned int lookup_flags = LOOKUP_FOLLOW; + unsigned int lookup_flags; + + if (unlikely(flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))) + return -EINVAL; + + lookup_flags = (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (!error) { @@ -652,21 +701,24 @@ retry: return error; } +SYSCALL_DEFINE4(fchmodat2, int, dfd, const char __user *, filename, + umode_t, mode, unsigned int, flags) +{ + return do_fchmodat(dfd, filename, mode, flags); +} + SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode) { - return do_fchmodat(dfd, filename, mode); + return do_fchmodat(dfd, filename, mode, 0); } SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) { - return do_fchmodat(AT_FDCWD, filename, mode); + return do_fchmodat(AT_FDCWD, filename, mode, 0); } -/** - * setattr_vfsuid - check and set ia_fsuid attribute - * @kuid: new inode owner - * +/* * Check whether @kuid is valid and if so generate and set vfsuid_t in * ia_vfsuid. * @@ -681,10 +733,7 @@ static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid) return true; } -/** - * setattr_vfsgid - check and set ia_fsgid attribute - * @kgid: new inode owner - * +/* * Check whether @kgid is valid and if so generate and set vfsgid_t in * ia_vfsgid. * @@ -701,9 +750,10 @@ static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid) int chown_common(const struct path *path, uid_t user, gid_t group) { - struct user_namespace *mnt_userns, *fs_userns; + struct mnt_idmap *idmap; + struct user_namespace *fs_userns; struct inode *inode = path->dentry->d_inode; - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; int error; struct iattr newattrs; kuid_t uid; @@ -712,7 +762,7 @@ int chown_common(const struct path *path, uid_t user, gid_t group) uid = make_kuid(current_user_ns(), user); gid = make_kgid(current_user_ns(), group); - mnt_userns = mnt_user_ns(path->mnt); + idmap = mnt_idmap(path->mnt); fs_userns = i_user_ns(inode); retry_deleg: @@ -723,20 +773,22 @@ retry_deleg: return -EINVAL; if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid)) return -EINVAL; - inode_lock(inode); + error = inode_lock_killable(inode); + if (error) + return error; if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV | - setattr_should_drop_sgid(mnt_userns, inode); + setattr_should_drop_sgid(idmap, inode); /* Continue to send actual fs values, not the mount values. */ error = security_path_chown( path, - from_vfsuid(mnt_userns, fs_userns, newattrs.ia_vfsuid), - from_vfsgid(mnt_userns, fs_userns, newattrs.ia_vfsgid)); + from_vfsuid(idmap, fs_userns, newattrs.ia_vfsuid), + from_vfsgid(idmap, fs_userns, newattrs.ia_vfsgid)); if (!error) - error = notify_change(mnt_userns, path->dentry, &newattrs, + error = notify_change(idmap, path->dentry, &newattrs, &delegated_inode); inode_unlock(inode); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; @@ -808,14 +860,12 @@ int vfs_fchown(struct file *file, uid_t user, gid_t group) int ksys_fchown(unsigned int fd, uid_t user, gid_t group) { - struct fd f = fdget(fd); - int error = -EBADF; + CLASS(fd, f)(fd); - if (f.file) { - error = vfs_fchown(f.file, user, group); - fdput(f); - } - return error; + if (fd_empty(f)) + return -EBADF; + + return vfs_fchown(fd_file(f), user, group); } SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) @@ -823,11 +873,35 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) return ksys_fchown(fd, user, group); } +static inline int file_get_write_access(struct file *f) +{ + int error; + + error = get_write_access(f->f_inode); + if (unlikely(error)) + return error; + error = mnt_get_write_access(f->f_path.mnt); + if (unlikely(error)) + goto cleanup_inode; + if (unlikely(f->f_mode & FMODE_BACKING)) { + error = mnt_get_write_access(backing_file_user_path(f)->mnt); + if (unlikely(error)) + goto cleanup_mnt; + } + return 0; + +cleanup_mnt: + mnt_put_write_access(f->f_path.mnt); +cleanup_inode: + put_write_access(f->f_inode); + return error; +} + static int do_dentry_open(struct file *f, - struct inode *inode, int (*open)(struct inode *, struct file *)) { static const struct file_operations empty_fops = {}; + struct inode *inode = f->f_path.dentry->d_inode; int error; path_get(&f->f_path); @@ -838,6 +912,7 @@ static int do_dentry_open(struct file *f, if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH | FMODE_OPENED; + file_set_fsnotify_mode(f, FMODE_NONOTIFY); f->f_op = &empty_fops; return 0; } @@ -845,14 +920,9 @@ static int do_dentry_open(struct file *f, if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { i_readcount_inc(inode); } else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { - error = get_write_access(inode); + error = file_get_write_access(f); if (unlikely(error)) goto cleanup_file; - error = __mnt_want_write(f->f_path.mnt); - if (unlikely(error)) { - put_write_access(inode); - goto cleanup_file; - } f->f_mode |= FMODE_WRITER; } @@ -867,11 +937,21 @@ static int do_dentry_open(struct file *f, } error = security_file_open(f); - if (error) + if (unlikely(error)) goto cleanup_all; - error = break_lease(locks_inode(f), f->f_flags); - if (error) + /* + * Call fsnotify open permission hook and set FMODE_NONOTIFY_* bits + * according to existing permission watches. + * If FMODE_NONOTIFY mode was already set for an fanotify fd or for a + * pseudo file, this call will not change the mode. + */ + error = fsnotify_open_perm_and_set_mode(f); + if (unlikely(error)) + goto cleanup_all; + + error = break_lease(file_inode(f), f->f_flags); + if (unlikely(error)) goto cleanup_all; /* normally all 3 are set; ->open() can clear them if needed */ @@ -909,12 +989,11 @@ static int do_dentry_open(struct file *f, */ if (f->f_mode & FMODE_WRITE) { /* - * Paired with smp_mb() in collapse_file() to ensure nr_thps - * is up to date and the update to i_writecount by - * get_write_access() is visible. Ensures subsequent insertion - * of THPs into the page cache will fail. + * Depends on full fence from get_write_access() to synchronize + * against collapse_file() regarding i_writecount and nr_thps + * updates. Ensures subsequent insertion of THPs into the page + * cache will fail. */ - smp_mb(); if (filemap_nr_thps(inode->i_mapping)) { struct address_space *mapping = inode->i_mapping; @@ -940,8 +1019,8 @@ cleanup_all: put_file_access(f); cleanup_file: path_put(&f->f_path); - f->f_path.mnt = NULL; - f->f_path.dentry = NULL; + f->__f_path.mnt = NULL; + f->__f_path.dentry = NULL; f->f_inode = NULL; return error; } @@ -951,7 +1030,6 @@ cleanup_file: * @file: file pointer * @dentry: pointer to dentry * @open: open callback - * @opened: state of open * * This can be used to finish opening a file passed to i_op->atomic_open(). * @@ -969,8 +1047,8 @@ int finish_open(struct file *file, struct dentry *dentry, { BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */ - file->f_path.dentry = dentry; - return do_dentry_open(file, d_backing_inode(dentry), open); + file->__f_path.dentry = dentry; + return do_dentry_open(file, open); } EXPORT_SYMBOL(finish_open); @@ -978,19 +1056,21 @@ EXPORT_SYMBOL(finish_open); * finish_no_open - finish ->atomic_open() without opening the file * * @file: file pointer - * @dentry: dentry or NULL (as returned from ->lookup()) + * @dentry: dentry, ERR_PTR(-E...) or NULL (as returned from ->lookup()) * - * This can be used to set the result of a successful lookup in ->atomic_open(). + * This can be used to set the result of a lookup in ->atomic_open(). * * NB: unlike finish_open() this function does consume the dentry reference and * the caller need not dput() it. * - * Returns "0" which must be the return value of ->atomic_open() after having - * called this function. + * Returns 0 or -E..., which must be the return value of ->atomic_open() after + * having called this function. */ int finish_no_open(struct file *file, struct dentry *dentry) { - file->f_path.dentry = dentry; + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + file->__f_path.dentry = dentry; return 0; } EXPORT_SYMBOL(finish_no_open); @@ -1005,12 +1085,22 @@ EXPORT_SYMBOL(file_path); * vfs_open - open the file at the given path * @path: path to open * @file: newly allocated file with f_flag initialized - * @cred: credentials to use */ int vfs_open(const struct path *path, struct file *file) { - file->f_path = *path; - return do_dentry_open(file, d_backing_inode(path->dentry), NULL); + int ret; + + file->__f_path = *path; + ret = do_dentry_open(file, NULL); + if (!ret) { + /* + * Once we return a file with FMODE_OPENED, __fput() will call + * fsnotify_close(), so we need fsnotify_open() here for + * symmetry. + */ + fsnotify_open(file); + } + return ret; } struct file *dentry_open(const struct path *path, int flags, @@ -1019,8 +1109,6 @@ struct file *dentry_open(const struct path *path, int flags, int error; struct file *f; - validate_creds(cred); - /* We must always pass in a valid mount pointer. */ BUG_ON(!path->mnt); @@ -1036,6 +1124,23 @@ struct file *dentry_open(const struct path *path, int flags, } EXPORT_SYMBOL(dentry_open); +struct file *dentry_open_nonotify(const struct path *path, int flags, + const struct cred *cred) +{ + struct file *f = alloc_empty_file(flags, cred); + if (!IS_ERR(f)) { + int error; + + file_set_fsnotify_mode(f, FMODE_NONOTIFY); + error = vfs_open(path, f); + if (error) { + fput(f); + f = ERR_PTR(error); + } + } + return f; +} + /** * dentry_create - Create and open a file * @path: path to create @@ -1059,14 +1164,11 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode, struct file *f; int error; - validate_creds(cred); f = alloc_empty_file(flags, cred); if (IS_ERR(f)) return f; - error = vfs_create(mnt_user_ns(path->mnt), - d_inode(path->dentry->d_parent), - path->dentry, mode, true); + error = vfs_create(mnt_idmap(path->mnt), path->dentry, mode, NULL); if (!error) error = vfs_open(path, f); @@ -1078,23 +1180,36 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode, } EXPORT_SYMBOL(dentry_create); -struct file *open_with_fake_path(const struct path *path, int flags, - struct inode *inode, const struct cred *cred) +/** + * kernel_file_open - open a file for kernel internal use + * @path: path of the file to open + * @flags: open flags + * @cred: credentials for open + * + * Open a file for use by in-kernel consumers. The file is not accounted + * against nr_files and must not be installed into the file descriptor + * table. + * + * Return: Opened file on success, an error pointer on failure. + */ +struct file *kernel_file_open(const struct path *path, int flags, + const struct cred *cred) { - struct file *f = alloc_empty_file_noaccount(flags, cred); - if (!IS_ERR(f)) { - int error; + struct file *f; + int error; - f->f_path = *path; - error = do_dentry_open(f, inode, NULL); - if (error) { - fput(f); - f = ERR_PTR(error); - } + f = alloc_empty_file_noaccount(flags, cred); + if (IS_ERR(f)) + return f; + + error = vfs_open(path, f); + if (error) { + fput(f); + return ERR_PTR(error); } return f; } -EXPORT_SYMBOL(open_with_fake_path); +EXPORT_SYMBOL_GPL(kernel_file_open); #define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) #define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) @@ -1118,7 +1233,7 @@ inline struct open_how build_open_how(int flags, umode_t mode) inline int build_open_flags(const struct open_how *how, struct open_flags *op) { u64 flags = how->flags; - u64 strip = FMODE_NONOTIFY | O_CLOEXEC; + u64 strip = O_CLOEXEC; int lookup_flags = 0; int acc_mode = ACC_MODE(flags); @@ -1126,9 +1241,7 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op) "struct open_flags doesn't yet handle flags > 32 bits"); /* - * Strip flags that either shouldn't be set by userspace like - * FMODE_NONOTIFY or that aren't relevant in determining struct - * open_flags like O_CLOEXEC. + * Strip flags that aren't relevant in determining struct open_flags. */ flags &= ~strip; @@ -1158,13 +1271,21 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op) } /* - * In order to ensure programs get explicit errors when trying to use - * O_TMPFILE on old kernels, O_TMPFILE is implemented such that it - * looks like (O_DIRECTORY|O_RDWR & ~O_CREAT) to old kernels. But we - * have to require userspace to explicitly set it. + * Block bugs where O_DIRECTORY | O_CREAT created regular files. + * Note, that blocking O_DIRECTORY | O_CREAT here also protects + * O_TMPFILE below which requires O_DIRECTORY being raised. */ + if ((flags & (O_DIRECTORY | O_CREAT)) == (O_DIRECTORY | O_CREAT)) + return -EINVAL; + + /* Now handle the creative implementation of O_TMPFILE. */ if (flags & __O_TMPFILE) { - if ((flags & O_TMPFILE_MASK) != O_TMPFILE) + /* + * In order to ensure programs get explicit errors when trying + * to use O_TMPFILE on old kernels we enforce that O_DIRECTORY + * is raised alongside __O_TMPFILE. + */ + if (!(flags & O_DIRECTORY)) return -EINVAL; if (!(acc_mode & MAY_WRITE)) return -EINVAL; @@ -1225,7 +1346,7 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op) lookup_flags |= LOOKUP_IN_ROOT; if (how->resolve & RESOLVE_CACHED) { /* Don't bother even trying for create/truncate/tmpfile open */ - if (flags & (O_TRUNC | O_CREAT | O_TMPFILE)) + if (flags & (O_TRUNC | O_CREAT | __O_TMPFILE)) return -EAGAIN; lookup_flags |= LOOKUP_CACHED; } @@ -1270,7 +1391,7 @@ struct file *filp_open(const char *filename, int flags, umode_t mode) { struct filename *name = getname_kernel(filename); struct file *file = ERR_CAST(name); - + if (!IS_ERR(name)) { file = file_open_name(name, flags, mode); putname(name); @@ -1291,36 +1412,25 @@ struct file *file_open_root(const struct path *root, } EXPORT_SYMBOL(file_open_root); -static long do_sys_openat2(int dfd, const char __user *filename, - struct open_how *how) +static int do_sys_openat2(int dfd, const char __user *filename, + struct open_how *how) { struct open_flags op; - int fd = build_open_flags(how, &op); - struct filename *tmp; + struct filename *tmp __free(putname) = NULL; + int err; - if (fd) - return fd; + err = build_open_flags(how, &op); + if (unlikely(err)) + return err; tmp = getname(filename); if (IS_ERR(tmp)) return PTR_ERR(tmp); - fd = get_unused_fd_flags(how->flags); - if (fd >= 0) { - struct file *f = do_filp_open(dfd, tmp, &op); - if (IS_ERR(f)) { - put_unused_fd(fd); - fd = PTR_ERR(f); - } else { - fsnotify_open(f); - fd_install(fd, f); - } - } - putname(tmp); - return fd; + return FD_ADD(how->flags, do_filp_open(dfd, tmp, &op)); } -long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) +int do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) { struct open_how how = build_open_how(flags, mode); return do_sys_openat2(dfd, filename, &how); @@ -1353,6 +1463,8 @@ SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename, if (unlikely(usize < OPEN_HOW_SIZE_VER0)) return -EINVAL; + if (unlikely(usize > PAGE_SIZE)) + return -E2BIG; err = copy_struct_from_user(&tmp, sizeof(tmp), how, usize); if (err) @@ -1407,12 +1519,13 @@ SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode) * "id" is the POSIX thread ID. We use the * files pointer for this.. */ -int filp_close(struct file *filp, fl_owner_t id) +static int filp_flush(struct file *filp, fl_owner_t id) { int retval = 0; - if (!file_count(filp)) { - printk(KERN_ERR "VFS: Close: file count is 0\n"); + if (CHECK_DATA_CORRUPTION(file_count(filp) == 0, filp, + "VFS: Close: file count is 0 (f_op=%ps)", + filp->f_op)) { return 0; } @@ -1423,10 +1536,18 @@ int filp_close(struct file *filp, fl_owner_t id) dnotify_flush(filp, id); locks_remove_posix(filp, id); } - fput(filp); return retval; } +int filp_close(struct file *filp, fl_owner_t id) +{ + int retval; + + retval = filp_flush(filp, id); + fput_close(filp); + + return retval; +} EXPORT_SYMBOL(filp_close); /* @@ -1436,35 +1557,34 @@ EXPORT_SYMBOL(filp_close); */ SYSCALL_DEFINE1(close, unsigned int, fd) { - int retval = close_fd(fd); + int retval; + struct file *file; + + file = file_close_fd(fd); + if (!file) + return -EBADF; + + retval = filp_flush(file, current->files); + + /* + * We're returning to user space. Don't bother + * with any delayed fput() cases. + */ + fput_close_sync(file); + + if (likely(retval == 0)) + return 0; /* can't restart close syscall because file table entry was cleared */ - if (unlikely(retval == -ERESTARTSYS || - retval == -ERESTARTNOINTR || - retval == -ERESTARTNOHAND || - retval == -ERESTART_RESTARTBLOCK)) + if (retval == -ERESTARTSYS || + retval == -ERESTARTNOINTR || + retval == -ERESTARTNOHAND || + retval == -ERESTART_RESTARTBLOCK) retval = -EINTR; return retval; } -/** - * close_range() - Close all file descriptors in a given range. - * - * @fd: starting file descriptor to close - * @max_fd: last file descriptor to close - * @flags: reserved for future extensions - * - * This closes a range of file descriptors. All file descriptors - * from @fd up to and including @max_fd are closed. - * Currently, errors to close a given file descriptor are ignored. - */ -SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, - unsigned int, flags) -{ - return __close_range(fd, max_fd, flags); -} - /* * This routine simulates a hangup on the tty, to arrange that users * are given clean terminals at login time. |
