diff options
Diffstat (limited to 'fs/open.c')
| -rw-r--r-- | fs/open.c | 1022 |
1 files changed, 729 insertions, 293 deletions
diff --git a/fs/open.c b/fs/open.c index 0285ce7dbd51..f328622061c5 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/open.c * @@ -28,14 +29,15 @@ #include <linux/audit.h> #include <linux/falloc.h> #include <linux/fs_struct.h> -#include <linux/ima.h> #include <linux/dnotify.h> #include <linux/compat.h> +#include <linux/mnt_idmapping.h> +#include <linux/filelock.h> #include "internal.h" -int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, - struct file *filp) +int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry, + loff_t length, unsigned int time_attrs, struct file *filp) { int ret; struct iattr newattrs; @@ -52,23 +54,27 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, } /* Remove suid, sgid, and file capabilities on truncate too */ - ret = dentry_needs_remove_privs(dentry); + ret = dentry_needs_remove_privs(idmap, dentry); if (ret < 0) return ret; if (ret) newattrs.ia_valid |= ret | ATTR_FORCE; - inode_lock(dentry->d_inode); + ret = inode_lock_killable(dentry->d_inode); + if (ret) + return ret; + /* Note any delegations or leases have already been broken: */ - ret = notify_change(dentry, &newattrs, NULL); + ret = notify_change(idmap, dentry, &newattrs, NULL); inode_unlock(dentry->d_inode); return ret; } -long vfs_truncate(const struct path *path, loff_t length) +int vfs_truncate(const struct path *path, loff_t length) { + struct mnt_idmap *idmap; struct inode *inode; - long error; + int error; inode = path->dentry->d_inode; @@ -78,13 +84,18 @@ long vfs_truncate(const struct path *path, loff_t length) if (!S_ISREG(inode->i_mode)) return -EINVAL; - error = mnt_want_write(path->mnt); + idmap = mnt_idmap(path->mnt); + error = inode_permission(idmap, inode, MAY_WRITE); if (error) - goto out; + return error; - error = inode_permission(inode, MAY_WRITE); + error = fsnotify_truncate_perm(path, length); if (error) - goto mnt_drop_write_and_out; + return error; + + error = mnt_want_write(path->mnt); + if (error) + return error; error = -EPERM; if (IS_APPEND(inode)) @@ -102,22 +113,20 @@ long vfs_truncate(const struct path *path, loff_t length) if (error) goto put_write_and_out; - error = locks_verify_truncate(inode, NULL, length); - if (!error) - error = security_path_truncate(path); + error = security_path_truncate(path); if (!error) - error = do_truncate(path->dentry, length, 0, NULL); + error = do_truncate(idmap, path->dentry, length, 0, NULL); put_write_and_out: put_write_access(inode); mnt_drop_write_and_out: mnt_drop_write(path->mnt); -out: + return error; } EXPORT_SYMBOL_GPL(vfs_truncate); -long do_sys_truncate(const char __user *pathname, loff_t length) +int do_sys_truncate(const char __user *pathname, loff_t length) { unsigned int lookup_flags = LOOKUP_FOLLOW; struct path path; @@ -151,61 +160,60 @@ COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length } #endif -long do_sys_ftruncate(unsigned int fd, loff_t length, int small) +int do_ftruncate(struct file *file, loff_t length, int small) { struct inode *inode; struct dentry *dentry; - struct fd f; int error; - error = -EINVAL; - if (length < 0) - goto out; - error = -EBADF; - f = fdget(fd); - if (!f.file) - goto out; - /* explicitly opened as large or we are on 64-bit box */ - if (f.file->f_flags & O_LARGEFILE) + if (file->f_flags & O_LARGEFILE) small = 0; - dentry = f.file->f_path.dentry; + dentry = file->f_path.dentry; inode = dentry->d_inode; - error = -EINVAL; - if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE)) - goto out_putf; + if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE)) + return -EINVAL; - error = -EINVAL; /* Cannot ftruncate over 2^31 bytes without large file support */ if (small && length > MAX_NON_LFS) - goto out_putf; + return -EINVAL; - error = -EPERM; /* Check IS_APPEND on real upper inode */ - if (IS_APPEND(file_inode(f.file))) - goto out_putf; + if (IS_APPEND(file_inode(file))) + return -EPERM; - sb_start_write(inode->i_sb); - error = locks_verify_truncate(inode, f.file, length); - if (!error) - error = security_path_truncate(&f.file->f_path); - if (!error) - error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file); - sb_end_write(inode->i_sb); -out_putf: - fdput(f); -out: - return error; + error = security_file_truncate(file); + if (error) + return error; + + error = fsnotify_truncate_perm(&file->f_path, length); + if (error) + return error; + + scoped_guard(super_write, inode->i_sb) + return do_truncate(file_mnt_idmap(file), dentry, length, + ATTR_MTIME | ATTR_CTIME, file); } -SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length) +int do_sys_ftruncate(unsigned int fd, loff_t length, int small) +{ + if (length < 0) + return -EINVAL; + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EBADF; + + return do_ftruncate(fd_file(f), length, small); +} + +SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length) { return do_sys_ftruncate(fd, length, 1); } #ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length) +COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_off_t, length) { return do_sys_ftruncate(fd, length, 1); } @@ -224,49 +232,65 @@ SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length) } #endif /* BITS_PER_LONG == 32 */ +#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_TRUNCATE64) +COMPAT_SYSCALL_DEFINE3(truncate64, const char __user *, pathname, + compat_arg_u64_dual(length)) +{ + return ksys_truncate(pathname, compat_arg_u64_glue(length)); +} +#endif + +#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FTRUNCATE64) +COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd, + compat_arg_u64_dual(length)) +{ + return ksys_ftruncate(fd, compat_arg_u64_glue(length)); +} +#endif int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - long ret; + int ret; + loff_t sum; if (offset < 0 || len <= 0) return -EINVAL; - /* Return error if mode is not supported */ - if (mode & ~FALLOC_FL_SUPPORTED_MASK) + if (mode & ~(FALLOC_FL_MODE_MASK | FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; - /* Punch hole and zero range are mutually exclusive */ - if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) == - (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) - return -EOPNOTSUPP; - - /* Punch hole must have keep size set */ - if ((mode & FALLOC_FL_PUNCH_HOLE) && - !(mode & FALLOC_FL_KEEP_SIZE)) + /* + * Modes are exclusive, even if that is not obvious from the encoding + * as bit masks and the mix with the flag in the same namespace. + * + * To make things even more complicated, FALLOC_FL_ALLOCATE_RANGE is + * encoded as no bit set. + */ + switch (mode & FALLOC_FL_MODE_MASK) { + case FALLOC_FL_ALLOCATE_RANGE: + case FALLOC_FL_UNSHARE_RANGE: + case FALLOC_FL_ZERO_RANGE: + break; + case FALLOC_FL_PUNCH_HOLE: + if (!(mode & FALLOC_FL_KEEP_SIZE)) + return -EOPNOTSUPP; + break; + case FALLOC_FL_COLLAPSE_RANGE: + case FALLOC_FL_INSERT_RANGE: + case FALLOC_FL_WRITE_ZEROES: + if (mode & FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + break; + default: return -EOPNOTSUPP; - - /* Collapse range should only be used exclusively. */ - if ((mode & FALLOC_FL_COLLAPSE_RANGE) && - (mode & ~FALLOC_FL_COLLAPSE_RANGE)) - return -EINVAL; - - /* Insert range should only be used exclusively. */ - if ((mode & FALLOC_FL_INSERT_RANGE) && - (mode & ~FALLOC_FL_INSERT_RANGE)) - return -EINVAL; - - /* Unshare range should only be used with allocate mode. */ - if ((mode & FALLOC_FL_UNSHARE_RANGE) && - (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE))) - return -EINVAL; + } if (!(file->f_mode & FMODE_WRITE)) return -EBADF; /* - * We can only allow pure fallocate on append only files + * On append-only files only space preallocation is supported. */ if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode)) return -EPERM; @@ -288,6 +312,10 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (ret) return ret; + ret = fsnotify_file_area_perm(file, MAY_WRITE, &offset, len); + if (ret) + return ret; + if (S_ISFIFO(inode->i_mode)) return -ESPIPE; @@ -297,8 +325,11 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) return -ENODEV; - /* Check for wrap through zero too */ - if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) + /* Check for wraparound */ + if (check_add_overflow(offset, len, &sum)) + return -EFBIG; + + if (sum > inode->i_sb->s_maxbytes) return -EFBIG; if (!file->f_op->fallocate) @@ -324,14 +355,12 @@ EXPORT_SYMBOL_GPL(vfs_fallocate); int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len) { - struct fd f = fdget(fd); - int error = -EBADF; + CLASS(fd, f)(fd); - if (f.file) { - error = vfs_fallocate(f.file, mode, offset, len); - fdput(f); - } - return error; + if (fd_empty(f)) + return -EBADF; + + return vfs_fallocate(fd_file(f), mode, offset, len); } SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) @@ -339,26 +368,63 @@ SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) return ksys_fallocate(fd, mode, offset, len); } +#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FALLOCATE) +COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, compat_arg_u64_dual(offset), + compat_arg_u64_dual(len)) +{ + return ksys_fallocate(fd, mode, compat_arg_u64_glue(offset), + compat_arg_u64_glue(len)); +} +#endif + /* * access() needs to use the real uid/gid, not the effective uid/gid. * We do this by temporarily clearing all FS-related capabilities and * switching the fsuid/fsgid around to the real ones. + * + * Creating new credentials is expensive, so we try to skip doing it, + * which we can if the result would match what we already got. */ -long do_faccessat(int dfd, const char __user *filename, int mode) +static bool access_need_override_creds(int flags) { - const struct cred *old_cred; - struct cred *override_cred; - struct path path; - struct inode *inode; - int res; - unsigned int lookup_flags = LOOKUP_FOLLOW; + const struct cred *cred; - if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ - return -EINVAL; + if (flags & AT_EACCESS) + return false; + + cred = current_cred(); + if (!uid_eq(cred->fsuid, cred->uid) || + !gid_eq(cred->fsgid, cred->gid)) + return true; + + if (!issecure(SECURE_NO_SETUID_FIXUP)) { + kuid_t root_uid = make_kuid(cred->user_ns, 0); + if (!uid_eq(cred->uid, root_uid)) { + if (!cap_isclear(cred->cap_effective)) + return true; + } else { + if (!cap_isidentical(cred->cap_effective, + cred->cap_permitted)) + return true; + } + } + + return false; +} + +static const struct cred *access_override_creds(void) +{ + struct cred *override_cred; override_cred = prepare_creds(); if (!override_cred) - return -ENOMEM; + return NULL; + + /* + * XXX access_need_override_creds performs checks in hopes of skipping + * this work. Make sure it stays in sync if making any changes in this + * routine. + */ override_cred->fsuid = override_cred->uid; override_cred->fsgid = override_cred->gid; @@ -373,7 +439,53 @@ long do_faccessat(int dfd, const char __user *filename, int mode) override_cred->cap_permitted; } - old_cred = override_creds(override_cred); + /* + * The new set of credentials can *only* be used in + * task-synchronous circumstances, and does not need + * RCU freeing, unless somebody then takes a separate + * reference to it. + * + * NOTE! This is _only_ true because this credential + * is used purely for override_creds() that installs + * it as the subjective cred. Other threads will be + * accessing ->real_cred, not the subjective cred. + * + * If somebody _does_ make a copy of this (using the + * 'get_current_cred()' function), that will clear the + * non_rcu field, because now that other user may be + * expecting RCU freeing. But normal thread-synchronous + * cred accesses will keep things non-racy to avoid RCU + * freeing. + */ + override_cred->non_rcu = 1; + return override_creds(override_cred); +} + +static int do_faccessat(int dfd, const char __user *filename, int mode, int flags) +{ + struct path path; + struct inode *inode; + int res; + unsigned int lookup_flags = LOOKUP_FOLLOW; + const struct cred *old_cred = NULL; + + if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ + return -EINVAL; + + if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) + return -EINVAL; + + if (flags & AT_SYMLINK_NOFOLLOW) + lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + + if (access_need_override_creds(flags)) { + old_cred = access_override_creds(); + if (!old_cred) + return -ENOMEM; + } + retry: res = user_path_at(dfd, filename, lookup_flags, &path); if (res) @@ -391,7 +503,7 @@ retry: goto out_path_release; } - res = inode_permission(inode, mode | MAY_ACCESS); + res = inode_permission(mnt_idmap(path.mnt), inode, mode | MAY_ACCESS); /* SuS v2 requires we report a read only fs too */ if (res || !(mode & S_IWOTH) || special_file(inode->i_mode)) goto out_path_release; @@ -415,22 +527,29 @@ out_path_release: goto retry; } out: - revert_creds(old_cred); - put_cred(override_cred); + if (old_cred) + put_cred(revert_creds(old_cred)); + return res; } SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) { - return do_faccessat(dfd, filename, mode); + return do_faccessat(dfd, filename, mode, 0); +} + +SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename, int, mode, + int, flags) +{ + return do_faccessat(dfd, filename, mode, flags); } SYSCALL_DEFINE2(access, const char __user *, filename, int, mode) { - return do_faccessat(AT_FDCWD, filename, mode); + return do_faccessat(AT_FDCWD, filename, mode, 0); } -int ksys_chdir(const char __user *filename) +SYSCALL_DEFINE1(chdir, const char __user *, filename) { struct path path; int error; @@ -440,7 +559,7 @@ retry: if (error) goto out; - error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); + error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; @@ -456,34 +575,24 @@ out: return error; } -SYSCALL_DEFINE1(chdir, const char __user *, filename) -{ - return ksys_chdir(filename); -} - SYSCALL_DEFINE1(fchdir, unsigned int, fd) { - struct fd f = fdget_raw(fd); + CLASS(fd_raw, f)(fd); int error; - error = -EBADF; - if (!f.file) - goto out; + if (fd_empty(f)) + return -EBADF; - error = -ENOTDIR; - if (!d_can_lookup(f.file->f_path.dentry)) - goto out_putf; + if (!d_can_lookup(fd_file(f)->f_path.dentry)) + return -ENOTDIR; - error = inode_permission(file_inode(f.file), MAY_EXEC | MAY_CHDIR); + error = file_permission(fd_file(f), MAY_EXEC | MAY_CHDIR); if (!error) - set_fs_pwd(current->fs, &f.file->f_path); -out_putf: - fdput(f); -out: + set_fs_pwd(current->fs, &fd_file(f)->f_path); return error; } -int ksys_chroot(const char __user *filename) +SYSCALL_DEFINE1(chroot, const char __user *, filename) { struct path path; int error; @@ -493,7 +602,7 @@ retry: if (error) goto out; - error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); + error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; @@ -516,15 +625,10 @@ out: return error; } -SYSCALL_DEFINE1(chroot, const char __user *, filename) -{ - return ksys_chroot(filename); -} - -static int chmod_common(const struct path *path, umode_t mode) +int chmod_common(const struct path *path, umode_t mode) { struct inode *inode = path->dentry->d_inode; - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; struct iattr newattrs; int error; @@ -532,47 +636,58 @@ static int chmod_common(const struct path *path, umode_t mode) if (error) return error; retry_deleg: - inode_lock(inode); + error = inode_lock_killable(inode); + if (error) + goto out_mnt_unlock; error = security_path_chmod(path, mode); if (error) goto out_unlock; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - error = notify_change(path->dentry, &newattrs, &delegated_inode); + error = notify_change(mnt_idmap(path->mnt), path->dentry, + &newattrs, &delegated_inode); out_unlock: inode_unlock(inode); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } +out_mnt_unlock: mnt_drop_write(path->mnt); return error; } -int ksys_fchmod(unsigned int fd, umode_t mode) +int vfs_fchmod(struct file *file, umode_t mode) { - struct fd f = fdget(fd); - int err = -EBADF; - - if (f.file) { - audit_file(f.file); - err = chmod_common(&f.file->f_path, mode); - fdput(f); - } - return err; + audit_file(file); + return chmod_common(&file->f_path, mode); } SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) { - return ksys_fchmod(fd, mode); + CLASS(fd, f)(fd); + + if (fd_empty(f)) + return -EBADF; + + return vfs_fchmod(fd_file(f), mode); } -int do_fchmodat(int dfd, const char __user *filename, umode_t mode) +static int do_fchmodat(int dfd, const char __user *filename, umode_t mode, + unsigned int flags) { struct path path; int error; - unsigned int lookup_flags = LOOKUP_FOLLOW; + unsigned int lookup_flags; + + if (unlikely(flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))) + return -EINVAL; + + lookup_flags = (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (!error) { @@ -586,21 +701,59 @@ retry: return error; } +SYSCALL_DEFINE4(fchmodat2, int, dfd, const char __user *, filename, + umode_t, mode, unsigned int, flags) +{ + return do_fchmodat(dfd, filename, mode, flags); +} + SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode) { - return do_fchmodat(dfd, filename, mode); + return do_fchmodat(dfd, filename, mode, 0); } SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) { - return do_fchmodat(AT_FDCWD, filename, mode); + return do_fchmodat(AT_FDCWD, filename, mode, 0); } -static int chown_common(const struct path *path, uid_t user, gid_t group) +/* + * Check whether @kuid is valid and if so generate and set vfsuid_t in + * ia_vfsuid. + * + * Return: true if @kuid is valid, false if not. + */ +static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid) { + if (!uid_valid(kuid)) + return false; + attr->ia_valid |= ATTR_UID; + attr->ia_vfsuid = VFSUIDT_INIT(kuid); + return true; +} + +/* + * Check whether @kgid is valid and if so generate and set vfsgid_t in + * ia_vfsgid. + * + * Return: true if @kgid is valid, false if not. + */ +static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid) +{ + if (!gid_valid(kgid)) + return false; + attr->ia_valid |= ATTR_GID; + attr->ia_vfsgid = VFSGIDT_INIT(kgid); + return true; +} + +int chown_common(const struct path *path, uid_t user, gid_t group) +{ + struct mnt_idmap *idmap; + struct user_namespace *fs_userns; struct inode *inode = path->dentry->d_inode; - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; int error; struct iattr newattrs; kuid_t uid; @@ -609,29 +762,33 @@ static int chown_common(const struct path *path, uid_t user, gid_t group) uid = make_kuid(current_user_ns(), user); gid = make_kgid(current_user_ns(), group); + idmap = mnt_idmap(path->mnt); + fs_userns = i_user_ns(inode); + retry_deleg: + newattrs.ia_vfsuid = INVALID_VFSUID; + newattrs.ia_vfsgid = INVALID_VFSGID; newattrs.ia_valid = ATTR_CTIME; - if (user != (uid_t) -1) { - if (!uid_valid(uid)) - return -EINVAL; - newattrs.ia_valid |= ATTR_UID; - newattrs.ia_uid = uid; - } - if (group != (gid_t) -1) { - if (!gid_valid(gid)) - return -EINVAL; - newattrs.ia_valid |= ATTR_GID; - newattrs.ia_gid = gid; - } + if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid)) + return -EINVAL; + if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid)) + return -EINVAL; + error = inode_lock_killable(inode); + if (error) + return error; if (!S_ISDIR(inode->i_mode)) - newattrs.ia_valid |= - ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; - inode_lock(inode); - error = security_path_chown(path, uid, gid); + newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV | + setattr_should_drop_sgid(idmap, inode); + /* Continue to send actual fs values, not the mount values. */ + error = security_path_chown( + path, + from_vfsuid(idmap, fs_userns, newattrs.ia_vfsuid), + from_vfsgid(idmap, fs_userns, newattrs.ia_vfsgid)); if (!error) - error = notify_change(path->dentry, &newattrs, &delegated_inode); + error = notify_change(idmap, path->dentry, &newattrs, + &delegated_inode); inode_unlock(inode); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; @@ -688,60 +845,84 @@ SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group AT_SYMLINK_NOFOLLOW); } -int ksys_fchown(unsigned int fd, uid_t user, gid_t group) +int vfs_fchown(struct file *file, uid_t user, gid_t group) { - struct fd f = fdget(fd); - int error = -EBADF; - - if (!f.file) - goto out; + int error; - error = mnt_want_write_file(f.file); + error = mnt_want_write_file(file); if (error) - goto out_fput; - audit_file(f.file); - error = chown_common(&f.file->f_path, user, group); - mnt_drop_write_file(f.file); -out_fput: - fdput(f); -out: + return error; + audit_file(file); + error = chown_common(&file->f_path, user, group); + mnt_drop_write_file(file); return error; } +int ksys_fchown(unsigned int fd, uid_t user, gid_t group) +{ + CLASS(fd, f)(fd); + + if (fd_empty(f)) + return -EBADF; + + return vfs_fchown(fd_file(f), user, group); +} + SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) { return ksys_fchown(fd, user, group); } +static inline int file_get_write_access(struct file *f) +{ + int error; + + error = get_write_access(f->f_inode); + if (unlikely(error)) + return error; + error = mnt_get_write_access(f->f_path.mnt); + if (unlikely(error)) + goto cleanup_inode; + if (unlikely(f->f_mode & FMODE_BACKING)) { + error = mnt_get_write_access(backing_file_user_path(f)->mnt); + if (unlikely(error)) + goto cleanup_mnt; + } + return 0; + +cleanup_mnt: + mnt_put_write_access(f->f_path.mnt); +cleanup_inode: + put_write_access(f->f_inode); + return error; +} + static int do_dentry_open(struct file *f, - struct inode *inode, int (*open)(struct inode *, struct file *)) { static const struct file_operations empty_fops = {}; + struct inode *inode = f->f_path.dentry->d_inode; int error; path_get(&f->f_path); f->f_inode = inode; f->f_mapping = inode->i_mapping; - - /* Ensure that we skip any errors that predate opening of the file */ f->f_wb_err = filemap_sample_wb_err(f->f_mapping); + f->f_sb_err = file_sample_sb_err(f); if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH | FMODE_OPENED; + file_set_fsnotify_mode(f, FMODE_NONOTIFY); f->f_op = &empty_fops; return 0; } - if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { - error = get_write_access(inode); + if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { + i_readcount_inc(inode); + } else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { + error = file_get_write_access(f); if (unlikely(error)) goto cleanup_file; - error = __mnt_want_write(f->f_path.mnt); - if (unlikely(error)) { - put_write_access(inode); - goto cleanup_file; - } f->f_mode |= FMODE_WRITER; } @@ -750,17 +931,27 @@ static int do_dentry_open(struct file *f, f->f_mode |= FMODE_ATOMIC_POS; f->f_op = fops_get(inode->i_fop); - if (unlikely(WARN_ON(!f->f_op))) { + if (WARN_ON(!f->f_op)) { error = -ENODEV; goto cleanup_all; } error = security_file_open(f); - if (error) + if (unlikely(error)) goto cleanup_all; - error = break_lease(locks_inode(f), f->f_flags); - if (error) + /* + * Call fsnotify open permission hook and set FMODE_NONOTIFY_* bits + * according to existing permission watches. + * If FMODE_NONOTIFY mode was already set for an fanotify fd or for a + * pseudo file, this call will not change the mode. + */ + error = fsnotify_open_perm_and_set_mode(f); + if (unlikely(error)) + goto cleanup_all; + + error = break_lease(file_inode(f), f->f_flags); + if (unlikely(error)) goto cleanup_all; /* normally all 3 are set; ->open() can clear them if needed */ @@ -773,39 +964,63 @@ static int do_dentry_open(struct file *f, goto cleanup_all; } f->f_mode |= FMODE_OPENED; - if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) - i_readcount_inc(inode); if ((f->f_mode & FMODE_READ) && likely(f->f_op->read || f->f_op->read_iter)) f->f_mode |= FMODE_CAN_READ; if ((f->f_mode & FMODE_WRITE) && likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; + if ((f->f_mode & FMODE_LSEEK) && !f->f_op->llseek) + f->f_mode &= ~FMODE_LSEEK; + if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO) + f->f_mode |= FMODE_CAN_ODIRECT; - f->f_write_hint = WRITE_LIFE_NOT_SET; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); + f->f_iocb_flags = iocb_flags(f); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); - /* NB: we're sure to have correct a_ops only after f_op->open */ - if (f->f_flags & O_DIRECT) { - if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) - return -EINVAL; + if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; + + /* + * XXX: Huge page cache doesn't support writing yet. Drop all page + * cache for this file before processing writes. + */ + if (f->f_mode & FMODE_WRITE) { + /* + * Depends on full fence from get_write_access() to synchronize + * against collapse_file() regarding i_writecount and nr_thps + * updates. Ensures subsequent insertion of THPs into the page + * cache will fail. + */ + if (filemap_nr_thps(inode->i_mapping)) { + struct address_space *mapping = inode->i_mapping; + + filemap_invalidate_lock(inode->i_mapping); + /* + * unmap_mapping_range just need to be called once + * here, because the private pages is not need to be + * unmapped mapping (e.g. data segment of dynamic + * shared libraries here). + */ + unmap_mapping_range(mapping, 0, 0, 0); + truncate_inode_pages(mapping, 0); + filemap_invalidate_unlock(inode->i_mapping); + } } + return 0; cleanup_all: if (WARN_ON_ONCE(error > 0)) error = -EINVAL; fops_put(f->f_op); - if (f->f_mode & FMODE_WRITER) { - put_write_access(inode); - __mnt_drop_write(f->f_path.mnt); - } + put_file_access(f); cleanup_file: path_put(&f->f_path); - f->f_path.mnt = NULL; - f->f_path.dentry = NULL; + f->__f_path.mnt = NULL; + f->__f_path.dentry = NULL; f->f_inode = NULL; return error; } @@ -815,7 +1030,6 @@ cleanup_file: * @file: file pointer * @dentry: pointer to dentry * @open: open callback - * @opened: state of open * * This can be used to finish opening a file passed to i_op->atomic_open(). * @@ -826,9 +1040,6 @@ cleanup_file: * the return value of d_splice_alias(), then the caller needs to perform dput() * on it after finish_open(). * - * On successful return @file is a fully instantiated open file. After this, if - * an error occurs in ->atomic_open(), it needs to clean up with fput(). - * * Returns zero on success or -errno if the open failed. */ int finish_open(struct file *file, struct dentry *dentry, @@ -836,8 +1047,8 @@ int finish_open(struct file *file, struct dentry *dentry, { BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */ - file->f_path.dentry = dentry; - return do_dentry_open(file, d_backing_inode(dentry), open); + file->__f_path.dentry = dentry; + return do_dentry_open(file, open); } EXPORT_SYMBOL(finish_open); @@ -845,19 +1056,21 @@ EXPORT_SYMBOL(finish_open); * finish_no_open - finish ->atomic_open() without opening the file * * @file: file pointer - * @dentry: dentry or NULL (as returned from ->lookup()) + * @dentry: dentry, ERR_PTR(-E...) or NULL (as returned from ->lookup()) * - * This can be used to set the result of a successful lookup in ->atomic_open(). + * This can be used to set the result of a lookup in ->atomic_open(). * * NB: unlike finish_open() this function does consume the dentry reference and * the caller need not dput() it. * - * Returns "0" which must be the return value of ->atomic_open() after having - * called this function. + * Returns 0 or -E..., which must be the return value of ->atomic_open() after + * having called this function. */ int finish_no_open(struct file *file, struct dentry *dentry) { - file->f_path.dentry = dentry; + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + file->__f_path.dentry = dentry; return 0; } EXPORT_SYMBOL(finish_no_open); @@ -872,12 +1085,22 @@ EXPORT_SYMBOL(file_path); * vfs_open - open the file at the given path * @path: path to open * @file: newly allocated file with f_flag initialized - * @cred: credentials to use */ int vfs_open(const struct path *path, struct file *file) { - file->f_path = *path; - return do_dentry_open(file, d_backing_inode(path->dentry), NULL); + int ret; + + file->__f_path = *path; + ret = do_dentry_open(file, NULL); + if (!ret) { + /* + * Once we return a file with FMODE_OPENED, __fput() will call + * fsnotify_close(), so we need fsnotify_open() here for + * symmetry. + */ + fsnotify_open(file); + } + return ret; } struct file *dentry_open(const struct path *path, int flags, @@ -886,8 +1109,6 @@ struct file *dentry_open(const struct path *path, int flags, int error; struct file *f; - validate_creds(cred); - /* We must always pass in a valid mount pointer. */ BUG_ON(!path->mnt); @@ -903,15 +1124,15 @@ struct file *dentry_open(const struct path *path, int flags, } EXPORT_SYMBOL(dentry_open); -struct file *open_with_fake_path(const struct path *path, int flags, - struct inode *inode, const struct cred *cred) +struct file *dentry_open_nonotify(const struct path *path, int flags, + const struct cred *cred) { - struct file *f = alloc_empty_file_noaccount(flags, cred); + struct file *f = alloc_empty_file(flags, cred); if (!IS_ERR(f)) { int error; - f->f_path = *path; - error = do_dentry_open(f, inode, NULL); + file_set_fsnotify_mode(f, FMODE_NONOTIFY); + error = vfs_open(path, f); if (error) { fput(f); f = ERR_PTR(error); @@ -919,50 +1140,172 @@ struct file *open_with_fake_path(const struct path *path, int flags, } return f; } -EXPORT_SYMBOL(open_with_fake_path); -static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) +/** + * dentry_create - Create and open a file + * @path: path to create + * @flags: O_ flags + * @mode: mode bits for new file + * @cred: credentials to use + * + * Caller must hold the parent directory's lock, and have prepared + * a negative dentry, placed in @path->dentry, for the new file. + * + * Caller sets @path->mnt to the vfsmount of the filesystem where + * the new file is to be created. The parent directory and the + * negative dentry must reside on the same filesystem instance. + * + * On success, returns a "struct file *". Otherwise a ERR_PTR + * is returned. + */ +struct file *dentry_create(const struct path *path, int flags, umode_t mode, + const struct cred *cred) { + struct file *f; + int error; + + f = alloc_empty_file(flags, cred); + if (IS_ERR(f)) + return f; + + error = vfs_create(mnt_idmap(path->mnt), path->dentry, mode, NULL); + if (!error) + error = vfs_open(path, f); + + if (unlikely(error)) { + fput(f); + return ERR_PTR(error); + } + return f; +} +EXPORT_SYMBOL(dentry_create); + +/** + * kernel_file_open - open a file for kernel internal use + * @path: path of the file to open + * @flags: open flags + * @cred: credentials for open + * + * Open a file for use by in-kernel consumers. The file is not accounted + * against nr_files and must not be installed into the file descriptor + * table. + * + * Return: Opened file on success, an error pointer on failure. + */ +struct file *kernel_file_open(const struct path *path, int flags, + const struct cred *cred) +{ + struct file *f; + int error; + + f = alloc_empty_file_noaccount(flags, cred); + if (IS_ERR(f)) + return f; + + error = vfs_open(path, f); + if (error) { + fput(f); + return ERR_PTR(error); + } + return f; +} +EXPORT_SYMBOL_GPL(kernel_file_open); + +#define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) +#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) + +inline struct open_how build_open_how(int flags, umode_t mode) +{ + struct open_how how = { + .flags = flags & VALID_OPEN_FLAGS, + .mode = mode & S_IALLUGO, + }; + + /* O_PATH beats everything else. */ + if (how.flags & O_PATH) + how.flags &= O_PATH_FLAGS; + /* Modes should only be set for create-like flags. */ + if (!WILL_CREATE(how.flags)) + how.mode = 0; + return how; +} + +inline int build_open_flags(const struct open_how *how, struct open_flags *op) +{ + u64 flags = how->flags; + u64 strip = O_CLOEXEC; int lookup_flags = 0; int acc_mode = ACC_MODE(flags); + BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS), + "struct open_flags doesn't yet handle flags > 32 bits"); + /* - * Clear out all open flags we don't know about so that we don't report - * them in fcntl(F_GETFD) or similar interfaces. + * Strip flags that aren't relevant in determining struct open_flags. */ - flags &= VALID_OPEN_FLAGS; + flags &= ~strip; - if (flags & (O_CREAT | __O_TMPFILE)) - op->mode = (mode & S_IALLUGO) | S_IFREG; - else - op->mode = 0; + /* + * Older syscalls implicitly clear all of the invalid flags or argument + * values before calling build_open_flags(), but openat2(2) checks all + * of its arguments. + */ + if (flags & ~VALID_OPEN_FLAGS) + return -EINVAL; + if (how->resolve & ~VALID_RESOLVE_FLAGS) + return -EINVAL; - /* Must never be set by userspace */ - flags &= ~FMODE_NONOTIFY & ~O_CLOEXEC; + /* Scoping flags are mutually exclusive. */ + if ((how->resolve & RESOLVE_BENEATH) && (how->resolve & RESOLVE_IN_ROOT)) + return -EINVAL; + + /* Deal with the mode. */ + if (WILL_CREATE(flags)) { + if (how->mode & ~S_IALLUGO) + return -EINVAL; + op->mode = how->mode | S_IFREG; + } else { + if (how->mode != 0) + return -EINVAL; + op->mode = 0; + } /* - * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only - * check for O_DSYNC if the need any syncing at all we enforce it's - * always set instead of having to deal with possibly weird behaviour - * for malicious applications setting only __O_SYNC. + * Block bugs where O_DIRECTORY | O_CREAT created regular files. + * Note, that blocking O_DIRECTORY | O_CREAT here also protects + * O_TMPFILE below which requires O_DIRECTORY being raised. */ - if (flags & __O_SYNC) - flags |= O_DSYNC; + if ((flags & (O_DIRECTORY | O_CREAT)) == (O_DIRECTORY | O_CREAT)) + return -EINVAL; + /* Now handle the creative implementation of O_TMPFILE. */ if (flags & __O_TMPFILE) { - if ((flags & O_TMPFILE_MASK) != O_TMPFILE) + /* + * In order to ensure programs get explicit errors when trying + * to use O_TMPFILE on old kernels we enforce that O_DIRECTORY + * is raised alongside __O_TMPFILE. + */ + if (!(flags & O_DIRECTORY)) return -EINVAL; if (!(acc_mode & MAY_WRITE)) return -EINVAL; - } else if (flags & O_PATH) { - /* - * If we have O_PATH in the open flag. Then we - * cannot have anything other than the below set of flags - */ - flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH; + } + if (flags & O_PATH) { + /* O_PATH only permits certain other flags to be set. */ + if (flags & ~O_PATH_FLAGS) + return -EINVAL; acc_mode = 0; } + /* + * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only + * check for O_DSYNC if the need any syncing at all we enforce it's + * always set instead of having to deal with possibly weird behaviour + * for malicious applications setting only __O_SYNC. + */ + if (flags & __O_SYNC) + flags |= O_DSYNC; + op->open_flag = flags; /* O_TRUNC implies we need access checks for write permissions */ @@ -980,14 +1323,34 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o if (flags & O_CREAT) { op->intent |= LOOKUP_CREATE; - if (flags & O_EXCL) + if (flags & O_EXCL) { op->intent |= LOOKUP_EXCL; + flags |= O_NOFOLLOW; + } } if (flags & O_DIRECTORY) lookup_flags |= LOOKUP_DIRECTORY; if (!(flags & O_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; + + if (how->resolve & RESOLVE_NO_XDEV) + lookup_flags |= LOOKUP_NO_XDEV; + if (how->resolve & RESOLVE_NO_MAGICLINKS) + lookup_flags |= LOOKUP_NO_MAGICLINKS; + if (how->resolve & RESOLVE_NO_SYMLINKS) + lookup_flags |= LOOKUP_NO_SYMLINKS; + if (how->resolve & RESOLVE_BENEATH) + lookup_flags |= LOOKUP_BENEATH; + if (how->resolve & RESOLVE_IN_ROOT) + lookup_flags |= LOOKUP_IN_ROOT; + if (how->resolve & RESOLVE_CACHED) { + /* Don't bother even trying for create/truncate/tmpfile open */ + if (flags & (O_TRUNC | O_CREAT | __O_TMPFILE)) + return -EAGAIN; + lookup_flags |= LOOKUP_CACHED; + } + op->lookup_flags = lookup_flags; return 0; } @@ -1006,8 +1369,11 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o struct file *file_open_name(struct filename *name, int flags, umode_t mode) { struct open_flags op; - int err = build_open_flags(flags, mode, &op); - return err ? ERR_PTR(err) : do_filp_open(AT_FDCWD, name, &op); + struct open_how how = build_open_how(flags, mode); + int err = build_open_flags(&how, &op); + if (err) + return ERR_PTR(err); + return do_filp_open(AT_FDCWD, name, &op); } /** @@ -1025,7 +1391,7 @@ struct file *filp_open(const char *filename, int flags, umode_t mode) { struct filename *name = getname_kernel(filename); struct file *file = ERR_CAST(name); - + if (!IS_ERR(name)) { file = file_open_name(name, flags, mode); putname(name); @@ -1034,50 +1400,47 @@ struct file *filp_open(const char *filename, int flags, umode_t mode) } EXPORT_SYMBOL(filp_open); -struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, +struct file *file_open_root(const struct path *root, const char *filename, int flags, umode_t mode) { struct open_flags op; - int err = build_open_flags(flags, mode, &op); + struct open_how how = build_open_how(flags, mode); + int err = build_open_flags(&how, &op); if (err) return ERR_PTR(err); - return do_file_open_root(dentry, mnt, filename, &op); + return do_file_open_root(root, filename, &op); } EXPORT_SYMBOL(file_open_root); -long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) +static int do_sys_openat2(int dfd, const char __user *filename, + struct open_how *how) { struct open_flags op; - int fd = build_open_flags(flags, mode, &op); - struct filename *tmp; + struct filename *tmp __free(putname) = NULL; + int err; - if (fd) - return fd; + err = build_open_flags(how, &op); + if (unlikely(err)) + return err; tmp = getname(filename); if (IS_ERR(tmp)) return PTR_ERR(tmp); - fd = get_unused_fd_flags(flags); - if (fd >= 0) { - struct file *f = do_filp_open(dfd, tmp, &op); - if (IS_ERR(f)) { - put_unused_fd(fd); - fd = PTR_ERR(f); - } else { - fsnotify_open(f); - fd_install(fd, f); - } - } - putname(tmp); - return fd; + return FD_ADD(how->flags, do_filp_open(dfd, tmp, &op)); } +int do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) +{ + struct open_how how = build_open_how(flags, mode); + return do_sys_openat2(dfd, filename, &how); +} + + SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { if (force_o_largefile()) flags |= O_LARGEFILE; - return do_sys_open(AT_FDCWD, filename, flags, mode); } @@ -1086,10 +1449,36 @@ SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, { if (force_o_largefile()) flags |= O_LARGEFILE; - return do_sys_open(dfd, filename, flags, mode); } +SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename, + struct open_how __user *, how, size_t, usize) +{ + int err; + struct open_how tmp; + + BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0); + BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST); + + if (unlikely(usize < OPEN_HOW_SIZE_VER0)) + return -EINVAL; + if (unlikely(usize > PAGE_SIZE)) + return -E2BIG; + + err = copy_struct_from_user(&tmp, sizeof(tmp), how, usize); + if (err) + return err; + + audit_openat2_how(&tmp); + + /* O_LARGEFILE is only allowed for non-O_PATH. */ + if (!(tmp.flags & O_PATH) && force_o_largefile()) + tmp.flags |= O_LARGEFILE; + + return do_sys_openat2(dfd, filename, &tmp); +} + #ifdef CONFIG_COMPAT /* * Exactly like sys_open(), except that it doesn't set the @@ -1118,21 +1507,25 @@ COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, fla */ SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode) { - return ksys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode); -} + int flags = O_CREAT | O_WRONLY | O_TRUNC; + if (force_o_largefile()) + flags |= O_LARGEFILE; + return do_sys_open(AT_FDCWD, pathname, flags, mode); +} #endif /* * "id" is the POSIX thread ID. We use the * files pointer for this.. */ -int filp_close(struct file *filp, fl_owner_t id) +static int filp_flush(struct file *filp, fl_owner_t id) { int retval = 0; - if (!file_count(filp)) { - printk(KERN_ERR "VFS: Close: file count is 0\n"); + if (CHECK_DATA_CORRUPTION(file_count(filp) == 0, filp, + "VFS: Close: file count is 0 (f_op=%ps)", + filp->f_op)) { return 0; } @@ -1143,10 +1536,18 @@ int filp_close(struct file *filp, fl_owner_t id) dnotify_flush(filp, id); locks_remove_posix(filp, id); } - fput(filp); return retval; } +int filp_close(struct file *filp, fl_owner_t id) +{ + int retval; + + retval = filp_flush(filp, id); + fput_close(filp); + + return retval; +} EXPORT_SYMBOL(filp_close); /* @@ -1156,13 +1557,29 @@ EXPORT_SYMBOL(filp_close); */ SYSCALL_DEFINE1(close, unsigned int, fd) { - int retval = __close_fd(current->files, fd); + int retval; + struct file *file; + + file = file_close_fd(fd); + if (!file) + return -EBADF; + + retval = filp_flush(file, current->files); + + /* + * We're returning to user space. Don't bother + * with any delayed fput() cases. + */ + fput_close_sync(file); + + if (likely(retval == 0)) + return 0; /* can't restart close syscall because file table entry was cleared */ - if (unlikely(retval == -ERESTARTSYS || - retval == -ERESTARTNOINTR || - retval == -ERESTARTNOHAND || - retval == -ERESTART_RESTARTBLOCK)) + if (retval == -ERESTARTSYS || + retval == -ERESTARTNOINTR || + retval == -ERESTARTNOHAND || + retval == -ERESTART_RESTARTBLOCK) retval = -EINTR; return retval; @@ -1209,3 +1626,22 @@ int nonseekable_open(struct inode *inode, struct file *filp) } EXPORT_SYMBOL(nonseekable_open); + +/* + * stream_open is used by subsystems that want stream-like file descriptors. + * Such file descriptors are not seekable and don't have notion of position + * (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL). + * Contrary to file descriptors of other regular files, .read() and .write() + * can run simultaneously. + * + * stream_open never fails and is marked to return int so that it could be + * directly used as file_operations.open . + */ +int stream_open(struct inode *inode, struct file *filp) +{ + filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS); + filp->f_mode |= FMODE_STREAM; + return 0; +} + +EXPORT_SYMBOL(stream_open); |
