diff options
Diffstat (limited to 'fs/fcntl.c')
| -rw-r--r-- | fs/fcntl.c | 400 |
1 files changed, 271 insertions, 129 deletions
diff --git a/fs/fcntl.c b/fs/fcntl.c index 146c9ab0cd4b..f93dbca08435 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -10,8 +10,8 @@ #include <linux/mm.h> #include <linux/sched/task.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/capability.h> #include <linux/dnotify.h> #include <linux/slab.h> @@ -26,14 +26,17 @@ #include <linux/memfd.h> #include <linux/compat.h> #include <linux/mount.h> +#include <linux/rw_hint.h> #include <linux/poll.h> #include <asm/siginfo.h> #include <linux/uaccess.h> +#include "internal.h" + #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) -static int setfl(int fd, struct file * filp, unsigned long arg) +static int setfl(int fd, struct file * filp, unsigned int arg) { struct inode * inode = file_inode(filp); int error = 0; @@ -47,7 +50,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg) /* O_NOATIME can only be set by the owner or superuser */ if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME)) - if (!inode_owner_or_capable(file_mnt_user_ns(filp), inode)) + if (!inode_owner_or_capable(file_mnt_idmap(filp), inode)) return -EPERM; /* required for strict SunOS emulation */ @@ -85,37 +88,75 @@ static int setfl(int fd, struct file * filp, unsigned long arg) return error; } -static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, - int force) +/* + * Allocate an file->f_owner struct if it doesn't exist, handling racing + * allocations correctly. + */ +int file_f_owner_allocate(struct file *file) { - write_lock_irq(&filp->f_owner.lock); - if (force || !filp->f_owner.pid) { - put_pid(filp->f_owner.pid); - filp->f_owner.pid = get_pid(pid); - filp->f_owner.pid_type = type; + struct fown_struct *f_owner; - if (pid) { - const struct cred *cred = current_cred(); - filp->f_owner.uid = cred->uid; - filp->f_owner.euid = cred->euid; - } + f_owner = file_f_owner(file); + if (f_owner) + return 0; + + f_owner = kzalloc(sizeof(struct fown_struct), GFP_KERNEL); + if (!f_owner) + return -ENOMEM; + + rwlock_init(&f_owner->lock); + f_owner->file = file; + /* If someone else raced us, drop our allocation. */ + if (unlikely(cmpxchg(&file->f_owner, NULL, f_owner))) + kfree(f_owner); + return 0; +} +EXPORT_SYMBOL(file_f_owner_allocate); + +void file_f_owner_release(struct file *file) +{ + struct fown_struct *f_owner; + + f_owner = file_f_owner(file); + if (f_owner) { + put_pid(f_owner->pid); + kfree(f_owner); } - write_unlock_irq(&filp->f_owner.lock); } void __f_setown(struct file *filp, struct pid *pid, enum pid_type type, int force) { - security_file_set_fowner(filp); - f_modown(filp, pid, type, force); + struct fown_struct *f_owner; + + f_owner = file_f_owner(filp); + if (WARN_ON_ONCE(!f_owner)) + return; + + write_lock_irq(&f_owner->lock); + if (force || !f_owner->pid) { + put_pid(f_owner->pid); + f_owner->pid = get_pid(pid); + f_owner->pid_type = type; + + if (pid) { + const struct cred *cred = current_cred(); + security_file_set_fowner(filp); + f_owner->uid = cred->uid; + f_owner->euid = cred->euid; + } + } + write_unlock_irq(&f_owner->lock); } EXPORT_SYMBOL(__f_setown); -int f_setown(struct file *filp, unsigned long arg, int force) +int f_setown(struct file *filp, int who, int force) { enum pid_type type; struct pid *pid = NULL; - int who = arg, ret = 0; + int ret = 0; + + might_sleep(); type = PIDTYPE_TGID; if (who < 0) { @@ -127,6 +168,10 @@ int f_setown(struct file *filp, unsigned long arg, int force) who = -who; } + ret = file_f_owner_allocate(filp); + if (ret) + return ret; + rcu_read_lock(); if (who) { pid = find_vpid(who); @@ -144,22 +189,27 @@ EXPORT_SYMBOL(f_setown); void f_delown(struct file *filp) { - f_modown(filp, NULL, PIDTYPE_TGID, 1); + __f_setown(filp, NULL, PIDTYPE_TGID, 1); } pid_t f_getown(struct file *filp) { pid_t pid = 0; + struct fown_struct *f_owner; - read_lock_irq(&filp->f_owner.lock); + f_owner = file_f_owner(filp); + if (!f_owner) + return pid; + + read_lock_irq(&f_owner->lock); rcu_read_lock(); - if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) { - pid = pid_vnr(filp->f_owner.pid); - if (filp->f_owner.pid_type == PIDTYPE_PGID) + if (pid_task(f_owner->pid, f_owner->pid_type)) { + pid = pid_vnr(f_owner->pid); + if (f_owner->pid_type == PIDTYPE_PGID) pid = -pid; } rcu_read_unlock(); - read_unlock_irq(&filp->f_owner.lock); + read_unlock_irq(&f_owner->lock); return pid; } @@ -192,6 +242,10 @@ static int f_setown_ex(struct file *filp, unsigned long arg) return -EINVAL; } + ret = file_f_owner_allocate(filp); + if (ret) + return ret; + rcu_read_lock(); pid = find_vpid(owner.pid); if (owner.pid && !pid) @@ -208,13 +262,20 @@ static int f_getown_ex(struct file *filp, unsigned long arg) struct f_owner_ex __user *owner_p = (void __user *)arg; struct f_owner_ex owner = {}; int ret = 0; + struct fown_struct *f_owner; + enum pid_type pid_type = PIDTYPE_PID; - read_lock_irq(&filp->f_owner.lock); - rcu_read_lock(); - if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) - owner.pid = pid_vnr(filp->f_owner.pid); - rcu_read_unlock(); - switch (filp->f_owner.pid_type) { + f_owner = file_f_owner(filp); + if (f_owner) { + read_lock_irq(&f_owner->lock); + rcu_read_lock(); + if (pid_task(f_owner->pid, f_owner->pid_type)) + owner.pid = pid_vnr(f_owner->pid); + rcu_read_unlock(); + pid_type = f_owner->pid_type; + } + + switch (pid_type) { case PIDTYPE_PID: owner.type = F_OWNER_TID; break; @@ -232,7 +293,8 @@ static int f_getown_ex(struct file *filp, unsigned long arg) ret = -EINVAL; break; } - read_unlock_irq(&filp->f_owner.lock); + if (f_owner) + read_unlock_irq(&f_owner->lock); if (!ret) { ret = copy_to_user(owner_p, &owner, sizeof(owner)); @@ -246,14 +308,18 @@ static int f_getown_ex(struct file *filp, unsigned long arg) static int f_getowner_uids(struct file *filp, unsigned long arg) { struct user_namespace *user_ns = current_user_ns(); + struct fown_struct *f_owner; uid_t __user *dst = (void __user *)arg; - uid_t src[2]; + uid_t src[2] = {0, 0}; int err; - read_lock_irq(&filp->f_owner.lock); - src[0] = from_kuid(user_ns, filp->f_owner.uid); - src[1] = from_kuid(user_ns, filp->f_owner.euid); - read_unlock_irq(&filp->f_owner.lock); + f_owner = file_f_owner(filp); + if (f_owner) { + read_lock_irq(&f_owner->lock); + src[0] = from_kuid(user_ns, f_owner->uid); + src[1] = from_kuid(user_ns, f_owner->euid); + read_unlock_irq(&f_owner->lock); + } err = put_user(src[0], &dst[0]); err |= put_user(src[1], &dst[1]); @@ -267,8 +333,15 @@ static int f_getowner_uids(struct file *filp, unsigned long arg) } #endif -static bool rw_hint_valid(enum rw_hint hint) +static bool rw_hint_valid(u64 hint) { + BUILD_BUG_ON(WRITE_LIFE_NOT_SET != RWH_WRITE_LIFE_NOT_SET); + BUILD_BUG_ON(WRITE_LIFE_NONE != RWH_WRITE_LIFE_NONE); + BUILD_BUG_ON(WRITE_LIFE_SHORT != RWH_WRITE_LIFE_SHORT); + BUILD_BUG_ON(WRITE_LIFE_MEDIUM != RWH_WRITE_LIFE_MEDIUM); + BUILD_BUG_ON(WRITE_LIFE_LONG != RWH_WRITE_LIFE_LONG); + BUILD_BUG_ON(WRITE_LIFE_EXTREME != RWH_WRITE_LIFE_EXTREME); + switch (hint) { case RWH_WRITE_LIFE_NOT_SET: case RWH_WRITE_LIFE_NONE: @@ -282,62 +355,126 @@ static bool rw_hint_valid(enum rw_hint hint) } } -static long fcntl_rw_hint(struct file *file, unsigned int cmd, - unsigned long arg) +static long fcntl_get_rw_hint(struct file *file, unsigned long arg) { struct inode *inode = file_inode(file); u64 __user *argp = (u64 __user *)arg; - enum rw_hint hint; - u64 h; + u64 hint = READ_ONCE(inode->i_write_hint); - switch (cmd) { - case F_GET_RW_HINT: - h = inode->i_write_hint; - if (copy_to_user(argp, &h, sizeof(*argp))) - return -EFAULT; - return 0; - case F_SET_RW_HINT: - if (copy_from_user(&h, argp, sizeof(h))) - return -EFAULT; - hint = (enum rw_hint) h; - if (!rw_hint_valid(hint)) - return -EINVAL; + if (copy_to_user(argp, &hint, sizeof(*argp))) + return -EFAULT; + return 0; +} - inode_lock(inode); - inode->i_write_hint = hint; - inode_unlock(inode); - return 0; - default: +static long fcntl_set_rw_hint(struct file *file, unsigned long arg) +{ + struct inode *inode = file_inode(file); + u64 __user *argp = (u64 __user *)arg; + u64 hint; + + if (!inode_owner_or_capable(file_mnt_idmap(file), inode)) + return -EPERM; + + if (copy_from_user(&hint, argp, sizeof(hint))) + return -EFAULT; + if (!rw_hint_valid(hint)) return -EINVAL; + + WRITE_ONCE(inode->i_write_hint, hint); + + /* + * file->f_mapping->host may differ from inode. As an example, + * blkdev_open() modifies file->f_mapping. + */ + if (file->f_mapping->host != inode) + WRITE_ONCE(file->f_mapping->host->i_write_hint, hint); + + return 0; +} + +/* Is the file descriptor a dup of the file? */ +static long f_dupfd_query(int fd, struct file *filp) +{ + CLASS(fd_raw, f)(fd); + + if (fd_empty(f)) + return -EBADF; + + /* + * We can do the 'fdput()' immediately, as the only thing that + * matters is the pointer value which isn't changed by the fdput. + * + * Technically we didn't need a ref at all, and 'fdget()' was + * overkill, but given our lockless file pointer lookup, the + * alternatives are complicated. + */ + return fd_file(f) == filp; +} + +/* Let the caller figure out whether a given file was just created. */ +static long f_created_query(const struct file *filp) +{ + return !!(filp->f_mode & FMODE_CREATED); +} + +static int f_owner_sig(struct file *filp, int signum, bool setsig) +{ + int ret = 0; + struct fown_struct *f_owner; + + might_sleep(); + + if (setsig) { + if (!valid_signal(signum)) + return -EINVAL; + + ret = file_f_owner_allocate(filp); + if (ret) + return ret; } + + f_owner = file_f_owner(filp); + if (setsig) + f_owner->signum = signum; + else if (f_owner) + ret = f_owner->signum; + return ret; } static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { void __user *argp = (void __user *)arg; + struct delegation deleg; + int argi = (int)arg; struct flock flock; long err = -EINVAL; switch (cmd) { + case F_CREATED_QUERY: + err = f_created_query(filp); + break; case F_DUPFD: - err = f_dupfd(arg, filp, 0); + err = f_dupfd(argi, filp, 0); break; case F_DUPFD_CLOEXEC: - err = f_dupfd(arg, filp, O_CLOEXEC); + err = f_dupfd(argi, filp, O_CLOEXEC); + break; + case F_DUPFD_QUERY: + err = f_dupfd_query(argi, filp); break; case F_GETFD: err = get_close_on_exec(fd) ? FD_CLOEXEC : 0; break; case F_SETFD: err = 0; - set_close_on_exec(fd, arg & FD_CLOEXEC); + set_close_on_exec(fd, argi & FD_CLOEXEC); break; case F_GETFL: err = filp->f_flags; break; case F_SETFL: - err = setfl(fd, filp, arg); + err = setfl(fd, filp, argi); break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ @@ -374,7 +511,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, force_successful_syscall_return(); break; case F_SETOWN: - err = f_setown(filp, arg, 1); + err = f_setown(filp, argi, 1); break; case F_GETOWN_EX: err = f_getown_ex(filp, arg); @@ -386,36 +523,45 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, err = f_getowner_uids(filp, arg); break; case F_GETSIG: - err = filp->f_owner.signum; + err = f_owner_sig(filp, 0, false); break; case F_SETSIG: - /* arg == 0 restores default behaviour. */ - if (!valid_signal(arg)) { - break; - } - err = 0; - filp->f_owner.signum = arg; + err = f_owner_sig(filp, argi, true); break; case F_GETLEASE: err = fcntl_getlease(filp); break; case F_SETLEASE: - err = fcntl_setlease(fd, filp, arg); + err = fcntl_setlease(fd, filp, argi); break; case F_NOTIFY: - err = fcntl_dirnotify(fd, filp, arg); + err = fcntl_dirnotify(fd, filp, argi); break; case F_SETPIPE_SZ: case F_GETPIPE_SZ: - err = pipe_fcntl(filp, cmd, arg); + err = pipe_fcntl(filp, cmd, argi); break; case F_ADD_SEALS: case F_GET_SEALS: - err = memfd_fcntl(filp, cmd, arg); + err = memfd_fcntl(filp, cmd, argi); break; case F_GET_RW_HINT: + err = fcntl_get_rw_hint(filp, arg); + break; case F_SET_RW_HINT: - err = fcntl_rw_hint(filp, cmd, arg); + err = fcntl_set_rw_hint(filp, arg); + break; + case F_GETDELEG: + if (copy_from_user(&deleg, argp, sizeof(deleg))) + return -EFAULT; + err = fcntl_getdeleg(filp, &deleg); + if (!err && copy_to_user(argp, &deleg, sizeof(deleg))) + return -EFAULT; + break; + case F_SETDELEG: + if (copy_from_user(&deleg, argp, sizeof(deleg))) + return -EFAULT; + err = fcntl_setdeleg(fd, filp, &deleg); break; default: break; @@ -426,8 +572,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, static int check_fcntl_cmd(unsigned cmd) { switch (cmd) { + case F_CREATED_QUERY: case F_DUPFD: case F_DUPFD_CLOEXEC: + case F_DUPFD_QUERY: case F_GETFD: case F_SETFD: case F_GETFL: @@ -438,24 +586,21 @@ static int check_fcntl_cmd(unsigned cmd) SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { - struct fd f = fdget_raw(fd); - long err = -EBADF; + CLASS(fd_raw, f)(fd); + long err; - if (!f.file) - goto out; + if (fd_empty(f)) + return -EBADF; - if (unlikely(f.file->f_mode & FMODE_PATH)) { + if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) { if (!check_fcntl_cmd(cmd)) - goto out1; + return -EBADF; } - err = security_file_fcntl(f.file, cmd, arg); + err = security_file_fcntl(fd_file(f), cmd, arg); if (!err) - err = do_fcntl(fd, cmd, arg, f.file); + err = do_fcntl(fd, cmd, arg, fd_file(f)); -out1: - fdput(f); -out: return err; } @@ -464,21 +609,21 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { void __user *argp = (void __user *)arg; - struct fd f = fdget_raw(fd); + CLASS(fd_raw, f)(fd); struct flock64 flock; - long err = -EBADF; + long err; - if (!f.file) - goto out; + if (fd_empty(f)) + return -EBADF; - if (unlikely(f.file->f_mode & FMODE_PATH)) { + if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) { if (!check_fcntl_cmd(cmd)) - goto out1; + return -EBADF; } - err = security_file_fcntl(f.file, cmd, arg); + err = security_file_fcntl(fd_file(f), cmd, arg); if (err) - goto out1; + return err; switch (cmd) { case F_GETLK64: @@ -486,7 +631,7 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, err = -EFAULT; if (copy_from_user(&flock, argp, sizeof(flock))) break; - err = fcntl_getlk64(f.file, cmd, &flock); + err = fcntl_getlk64(fd_file(f), cmd, &flock); if (!err && copy_to_user(argp, &flock, sizeof(flock))) err = -EFAULT; break; @@ -497,15 +642,12 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, err = -EFAULT; if (copy_from_user(&flock, argp, sizeof(flock))) break; - err = fcntl_setlk64(fd, f.file, cmd, &flock); + err = fcntl_setlk64(fd, fd_file(f), cmd, &flock); break; default: - err = do_fcntl(fd, cmd, arg, f.file); + err = do_fcntl(fd, cmd, arg, fd_file(f)); break; } -out1: - fdput(f); -out: return err; } #endif @@ -601,28 +743,28 @@ static int fixup_compat_flock(struct flock *flock) static long do_compat_fcntl64(unsigned int fd, unsigned int cmd, compat_ulong_t arg) { - struct fd f = fdget_raw(fd); + CLASS(fd_raw, f)(fd); struct flock flock; - long err = -EBADF; + long err; - if (!f.file) - return err; + if (fd_empty(f)) + return -EBADF; - if (unlikely(f.file->f_mode & FMODE_PATH)) { + if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) { if (!check_fcntl_cmd(cmd)) - goto out_put; + return -EBADF; } - err = security_file_fcntl(f.file, cmd, arg); + err = security_file_fcntl(fd_file(f), cmd, arg); if (err) - goto out_put; + return err; switch (cmd) { case F_GETLK: err = get_compat_flock(&flock, compat_ptr(arg)); if (err) break; - err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock); + err = fcntl_getlk(fd_file(f), convert_fcntl_cmd(cmd), &flock); if (err) break; err = fixup_compat_flock(&flock); @@ -634,7 +776,7 @@ static long do_compat_fcntl64(unsigned int fd, unsigned int cmd, err = get_compat_flock64(&flock, compat_ptr(arg)); if (err) break; - err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock); + err = fcntl_getlk(fd_file(f), convert_fcntl_cmd(cmd), &flock); if (!err) err = put_compat_flock64(&flock, compat_ptr(arg)); break; @@ -643,7 +785,7 @@ static long do_compat_fcntl64(unsigned int fd, unsigned int cmd, err = get_compat_flock(&flock, compat_ptr(arg)); if (err) break; - err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock); + err = fcntl_setlk(fd, fd_file(f), convert_fcntl_cmd(cmd), &flock); break; case F_SETLK64: case F_SETLKW64: @@ -652,14 +794,12 @@ static long do_compat_fcntl64(unsigned int fd, unsigned int cmd, err = get_compat_flock64(&flock, compat_ptr(arg)); if (err) break; - err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock); + err = fcntl_setlk(fd, fd_file(f), convert_fcntl_cmd(cmd), &flock); break; default: - err = do_fcntl(fd, cmd, arg, f.file); + err = do_fcntl(fd, cmd, arg, fd_file(f)); break; } -out_put: - fdput(f); return err; } @@ -806,14 +946,19 @@ static void send_sigurg_to_task(struct task_struct *p, do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type); } -int send_sigurg(struct fown_struct *fown) +int send_sigurg(struct file *file) { + struct fown_struct *fown; struct task_struct *p; enum pid_type type; struct pid *pid; unsigned long flags; int ret = 0; + fown = file_f_owner(file); + if (!fown) + return 0; + read_lock_irqsave(&fown->lock, flags); type = fown->pid_type; @@ -842,13 +987,7 @@ int send_sigurg(struct fown_struct *fown) } static DEFINE_SPINLOCK(fasync_lock); -static struct kmem_cache *fasync_cache __read_mostly; - -static void fasync_free_rcu(struct rcu_head *head) -{ - kmem_cache_free(fasync_cache, - container_of(head, struct fasync_struct, fa_rcu)); -} +static struct kmem_cache *fasync_cache __ro_after_init; /* * Remove a fasync entry. If successfully removed, return @@ -875,7 +1014,7 @@ int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) write_unlock_irq(&fa->fa_lock); *fp = fa->fa_next; - call_rcu(&fa->fa_rcu, fasync_free_rcu); + kfree_rcu(fa, fa_rcu); filp->f_flags &= ~FASYNC; result = 1; break; @@ -995,13 +1134,16 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) } read_lock_irqsave(&fa->fa_lock, flags); if (fa->fa_file) { - fown = &fa->fa_file->f_owner; + fown = file_f_owner(fa->fa_file); + if (!fown) + goto next; /* Don't send SIGURG to processes which have not set a queued signum: SIGURG has its own default signalling mechanism. */ if (!(sig == SIGURG && fown->signum == 0)) send_sigio(fown, fa->fa_fd, band); } +next: read_unlock_irqrestore(&fa->fa_lock, flags); fa = rcu_dereference(fa->fa_next); } @@ -1027,10 +1169,10 @@ static int __init fcntl_init(void) * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY * is defined as O_NONBLOCK on some platforms and not on others. */ - BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != + BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) | - __FMODE_EXEC | __FMODE_NONOTIFY)); + __FMODE_EXEC)); fasync_cache = kmem_cache_create("fasync_cache", sizeof(struct fasync_struct), 0, |
