summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/char_dev.c2
-rw-r--r--fs/file.c153
-rw-r--r--fs/file_table.c49
-rw-r--r--fs/fs-writeback.c41
-rw-r--r--fs/gfs2/glock.c11
-rw-r--r--fs/init.c6
-rw-r--r--fs/inode.c8
-rw-r--r--fs/internal.h22
-rw-r--r--fs/namei.c31
-rw-r--r--fs/namespace.c40
-rw-r--r--fs/nfs/super.c2
-rw-r--r--fs/notify/dnotify/dnotify.c6
-rw-r--r--fs/open.c52
-rw-r--r--fs/overlayfs/super.c24
-rw-r--r--fs/pipe.c64
-rw-r--r--fs/proc/base.c2
-rw-r--r--fs/proc/fd.c11
-rw-r--r--fs/proc/nommu.c2
-rw-r--r--fs/proc/task_mmu.c4
-rw-r--r--fs/proc/task_nommu.c2
20 files changed, 358 insertions, 174 deletions
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 950b6919fb87..6ba032442b39 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -350,7 +350,7 @@ static struct kobject *cdev_get(struct cdev *p)
struct module *owner = p->owner;
struct kobject *kobj;
- if (owner && !try_module_get(owner))
+ if (!try_module_get(owner))
return NULL;
kobj = kobject_get_unless_zero(&p->kobj);
if (!kobj)
diff --git a/fs/file.c b/fs/file.c
index 3e4a4dfa38fc..5fb0b146e79e 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -604,6 +604,9 @@ void fd_install(unsigned int fd, struct file *file)
struct files_struct *files = current->files;
struct fdtable *fdt;
+ if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING)))
+ return;
+
rcu_read_lock_sched();
if (unlikely(files->resize_in_progress)) {
@@ -853,8 +856,104 @@ void do_close_on_exec(struct files_struct *files)
spin_unlock(&files->file_lock);
}
+static struct file *__get_file_rcu(struct file __rcu **f)
+{
+ struct file __rcu *file;
+ struct file __rcu *file_reloaded;
+ struct file __rcu *file_reloaded_cmp;
+
+ file = rcu_dereference_raw(*f);
+ if (!file)
+ return NULL;
+
+ if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
+ return ERR_PTR(-EAGAIN);
+
+ file_reloaded = rcu_dereference_raw(*f);
+
+ /*
+ * Ensure that all accesses have a dependency on the load from
+ * rcu_dereference_raw() above so we get correct ordering
+ * between reuse/allocation and the pointer check below.
+ */
+ file_reloaded_cmp = file_reloaded;
+ OPTIMIZER_HIDE_VAR(file_reloaded_cmp);
+
+ /*
+ * atomic_long_inc_not_zero() above provided a full memory
+ * barrier when we acquired a reference.
+ *
+ * This is paired with the write barrier from assigning to the
+ * __rcu protected file pointer so that if that pointer still
+ * matches the current file, we know we have successfully
+ * acquired a reference to the right file.
+ *
+ * If the pointers don't match the file has been reallocated by
+ * SLAB_TYPESAFE_BY_RCU.
+ */
+ if (file == file_reloaded_cmp)
+ return file_reloaded;
+
+ fput(file);
+ return ERR_PTR(-EAGAIN);
+}
+
+/**
+ * get_file_rcu - try go get a reference to a file under rcu
+ * @f: the file to get a reference on
+ *
+ * This function tries to get a reference on @f carefully verifying that
+ * @f hasn't been reused.
+ *
+ * This function should rarely have to be used and only by users who
+ * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
+ *
+ * Return: Returns @f with the reference count increased or NULL.
+ */
+struct file *get_file_rcu(struct file __rcu **f)
+{
+ for (;;) {
+ struct file __rcu *file;
+
+ file = __get_file_rcu(f);
+ if (unlikely(!file))
+ return NULL;
+
+ if (unlikely(IS_ERR(file)))
+ continue;
+
+ return file;
+ }
+}
+EXPORT_SYMBOL_GPL(get_file_rcu);
+
+/**
+ * get_file_active - try go get a reference to a file
+ * @f: the file to get a reference on
+ *
+ * In contast to get_file_rcu() the pointer itself isn't part of the
+ * reference counting.
+ *
+ * This function should rarely have to be used and only by users who
+ * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
+ *
+ * Return: Returns @f with the reference count increased or NULL.
+ */
+struct file *get_file_active(struct file **f)
+{
+ struct file __rcu *file;
+
+ rcu_read_lock();
+ file = __get_file_rcu(f);
+ rcu_read_unlock();
+ if (IS_ERR(file))
+ file = NULL;
+ return file;
+}
+EXPORT_SYMBOL_GPL(get_file_active);
+
static inline struct file *__fget_files_rcu(struct files_struct *files,
- unsigned int fd, fmode_t mask)
+ unsigned int fd, fmode_t mask)
{
for (;;) {
struct file *file;
@@ -865,12 +964,6 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
return NULL;
fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds);
- file = rcu_dereference_raw(*fdentry);
- if (unlikely(!file))
- return NULL;
-
- if (unlikely(file->f_mode & mask))
- return NULL;
/*
* Ok, we have a file pointer. However, because we do
@@ -879,10 +972,15 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
*
* Such a race can take two forms:
*
- * (a) the file ref already went down to zero,
- * and get_file_rcu() fails. Just try again:
+ * (a) the file ref already went down to zero and the
+ * file hasn't been reused yet or the file count
+ * isn't zero but the file has already been reused.
*/
- if (unlikely(!get_file_rcu(file)))
+ file = __get_file_rcu(fdentry);
+ if (unlikely(!file))
+ return NULL;
+
+ if (unlikely(IS_ERR(file)))
continue;
/*
@@ -893,13 +991,21 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
*
* If so, we need to put our ref and try again.
*/
- if (unlikely(rcu_dereference_raw(files->fdt) != fdt) ||
- unlikely(rcu_dereference_raw(*fdentry) != file)) {
+ if (unlikely(rcu_dereference_raw(files->fdt) != fdt)) {
fput(file);
continue;
}
/*
+ * This isn't the file we're looking for or we're not
+ * allowed to get a reference to it.
+ */
+ if (unlikely(file->f_mode & mask)) {
+ fput(file);
+ return NULL;
+ }
+
+ /*
* Ok, we have a ref to the file, and checked that it
* still exists.
*/
@@ -948,7 +1054,14 @@ struct file *fget_task(struct task_struct *task, unsigned int fd)
return file;
}
-struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
+struct file *lookup_fdget_rcu(unsigned int fd)
+{
+ return __fget_files_rcu(current->files, fd, 0);
+
+}
+EXPORT_SYMBOL_GPL(lookup_fdget_rcu);
+
+struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd)
{
/* Must be called with rcu_read_lock held */
struct files_struct *files;
@@ -957,13 +1070,13 @@ struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
task_lock(task);
files = task->files;
if (files)
- file = files_lookup_fd_rcu(files, fd);
+ file = __fget_files_rcu(files, fd, 0);
task_unlock(task);
return file;
}
-struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd)
+struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd)
{
/* Must be called with rcu_read_lock held */
struct files_struct *files;
@@ -974,7 +1087,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret
files = task->files;
if (files) {
for (; fd < files_fdtable(files)->max_fds; fd++) {
- file = files_lookup_fd_rcu(files, fd);
+ file = __fget_files_rcu(files, fd, 0);
if (file)
break;
}
@@ -983,7 +1096,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret
*ret_fd = fd;
return file;
}
-EXPORT_SYMBOL(task_lookup_next_fd_rcu);
+EXPORT_SYMBOL(task_lookup_next_fdget_rcu);
/*
* Lightweight file lookup - no refcnt increment if fd table isn't shared.
@@ -1272,12 +1385,16 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
{
if (unlikely(newfd == oldfd)) { /* corner case */
struct files_struct *files = current->files;
+ struct file *f;
int retval = oldfd;
rcu_read_lock();
- if (!files_lookup_fd_rcu(files, oldfd))
+ f = __fget_files_rcu(files, oldfd, 0);
+ if (!f)
retval = -EBADF;
rcu_read_unlock();
+ if (f)
+ fput(f);
return retval;
}
return ksys_dup3(oldfd, newfd, 0);
diff --git a/fs/file_table.c b/fs/file_table.c
index ee21b3da9d08..fa92743ba6a9 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -44,10 +44,10 @@ static struct kmem_cache *filp_cachep __read_mostly;
static struct percpu_counter nr_files __cacheline_aligned_in_smp;
-/* Container for backing file with optional real path */
+/* Container for backing file with optional user path */
struct backing_file {
struct file file;
- struct path real_path;
+ struct path user_path;
};
static inline struct backing_file *backing_file(struct file *f)
@@ -55,31 +55,36 @@ static inline struct backing_file *backing_file(struct file *f)
return container_of(f, struct backing_file, file);
}
-struct path *backing_file_real_path(struct file *f)
+struct path *backing_file_user_path(struct file *f)
{
- return &backing_file(f)->real_path;
+ return &backing_file(f)->user_path;
}
-EXPORT_SYMBOL_GPL(backing_file_real_path);
+EXPORT_SYMBOL_GPL(backing_file_user_path);
-static void file_free_rcu(struct rcu_head *head)
+static inline void file_free(struct file *f)
{
- struct file *f = container_of(head, struct file, f_rcuhead);
-
+ security_file_free(f);
+ if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
+ percpu_counter_dec(&nr_files);
put_cred(f->f_cred);
- if (unlikely(f->f_mode & FMODE_BACKING))
+ if (unlikely(f->f_mode & FMODE_BACKING)) {
+ path_put(backing_file_user_path(f));
kfree(backing_file(f));
- else
+ } else {
kmem_cache_free(filp_cachep, f);
+ }
}
-static inline void file_free(struct file *f)
+void release_empty_file(struct file *f)
{
- security_file_free(f);
- if (unlikely(f->f_mode & FMODE_BACKING))
- path_put(backing_file_real_path(f));
- if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
- percpu_counter_dec(&nr_files);
- call_rcu(&f->f_rcuhead, file_free_rcu);
+ WARN_ON_ONCE(f->f_mode & (FMODE_BACKING | FMODE_OPENED));
+ if (atomic_long_dec_and_test(&f->f_count)) {
+ security_file_free(f);
+ put_cred(f->f_cred);
+ if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
+ percpu_counter_dec(&nr_files);
+ kmem_cache_free(filp_cachep, f);
+ }
}
/*
@@ -164,7 +169,6 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
return error;
}
- atomic_long_set(&f->f_count, 1);
rwlock_init(&f->f_owner.lock);
spin_lock_init(&f->f_lock);
mutex_init(&f->f_pos_lock);
@@ -172,6 +176,12 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
f->f_mode = OPEN_FMODE(flags);
/* f->f_version: 0 */
+ /*
+ * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
+ * fget-rcu pattern users need to be able to handle spurious
+ * refcount bumps we should reinitialize the reused file first.
+ */
+ atomic_long_set(&f->f_count, 1);
return 0;
}
@@ -471,7 +481,8 @@ EXPORT_SYMBOL(__fput_sync);
void __init files_init(void)
{
filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
- SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL);
+ SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN |
+ SLAB_PANIC | SLAB_ACCOUNT, NULL);
percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index c1af01b2c42d..1767493dffda 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -613,6 +613,24 @@ out_free:
kfree(isw);
}
+static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw,
+ struct list_head *list, int *nr)
+{
+ struct inode *inode;
+
+ list_for_each_entry(inode, list, i_io_list) {
+ if (!inode_prepare_wbs_switch(inode, isw->new_wb))
+ continue;
+
+ isw->inodes[*nr] = inode;
+ (*nr)++;
+
+ if (*nr >= WB_MAX_INODES_PER_ISW - 1)
+ return true;
+ }
+ return false;
+}
+
/**
* cleanup_offline_cgwb - detach associated inodes
* @wb: target wb
@@ -625,7 +643,6 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
{
struct cgroup_subsys_state *memcg_css;
struct inode_switch_wbs_context *isw;
- struct inode *inode;
int nr;
bool restart = false;
@@ -647,17 +664,17 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
nr = 0;
spin_lock(&wb->list_lock);
- list_for_each_entry(inode, &wb->b_attached, i_io_list) {
- if (!inode_prepare_wbs_switch(inode, isw->new_wb))
- continue;
-
- isw->inodes[nr++] = inode;
-
- if (nr >= WB_MAX_INODES_PER_ISW - 1) {
- restart = true;
- break;
- }
- }
+ /*
+ * In addition to the inodes that have completed writeback, also switch
+ * cgwbs for those inodes only with dirty timestamps. Otherwise, those
+ * inodes won't be written back for a long time when lazytime is
+ * enabled, and thus pinning the dying cgwbs. It won't break the
+ * bandwidth restrictions, as writeback of inode metadata is not
+ * accounted for.
+ */
+ restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr);
+ if (!restart)
+ restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr);
spin_unlock(&wb->list_lock);
/* no attached inodes? bail out */
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 4a280be229a6..3772a5d9e85c 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -2719,16 +2719,19 @@ static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i)
for(;; i->fd++) {
struct inode *inode;
- i->file = task_lookup_next_fd_rcu(i->task, &i->fd);
+ i->file = task_lookup_next_fdget_rcu(i->task, &i->fd);
if (!i->file) {
i->fd = 0;
break;
}
+
inode = file_inode(i->file);
- if (inode->i_sb != i->sb)
- continue;
- if (get_file_rcu(i->file))
+ if (inode->i_sb == i->sb)
break;
+
+ rcu_read_unlock();
+ fput(i->file);
+ rcu_read_lock();
}
rcu_read_unlock();
return i->file;
diff --git a/fs/init.c b/fs/init.c
index 9684406a8416..e9387b6c4f30 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -153,8 +153,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev)
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- if (!IS_POSIXACL(path.dentry->d_inode))
- mode &= ~current_umask();
+ mode = mode_strip_umask(d_inode(path.dentry), mode);
error = security_path_mknod(&path, dentry, mode, dev);
if (!error)
error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode,
@@ -229,8 +228,7 @@ int __init init_mkdir(const char *pathname, umode_t mode)
dentry = kern_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- if (!IS_POSIXACL(path.dentry->d_inode))
- mode &= ~current_umask();
+ mode = mode_strip_umask(d_inode(path.dentry), mode);
error = security_path_mkdir(&path, dentry, mode);
if (!error)
error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
diff --git a/fs/inode.c b/fs/inode.c
index 84bc3c76e5cc..3bb6193f436c 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2006,7 +2006,7 @@ void touch_atime(const struct path *path)
if (!sb_start_write_trylock(inode->i_sb))
return;
- if (__mnt_want_write(mnt) != 0)
+ if (mnt_get_write_access(mnt) != 0)
goto skip_update;
/*
* File systems can error out when updating inodes if they need to
@@ -2018,7 +2018,7 @@ void touch_atime(const struct path *path)
* of the fs read only, e.g. subvolumes in Btrfs.
*/
inode_update_time(inode, S_ATIME);
- __mnt_drop_write(mnt);
+ mnt_put_write_access(mnt);
skip_update:
sb_end_write(inode->i_sb);
}
@@ -2131,9 +2131,9 @@ static int __file_update_time(struct file *file, int sync_mode)
struct inode *inode = file_inode(file);
/* try to update time settings */
- if (!__mnt_want_write_file(file)) {
+ if (!mnt_get_write_access_file(file)) {
ret = inode_update_time(inode, sync_mode);
- __mnt_drop_write_file(file);
+ mnt_put_write_access_file(file);
}
return ret;
diff --git a/fs/internal.h b/fs/internal.h
index d64ae03998cc..58e43341aebf 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -73,8 +73,8 @@ extern int sb_prepare_remount_readonly(struct super_block *);
extern void __init mnt_init(void);
-extern int __mnt_want_write_file(struct file *);
-extern void __mnt_drop_write_file(struct file *);
+int mnt_get_write_access_file(struct file *file);
+void mnt_put_write_access_file(struct file *file);
extern void dissolve_on_fput(struct vfsmount *);
extern bool may_mount(void);
@@ -94,14 +94,22 @@ extern void chroot_fs_refs(const struct path *, const struct path *);
struct file *alloc_empty_file(int flags, const struct cred *cred);
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred);
struct file *alloc_empty_backing_file(int flags, const struct cred *cred);
+void release_empty_file(struct file *f);
+
+static inline void file_put_write_access(struct file *file)
+{
+ put_write_access(file->f_inode);
+ mnt_put_write_access(file->f_path.mnt);
+ if (unlikely(file->f_mode & FMODE_BACKING))
+ mnt_put_write_access(backing_file_user_path(file)->mnt);
+}
static inline void put_file_access(struct file *file)
{
if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
i_readcount_dec(file->f_inode);
} else if (file->f_mode & FMODE_WRITER) {
- put_write_access(file->f_inode);
- __mnt_drop_write(file->f_path.mnt);
+ file_put_write_access(file);
}
}
@@ -130,9 +138,9 @@ static inline void sb_start_ro_state_change(struct super_block *sb)
* mnt_is_readonly() making sure if mnt_is_readonly() sees SB_RDONLY
* cleared, it will see s_readonly_remount set.
* For RW->RO transition, the barrier pairs with the barrier in
- * __mnt_want_write() before the mnt_is_readonly() check. The barrier
- * makes sure if __mnt_want_write() sees MNT_WRITE_HOLD already
- * cleared, it will see s_readonly_remount set.
+ * mnt_get_write_access() before the mnt_is_readonly() check.
+ * The barrier makes sure if mnt_get_write_access() sees MNT_WRITE_HOLD
+ * already cleared, it will see s_readonly_remount set.
*/
smp_wmb();
}
diff --git a/fs/namei.c b/fs/namei.c
index 94565bd7e73f..71c13b2990b4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3105,25 +3105,6 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
EXPORT_SYMBOL(unlock_rename);
/**
- * mode_strip_umask - handle vfs umask stripping
- * @dir: parent directory of the new inode
- * @mode: mode of the new inode to be created in @dir
- *
- * Umask stripping depends on whether or not the filesystem supports POSIX
- * ACLs. If the filesystem doesn't support it umask stripping is done directly
- * in here. If the filesystem does support POSIX ACLs umask stripping is
- * deferred until the filesystem calls posix_acl_create().
- *
- * Returns: mode
- */
-static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode)
-{
- if (!IS_POSIXACL(dir))
- mode &= ~current_umask();
- return mode;
-}
-
-/**
* vfs_prepare_mode - prepare the mode to be used for a new inode
* @idmap: idmap of the mount the inode was found from
* @dir: parent directory of the new inode
@@ -3536,7 +3517,8 @@ static const char *open_last_lookups(struct nameidata *nd,
if (likely(dentry))
goto finish_lookup;
- BUG_ON(nd->flags & LOOKUP_RCU);
+ if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
+ return ERR_PTR(-ECHILD);
} else {
/* create side of things */
if (nd->flags & LOOKUP_RCU) {
@@ -3803,7 +3785,10 @@ static struct file *path_openat(struct nameidata *nd,
WARN_ON(1);
error = -EINVAL;
}
- fput(file);
+ if (unlikely(file->f_mode & FMODE_OPENED))
+ fput(file);
+ else
+ release_empty_file(file);
if (error == -EOPENSTALE) {
if (flags & LOOKUP_RCU)
error = -ECHILD;
@@ -4387,11 +4372,9 @@ retry_deleg:
if (!IS_ERR(dentry)) {
/* Why not before? Because we want correct error value */
- if (last.name[last.len])
+ if (last.name[last.len] || d_is_negative(dentry))
goto slashes;
inode = dentry->d_inode;
- if (d_is_negative(dentry))
- goto slashes;
ihold(inode);
error = security_path_unlink(&path, dentry);
if (error)
diff --git a/fs/namespace.c b/fs/namespace.c
index e157efc54023..6bde71735efa 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -330,16 +330,16 @@ static int mnt_is_readonly(struct vfsmount *mnt)
* can determine when writes are able to occur to a filesystem.
*/
/**
- * __mnt_want_write - get write access to a mount without freeze protection
+ * mnt_get_write_access - get write access to a mount without freeze protection
* @m: the mount on which to take a write
*
* This tells the low-level filesystem that a write is about to be performed to
* it, and makes sure that writes are allowed (mnt it read-write) before
* returning success. This operation does not protect against filesystem being
- * frozen. When the write operation is finished, __mnt_drop_write() must be
+ * frozen. When the write operation is finished, mnt_put_write_access() must be
* called. This is effectively a refcount.
*/
-int __mnt_want_write(struct vfsmount *m)
+int mnt_get_write_access(struct vfsmount *m)
{
struct mount *mnt = real_mount(m);
int ret = 0;
@@ -386,6 +386,7 @@ int __mnt_want_write(struct vfsmount *m)
return ret;
}
+EXPORT_SYMBOL_GPL(mnt_get_write_access);
/**
* mnt_want_write - get write access to a mount
@@ -401,7 +402,7 @@ int mnt_want_write(struct vfsmount *m)
int ret;
sb_start_write(m->mnt_sb);
- ret = __mnt_want_write(m);
+ ret = mnt_get_write_access(m);
if (ret)
sb_end_write(m->mnt_sb);
return ret;
@@ -409,15 +410,15 @@ int mnt_want_write(struct vfsmount *m)
EXPORT_SYMBOL_GPL(mnt_want_write);
/**
- * __mnt_want_write_file - get write access to a file's mount
+ * mnt_get_write_access_file - get write access to a file's mount
* @file: the file who's mount on which to take a write
*
- * This is like __mnt_want_write, but if the file is already open for writing it
+ * This is like mnt_get_write_access, but if @file is already open for write it
* skips incrementing mnt_writers (since the open file already has a reference)
* and instead only does the check for emergency r/o remounts. This must be
- * paired with __mnt_drop_write_file.
+ * paired with mnt_put_write_access_file.
*/
-int __mnt_want_write_file(struct file *file)
+int mnt_get_write_access_file(struct file *file)
{
if (file->f_mode & FMODE_WRITER) {
/*
@@ -428,7 +429,7 @@ int __mnt_want_write_file(struct file *file)
return -EROFS;
return 0;
}
- return __mnt_want_write(file->f_path.mnt);
+ return mnt_get_write_access(file->f_path.mnt);
}
/**
@@ -445,7 +446,7 @@ int mnt_want_write_file(struct file *file)
int ret;
sb_start_write(file_inode(file)->i_sb);
- ret = __mnt_want_write_file(file);
+ ret = mnt_get_write_access_file(file);
if (ret)
sb_end_write(file_inode(file)->i_sb);
return ret;
@@ -453,19 +454,20 @@ int mnt_want_write_file(struct file *file)
EXPORT_SYMBOL_GPL(mnt_want_write_file);
/**
- * __mnt_drop_write - give up write access to a mount
+ * mnt_put_write_access - give up write access to a mount
* @mnt: the mount on which to give up write access
*
* Tells the low-level filesystem that we are done
* performing writes to it. Must be matched with
- * __mnt_want_write() call above.
+ * mnt_get_write_access() call above.
*/
-void __mnt_drop_write(struct vfsmount *mnt)
+void mnt_put_write_access(struct vfsmount *mnt)
{
preempt_disable();
mnt_dec_writers(real_mount(mnt));
preempt_enable();
}
+EXPORT_SYMBOL_GPL(mnt_put_write_access);
/**
* mnt_drop_write - give up write access to a mount
@@ -477,20 +479,20 @@ void __mnt_drop_write(struct vfsmount *mnt)
*/
void mnt_drop_write(struct vfsmount *mnt)
{
- __mnt_drop_write(mnt);
+ mnt_put_write_access(mnt);
sb_end_write(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);
-void __mnt_drop_write_file(struct file *file)
+void mnt_put_write_access_file(struct file *file)
{
if (!(file->f_mode & FMODE_WRITER))
- __mnt_drop_write(file->f_path.mnt);
+ mnt_put_write_access(file->f_path.mnt);
}
void mnt_drop_write_file(struct file *file)
{
- __mnt_drop_write_file(file);
+ mnt_put_write_access_file(file);
sb_end_write(file_inode(file)->i_sb);
}
EXPORT_SYMBOL(mnt_drop_write_file);
@@ -1344,9 +1346,9 @@ void mntput(struct vfsmount *mnt)
{
if (mnt) {
struct mount *m = real_mount(mnt);
- /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
+ /* avoid cacheline pingpong */
if (unlikely(m->mnt_expiry_mark))
- m->mnt_expiry_mark = 0;
+ WRITE_ONCE(m->mnt_expiry_mark, 0);
mntput_no_expire(m);
}
}
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0d6473cb00cb..9b1cfca8112a 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1071,7 +1071,7 @@ static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx)
sb->s_export_op = &nfs_export_ops;
break;
case 4:
- sb->s_flags |= SB_POSIXACL;
+ sb->s_iflags |= SB_I_NOUMASK;
sb->s_time_gran = 1;
sb->s_time_min = S64_MIN;
sb->s_time_max = S64_MAX;
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index ebdcc25df0f7..869b016014d2 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -265,7 +265,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
struct dnotify_struct *dn;
struct inode *inode;
fl_owner_t id = current->files;
- struct file *f;
+ struct file *f = NULL;
int destroy = 0, error = 0;
__u32 mask;
@@ -345,7 +345,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
}
rcu_read_lock();
- f = lookup_fd_rcu(fd);
+ f = lookup_fdget_rcu(fd);
rcu_read_unlock();
/* if (f != filp) means that we lost a race and another task/thread
@@ -392,6 +392,8 @@ out_err:
fsnotify_put_mark(new_fsn_mark);
if (dn)
kmem_cache_free(dnotify_struct_cache, dn);
+ if (f)
+ fput(f);
return error;
}
diff --git a/fs/open.c b/fs/open.c
index 98f6601fbac6..02dc608d40d8 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -870,6 +870,30 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
return ksys_fchown(fd, user, group);
}
+static inline int file_get_write_access(struct file *f)
+{
+ int error;
+
+ error = get_write_access(f->f_inode);
+ if (unlikely(error))
+ return error;
+ error = mnt_get_write_access(f->f_path.mnt);
+ if (unlikely(error))
+ goto cleanup_inode;
+ if (unlikely(f->f_mode & FMODE_BACKING)) {
+ error = mnt_get_write_access(backing_file_user_path(f)->mnt);
+ if (unlikely(error))
+ goto cleanup_mnt;
+ }
+ return 0;
+
+cleanup_mnt:
+ mnt_put_write_access(f->f_path.mnt);
+cleanup_inode:
+ put_write_access(f->f_inode);
+ return error;
+}
+
static int do_dentry_open(struct file *f,
struct inode *inode,
int (*open)(struct inode *, struct file *))
@@ -892,14 +916,9 @@ static int do_dentry_open(struct file *f,
if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
i_readcount_inc(inode);
} else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
- error = get_write_access(inode);
+ error = file_get_write_access(f);
if (unlikely(error))
goto cleanup_file;
- error = __mnt_want_write(f->f_path.mnt);
- if (unlikely(error)) {
- put_write_access(inode);
- goto cleanup_file;
- }
f->f_mode |= FMODE_WRITER;
}
@@ -1163,20 +1182,19 @@ EXPORT_SYMBOL_GPL(kernel_file_open);
/**
* backing_file_open - open a backing file for kernel internal use
- * @path: path of the file to open
+ * @user_path: path that the user reuqested to open
* @flags: open flags
* @real_path: path of the backing file
* @cred: credentials for open
*
* Open a backing file for a stackable filesystem (e.g., overlayfs).
- * @path may be on the stackable filesystem and backing inode on the
- * underlying filesystem. In this case, we want to be able to return
- * the @real_path of the backing inode. This is done by embedding the
- * returned file into a container structure that also stores the path of
- * the backing inode on the underlying filesystem, which can be
- * retrieved using backing_file_real_path().
+ * @user_path may be on the stackable filesystem and @real_path on the
+ * underlying filesystem. In this case, we want to be able to return the
+ * @user_path of the stackable filesystem. This is done by embedding the
+ * returned file into a container structure that also stores the stacked
+ * file's path, which can be retrieved using backing_file_user_path().
*/
-struct file *backing_file_open(const struct path *path, int flags,
+struct file *backing_file_open(const struct path *user_path, int flags,
const struct path *real_path,
const struct cred *cred)
{
@@ -1187,9 +1205,9 @@ struct file *backing_file_open(const struct path *path, int flags,
if (IS_ERR(f))
return f;
- f->f_path = *path;
- path_get(real_path);
- *backing_file_real_path(f) = *real_path;
+ path_get(user_path);
+ *backing_file_user_path(f) = *user_path;
+ f->f_path = *real_path;
error = do_dentry_open(f, d_inode(real_path->dentry), NULL);
if (error) {
fput(f);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 3fa2416264a4..c6c39ea267c5 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -34,14 +34,22 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
struct dentry *real = NULL, *lower;
int err;
- /* It's an overlay file */
+ /*
+ * vfs is only expected to call d_real() with NULL from d_real_inode()
+ * and with overlay inode from file_dentry() on an overlay file.
+ *
+ * TODO: remove @inode argument from d_real() API, remove code in this
+ * function that deals with non-NULL @inode and remove d_real() call
+ * from file_dentry().
+ */
if (inode && d_inode(dentry) == inode)
return dentry;
+ else if (inode)
+ goto bug;
if (!d_is_reg(dentry)) {
- if (!inode || inode == d_inode(dentry))
- return dentry;
- goto bug;
+ /* d_real_inode() is only relevant for regular files */
+ return dentry;
}
real = ovl_dentry_upper(dentry);
@@ -1488,8 +1496,16 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_xattr = ofs->config.userxattr ? ovl_user_xattr_handlers :
ovl_trusted_xattr_handlers;
sb->s_fs_info = ofs;
+#ifdef CONFIG_FS_POSIX_ACL
sb->s_flags |= SB_POSIXACL;
+#endif
sb->s_iflags |= SB_I_SKIP_SYNC | SB_I_IMA_UNVERIFIABLE_SIGNATURE;
+ /*
+ * Ensure that umask handling is done by the filesystems used
+ * for the the upper layer instead of overlayfs as that would
+ * lead to unexpected results.
+ */
+ sb->s_iflags |= SB_I_NOUMASK;
err = -ENOMEM;
root_dentry = ovl_get_root(sb, ctx->upper.dentry, oe);
diff --git a/fs/pipe.c b/fs/pipe.c
index 139190165a1c..485e3be8903c 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -227,6 +227,36 @@ static inline bool pipe_readable(const struct pipe_inode_info *pipe)
return !pipe_empty(head, tail) || !writers;
}
+static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf,
+ unsigned int tail)
+{
+ pipe_buf_release(pipe, buf);
+
+ /*
+ * If the pipe has a watch_queue, we need additional protection
+ * by the spinlock because notifications get posted with only
+ * this spinlock, no mutex
+ */
+ if (pipe_has_watch_queue(pipe)) {
+ spin_lock_irq(&pipe->rd_wait.lock);
+#ifdef CONFIG_WATCH_QUEUE
+ if (buf->flags & PIPE_BUF_FLAG_LOSS)
+ pipe->note_loss = true;
+#endif
+ pipe->tail = ++tail;
+ spin_unlock_irq(&pipe->rd_wait.lock);
+ return tail;
+ }
+
+ /*
+ * Without a watch_queue, we can simply increment the tail
+ * without the spinlock - the mutex is enough.
+ */
+ pipe->tail = ++tail;
+ return tail;
+}
+
static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
@@ -320,17 +350,8 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
buf->len = 0;
}
- if (!buf->len) {
- pipe_buf_release(pipe, buf);
- spin_lock_irq(&pipe->rd_wait.lock);
-#ifdef CONFIG_WATCH_QUEUE
- if (buf->flags & PIPE_BUF_FLAG_LOSS)
- pipe->note_loss = true;
-#endif
- tail++;
- pipe->tail = tail;
- spin_unlock_irq(&pipe->rd_wait.lock);
- }
+ if (!buf->len)
+ tail = pipe_update_tail(pipe, buf, tail);
total_len -= chars;
if (!total_len)
break; /* common path: read succeeded */
@@ -437,12 +458,10 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
-#ifdef CONFIG_WATCH_QUEUE
- if (pipe->watch_queue) {
+ if (pipe_has_watch_queue(pipe)) {
ret = -EXDEV;
goto out;
}
-#endif
/*
* If it wasn't empty we try to merge new data into
@@ -507,16 +526,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
* it, either the reader will consume it or it'll still
* be there for the next write.
*/
- spin_lock_irq(&pipe->rd_wait.lock);
-
- head = pipe->head;
- if (pipe_full(head, pipe->tail, pipe->max_usage)) {
- spin_unlock_irq(&pipe->rd_wait.lock);
- continue;
- }
-
pipe->head = head + 1;
- spin_unlock_irq(&pipe->rd_wait.lock);
/* Insert it into the buffer array */
buf = &pipe->bufs[head & mask];
@@ -1324,10 +1334,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg)
unsigned int nr_slots, size;
long ret = 0;
-#ifdef CONFIG_WATCH_QUEUE
- if (pipe->watch_queue)
+ if (pipe_has_watch_queue(pipe))
return -EBUSY;
-#endif
size = round_pipe_size(arg);
nr_slots = size >> PAGE_SHIFT;
@@ -1379,10 +1387,8 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
if (file->f_op != &pipefifo_fops || !pipe)
return NULL;
-#ifdef CONFIG_WATCH_QUEUE
- if (for_splice && pipe->watch_queue)
+ if (for_splice && pipe_has_watch_queue(pipe))
return NULL;
-#endif
return pipe;
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ffd54617c354..20695c928ee6 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2218,7 +2218,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path)
rc = -ENOENT;
vma = find_exact_vma(mm, vm_start, vm_end);
if (vma && vma->vm_file) {
- *path = vma->vm_file->f_path;
+ *path = *file_user_path(vma->vm_file);
path_get(path);
rc = 0;
}
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 6276b3938842..6e72e5ad42bc 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -113,10 +113,12 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode)
struct file *file;
rcu_read_lock();
- file = task_lookup_fd_rcu(task, fd);
- if (file)
- *mode = file->f_mode;
+ file = task_lookup_fdget_rcu(task, fd);
rcu_read_unlock();
+ if (file) {
+ *mode = file->f_mode;
+ fput(file);
+ }
return !!file;
}
@@ -259,12 +261,13 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
char name[10 + 1];
unsigned int len;
- f = task_lookup_next_fd_rcu(p, &fd);
+ f = task_lookup_next_fdget_rcu(p, &fd);
ctx->pos = fd + 2LL;
if (!f)
break;
data.mode = f->f_mode;
rcu_read_unlock();
+ fput(f);
data.fd = fd;
len = snprintf(name, sizeof(name), "%u", fd);
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 4d3493579458..c6e7ebc63756 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -58,7 +58,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
if (file) {
seq_pad(m, ' ');
- seq_file_path(m, file, "");
+ seq_path(m, file_user_path(file), "");
}
seq_putc(m, '\n');
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3dd5be96691b..1593940ca01e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -296,7 +296,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
if (anon_name)
seq_printf(m, "[anon_shmem:%s]", anon_name->name);
else
- seq_file_path(m, file, "\n");
+ seq_path(m, file_user_path(file), "\n");
goto done;
}
@@ -1967,7 +1967,7 @@ static int show_numa_map(struct seq_file *m, void *v)
if (file) {
seq_puts(m, " file=");
- seq_file_path(m, file, "\n\t= ");
+ seq_path(m, file_user_path(file), "\n\t= ");
} else if (vma_is_initial_heap(vma)) {
seq_puts(m, " heap");
} else if (vma_is_initial_stack(vma)) {
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 7cebd397cc26..bce674533000 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -157,7 +157,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
if (file) {
seq_pad(m, ' ');
- seq_file_path(m, file, "");
+ seq_path(m, file_user_path(file), "");
} else if (mm && vma_is_initial_stack(vma)) {
seq_pad(m, ' ');
seq_puts(m, "[stack]");