summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/files.rst53
-rw-r--r--arch/powerpc/platforms/cell/spufs/coredump.c11
-rw-r--r--drivers/gpu/drm/i915/gem/i915_gem_mman.c4
-rw-r--r--fs/file.c125
-rw-r--r--fs/file_table.c40
-rw-r--r--fs/gfs2/glock.c11
-rw-r--r--fs/notify/dnotify/dnotify.c6
-rw-r--r--fs/proc/fd.c11
-rw-r--r--include/linux/fdtable.h17
-rw-r--r--include/linux/fs.h4
-rw-r--r--kernel/bpf/task_iter.c4
-rw-r--r--kernel/fork.c4
-rw-r--r--kernel/kcmp.c4
13 files changed, 191 insertions, 103 deletions
diff --git a/Documentation/filesystems/files.rst b/Documentation/filesystems/files.rst
index bcf84459917f..9e38e4c221ca 100644
--- a/Documentation/filesystems/files.rst
+++ b/Documentation/filesystems/files.rst
@@ -62,7 +62,7 @@ the fdtable structure -
be held.
4. To look up the file structure given an fd, a reader
- must use either lookup_fd_rcu() or files_lookup_fd_rcu() APIs. These
+ must use either lookup_fdget_rcu() or files_lookup_fdget_rcu() APIs. These
take care of barrier requirements due to lock-free lookup.
An example::
@@ -70,43 +70,22 @@ the fdtable structure -
struct file *file;
rcu_read_lock();
- file = lookup_fd_rcu(fd);
- if (file) {
- ...
- }
- ....
+ file = lookup_fdget_rcu(fd);
rcu_read_unlock();
-
-5. Handling of the file structures is special. Since the look-up
- of the fd (fget()/fget_light()) are lock-free, it is possible
- that look-up may race with the last put() operation on the
- file structure. This is avoided using atomic_long_inc_not_zero()
- on ->f_count::
-
- rcu_read_lock();
- file = files_lookup_fd_rcu(files, fd);
if (file) {
- if (atomic_long_inc_not_zero(&file->f_count))
- *fput_needed = 1;
- else
- /* Didn't get the reference, someone's freed */
- file = NULL;
+ ...
+ fput(file);
}
- rcu_read_unlock();
....
- return file;
-
- atomic_long_inc_not_zero() detects if refcounts is already zero or
- goes to zero during increment. If it does, we fail
- fget()/fget_light().
-6. Since both fdtable and file structures can be looked up
+5. Since both fdtable and file structures can be looked up
lock-free, they must be installed using rcu_assign_pointer()
API. If they are looked up lock-free, rcu_dereference()
must be used. However it is advisable to use files_fdtable()
- and lookup_fd_rcu()/files_lookup_fd_rcu() which take care of these issues.
+ and lookup_fdget_rcu()/files_lookup_fdget_rcu() which take care of these
+ issues.
-7. While updating, the fdtable pointer must be looked up while
+6. While updating, the fdtable pointer must be looked up while
holding files->file_lock. If ->file_lock is dropped, then
another thread expand the files thereby creating a new
fdtable and making the earlier fdtable pointer stale.
@@ -126,3 +105,19 @@ the fdtable structure -
Since locate_fd() can drop ->file_lock (and reacquire ->file_lock),
the fdtable pointer (fdt) must be loaded after locate_fd().
+On newer kernels rcu based file lookup has been switched to rely on
+SLAB_TYPESAFE_BY_RCU instead of call_rcu(). It isn't sufficient anymore
+to just acquire a reference to the file in question under rcu using
+atomic_long_inc_not_zero() since the file might have already been
+recycled and someone else might have bumped the reference. In other
+words, callers might see reference count bumps from newer users. For
+this is reason it is necessary to verify that the pointer is the same
+before and after the reference count increment. This pattern can be seen
+in get_file_rcu() and __files_get_rcu().
+
+In addition, it isn't possible to access or check fields in struct file
+without first aqcuiring a reference on it under rcu lookup. Not doing
+that was always very dodgy and it was only usable for non-pointer data
+in struct file. With SLAB_TYPESAFE_BY_RCU it is necessary that callers
+either first acquire a reference or they must hold the files_lock of the
+fdtable.
diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c
index 1a587618015c..18daafbe2e65 100644
--- a/arch/powerpc/platforms/cell/spufs/coredump.c
+++ b/arch/powerpc/platforms/cell/spufs/coredump.c
@@ -66,7 +66,7 @@ static int match_context(const void *v, struct file *file, unsigned fd)
*/
static struct spu_context *coredump_next_context(int *fd)
{
- struct spu_context *ctx;
+ struct spu_context *ctx = NULL;
struct file *file;
int n = iterate_fd(current->files, *fd, match_context, NULL);
if (!n)
@@ -74,10 +74,13 @@ static struct spu_context *coredump_next_context(int *fd)
*fd = n - 1;
rcu_read_lock();
- file = lookup_fd_rcu(*fd);
- ctx = SPUFS_I(file_inode(file))->i_ctx;
- get_spu_context(ctx);
+ file = lookup_fdget_rcu(*fd);
rcu_read_unlock();
+ if (file) {
+ ctx = SPUFS_I(file_inode(file))->i_ctx;
+ get_spu_context(ctx);
+ fput(file);
+ }
return ctx;
}
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
index aa4d842d4c5a..97bf10861cad 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
@@ -916,9 +916,7 @@ static struct file *mmap_singleton(struct drm_i915_private *i915)
struct file *file;
rcu_read_lock();
- file = READ_ONCE(i915->gem.mmap_singleton);
- if (file && !get_file_rcu(file))
- file = NULL;
+ file = get_file_rcu(&i915->gem.mmap_singleton);
rcu_read_unlock();
if (file)
return file;
diff --git a/fs/file.c b/fs/file.c
index 3e4a4dfa38fc..4eb026631673 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -853,8 +853,79 @@ void do_close_on_exec(struct files_struct *files)
spin_unlock(&files->file_lock);
}
+static struct file *__get_file_rcu(struct file __rcu **f)
+{
+ struct file __rcu *file;
+ struct file __rcu *file_reloaded;
+ struct file __rcu *file_reloaded_cmp;
+
+ file = rcu_dereference_raw(*f);
+ if (!file)
+ return NULL;
+
+ if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
+ return ERR_PTR(-EAGAIN);
+
+ file_reloaded = rcu_dereference_raw(*f);
+
+ /*
+ * Ensure that all accesses have a dependency on the load from
+ * rcu_dereference_raw() above so we get correct ordering
+ * between reuse/allocation and the pointer check below.
+ */
+ file_reloaded_cmp = file_reloaded;
+ OPTIMIZER_HIDE_VAR(file_reloaded_cmp);
+
+ /*
+ * atomic_long_inc_not_zero() above provided a full memory
+ * barrier when we acquired a reference.
+ *
+ * This is paired with the write barrier from assigning to the
+ * __rcu protected file pointer so that if that pointer still
+ * matches the current file, we know we have successfully
+ * acquired a reference to the right file.
+ *
+ * If the pointers don't match the file has been reallocated by
+ * SLAB_TYPESAFE_BY_RCU.
+ */
+ if (file == file_reloaded_cmp)
+ return file_reloaded;
+
+ fput(file);
+ return ERR_PTR(-EAGAIN);
+}
+
+/**
+ * get_file_rcu - try go get a reference to a file under rcu
+ * @f: the file to get a reference on
+ *
+ * This function tries to get a reference on @f carefully verifying that
+ * @f hasn't been reused.
+ *
+ * This function should rarely have to be used and only by users who
+ * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
+ *
+ * Return: Returns @f with the reference count increased or NULL.
+ */
+struct file *get_file_rcu(struct file __rcu **f)
+{
+ for (;;) {
+ struct file __rcu *file;
+
+ file = __get_file_rcu(f);
+ if (unlikely(!file))
+ return NULL;
+
+ if (unlikely(IS_ERR(file)))
+ continue;
+
+ return file;
+ }
+}
+EXPORT_SYMBOL_GPL(get_file_rcu);
+
static inline struct file *__fget_files_rcu(struct files_struct *files,
- unsigned int fd, fmode_t mask)
+ unsigned int fd, fmode_t mask)
{
for (;;) {
struct file *file;
@@ -865,12 +936,6 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
return NULL;
fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds);
- file = rcu_dereference_raw(*fdentry);
- if (unlikely(!file))
- return NULL;
-
- if (unlikely(file->f_mode & mask))
- return NULL;
/*
* Ok, we have a file pointer. However, because we do
@@ -879,10 +944,15 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
*
* Such a race can take two forms:
*
- * (a) the file ref already went down to zero,
- * and get_file_rcu() fails. Just try again:
+ * (a) the file ref already went down to zero and the
+ * file hasn't been reused yet or the file count
+ * isn't zero but the file has already been reused.
*/
- if (unlikely(!get_file_rcu(file)))
+ file = __get_file_rcu(fdentry);
+ if (unlikely(!file))
+ return NULL;
+
+ if (unlikely(IS_ERR(file)))
continue;
/*
@@ -893,13 +963,21 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
*
* If so, we need to put our ref and try again.
*/
- if (unlikely(rcu_dereference_raw(files->fdt) != fdt) ||
- unlikely(rcu_dereference_raw(*fdentry) != file)) {
+ if (unlikely(rcu_dereference_raw(files->fdt) != fdt)) {
fput(file);
continue;
}
/*
+ * This isn't the file we're looking for or we're not
+ * allowed to get a reference to it.
+ */
+ if (unlikely(file->f_mode & mask)) {
+ fput(file);
+ return NULL;
+ }
+
+ /*
* Ok, we have a ref to the file, and checked that it
* still exists.
*/
@@ -948,7 +1026,14 @@ struct file *fget_task(struct task_struct *task, unsigned int fd)
return file;
}
-struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
+struct file *lookup_fdget_rcu(unsigned int fd)
+{
+ return __fget_files_rcu(current->files, fd, 0);
+
+}
+EXPORT_SYMBOL_GPL(lookup_fdget_rcu);
+
+struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd)
{
/* Must be called with rcu_read_lock held */
struct files_struct *files;
@@ -957,13 +1042,13 @@ struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
task_lock(task);
files = task->files;
if (files)
- file = files_lookup_fd_rcu(files, fd);
+ file = __fget_files_rcu(files, fd, 0);
task_unlock(task);
return file;
}
-struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd)
+struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd)
{
/* Must be called with rcu_read_lock held */
struct files_struct *files;
@@ -974,7 +1059,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret
files = task->files;
if (files) {
for (; fd < files_fdtable(files)->max_fds; fd++) {
- file = files_lookup_fd_rcu(files, fd);
+ file = __fget_files_rcu(files, fd, 0);
if (file)
break;
}
@@ -983,7 +1068,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret
*ret_fd = fd;
return file;
}
-EXPORT_SYMBOL(task_lookup_next_fd_rcu);
+EXPORT_SYMBOL(task_lookup_next_fdget_rcu);
/*
* Lightweight file lookup - no refcnt increment if fd table isn't shared.
@@ -1272,12 +1357,16 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
{
if (unlikely(newfd == oldfd)) { /* corner case */
struct files_struct *files = current->files;
+ struct file *f;
int retval = oldfd;
rcu_read_lock();
- if (!files_lookup_fd_rcu(files, oldfd))
+ f = __fget_files_rcu(files, oldfd, 0);
+ if (!f)
retval = -EBADF;
rcu_read_unlock();
+ if (f)
+ fput(f);
return retval;
}
return ksys_dup3(oldfd, newfd, 0);
diff --git a/fs/file_table.c b/fs/file_table.c
index e68e97d4f00a..a79a80031343 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -65,33 +65,33 @@ static void file_free_rcu(struct rcu_head *head)
{
struct file *f = container_of(head, struct file, f_rcuhead);
- put_cred(f->f_cred);
- if (unlikely(f->f_mode & FMODE_BACKING))
- kfree(backing_file(f));
- else
- kmem_cache_free(filp_cachep, f);
+ kfree(backing_file(f));
}
static inline void file_free(struct file *f)
{
security_file_free(f);
- if (unlikely(f->f_mode & FMODE_BACKING))
- path_put(backing_file_real_path(f));
if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
percpu_counter_dec(&nr_files);
- call_rcu(&f->f_rcuhead, file_free_rcu);
+ put_cred(f->f_cred);
+ if (unlikely(f->f_mode & FMODE_BACKING)) {
+ path_put(backing_file_real_path(f));
+ call_rcu(&f->f_rcuhead, file_free_rcu);
+ } else {
+ kmem_cache_free(filp_cachep, f);
+ }
}
void release_empty_file(struct file *f)
{
WARN_ON_ONCE(f->f_mode & (FMODE_BACKING | FMODE_OPENED));
- /* Uhm, we better find out who grabs references to an unopened file. */
- WARN_ON_ONCE(atomic_long_cmpxchg(&f->f_count, 1, 0) != 1);
- security_file_free(f);
- put_cred(f->f_cred);
- if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
- percpu_counter_dec(&nr_files);
- kmem_cache_free(filp_cachep, f);
+ if (atomic_long_dec_and_test(&f->f_count)) {
+ security_file_free(f);
+ put_cred(f->f_cred);
+ if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
+ percpu_counter_dec(&nr_files);
+ kmem_cache_free(filp_cachep, f);
+ }
}
/*
@@ -176,7 +176,6 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
return error;
}
- atomic_long_set(&f->f_count, 1);
rwlock_init(&f->f_owner.lock);
spin_lock_init(&f->f_lock);
mutex_init(&f->f_pos_lock);
@@ -184,6 +183,12 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
f->f_mode = OPEN_FMODE(flags);
/* f->f_version: 0 */
+ /*
+ * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
+ * fget-rcu pattern users need to be able to handle spurious
+ * refcount bumps we should reinitialize the reused file first.
+ */
+ atomic_long_set(&f->f_count, 1);
return 0;
}
@@ -483,7 +488,8 @@ EXPORT_SYMBOL(__fput_sync);
void __init files_init(void)
{
filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
- SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL);
+ SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN |
+ SLAB_PANIC | SLAB_ACCOUNT, NULL);
percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9cbf8d98489a..b4bc873aab7d 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -2717,16 +2717,19 @@ static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i)
for(;; i->fd++) {
struct inode *inode;
- i->file = task_lookup_next_fd_rcu(i->task, &i->fd);
+ i->file = task_lookup_next_fdget_rcu(i->task, &i->fd);
if (!i->file) {
i->fd = 0;
break;
}
+
inode = file_inode(i->file);
- if (inode->i_sb != i->sb)
- continue;
- if (get_file_rcu(i->file))
+ if (inode->i_sb == i->sb)
break;
+
+ rcu_read_unlock();
+ fput(i->file);
+ rcu_read_lock();
}
rcu_read_unlock();
return i->file;
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index ebdcc25df0f7..869b016014d2 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -265,7 +265,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
struct dnotify_struct *dn;
struct inode *inode;
fl_owner_t id = current->files;
- struct file *f;
+ struct file *f = NULL;
int destroy = 0, error = 0;
__u32 mask;
@@ -345,7 +345,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
}
rcu_read_lock();
- f = lookup_fd_rcu(fd);
+ f = lookup_fdget_rcu(fd);
rcu_read_unlock();
/* if (f != filp) means that we lost a race and another task/thread
@@ -392,6 +392,8 @@ out_err:
fsnotify_put_mark(new_fsn_mark);
if (dn)
kmem_cache_free(dnotify_struct_cache, dn);
+ if (f)
+ fput(f);
return error;
}
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 6276b3938842..6e72e5ad42bc 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -113,10 +113,12 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode)
struct file *file;
rcu_read_lock();
- file = task_lookup_fd_rcu(task, fd);
- if (file)
- *mode = file->f_mode;
+ file = task_lookup_fdget_rcu(task, fd);
rcu_read_unlock();
+ if (file) {
+ *mode = file->f_mode;
+ fput(file);
+ }
return !!file;
}
@@ -259,12 +261,13 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
char name[10 + 1];
unsigned int len;
- f = task_lookup_next_fd_rcu(p, &fd);
+ f = task_lookup_next_fdget_rcu(p, &fd);
ctx->pos = fd + 2LL;
if (!f)
break;
data.mode = f->f_mode;
rcu_read_unlock();
+ fput(f);
data.fd = fd;
len = snprintf(name, sizeof(name), "%u", fd);
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index e066816f3519..bc4c3287a65e 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -98,20 +98,9 @@ static inline struct file *files_lookup_fd_locked(struct files_struct *files, un
return files_lookup_fd_raw(files, fd);
}
-static inline struct file *files_lookup_fd_rcu(struct files_struct *files, unsigned int fd)
-{
- RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
- "suspicious rcu_dereference_check() usage");
- return files_lookup_fd_raw(files, fd);
-}
-
-static inline struct file *lookup_fd_rcu(unsigned int fd)
-{
- return files_lookup_fd_rcu(current->files, fd);
-}
-
-struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd);
-struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *fd);
+struct file *lookup_fdget_rcu(unsigned int fd);
+struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd);
+struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *fd);
struct task_struct;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 58dea591a341..ceafc40cc25f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1042,7 +1042,9 @@ static inline struct file *get_file(struct file *f)
atomic_long_inc(&f->f_count);
return f;
}
-#define get_file_rcu(x) atomic_long_inc_not_zero(&(x)->f_count)
+
+struct file *get_file_rcu(struct file __rcu **f);
+
#define file_count(x) atomic_long_read(&(x)->f_count)
#define MAX_NON_LFS ((1UL<<31) - 1)
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index c4ab9d6cdbe9..82ad23b1d257 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -308,11 +308,9 @@ again:
rcu_read_lock();
for (;; curr_fd++) {
struct file *f;
- f = task_lookup_next_fd_rcu(curr_task, &curr_fd);
+ f = task_lookup_next_fdget_rcu(curr_task, &curr_fd);
if (!f)
break;
- if (!get_file_rcu(f))
- continue;
/* set info->fd */
info->fd = curr_fd;
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b6d20dfb9a8..640123767726 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1492,9 +1492,7 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
struct file *exe_file;
rcu_read_lock();
- exe_file = rcu_dereference(mm->exe_file);
- if (exe_file && !get_file_rcu(exe_file))
- exe_file = NULL;
+ exe_file = get_file_rcu(&mm->exe_file);
rcu_read_unlock();
return exe_file;
}
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 5353edfad8e1..b0639f21041f 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -64,8 +64,10 @@ get_file_raw_ptr(struct task_struct *task, unsigned int idx)
struct file *file;
rcu_read_lock();
- file = task_lookup_fd_rcu(task, idx);
+ file = task_lookup_fdget_rcu(task, idx);
rcu_read_unlock();
+ if (file)
+ fput(file);
return file;
}