author    Linus Torvalds <torvalds@linux-foundation.org>  2024-11-18 10:30:29 -0800
committer Linus Torvalds <torvalds@linux-foundation.org>  2024-11-18 10:30:29 -0800
commit    4c797b11a88297b9b0010b2c6645b191bac2350c (patch)
tree      3eca58d86d27164682ee732cfb49e1404b315584 /fs/file_table.c
parent    8dcf44fcad5ef5c1ff915628255c19cbe91f2588 (diff)
parent    aab154a442f9ba2a08fc130dbc8d178a33e10345 (diff)
Merge tag 'vfs-6.13.file' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs file updates from Christian Brauner:
 "This contains the changes for files for this cycle:

   - Introduce a new reference counting mechanism for files.

     As atomic_inc_not_zero() is implemented with a try_cmpxchg() loop,
     it has O(N^2) behaviour under contention with N concurrent
     operations, and it sits in a hot path in __fget_files_rcu().

     The rcuref infrastructure remedies this problem by using an
     unconditional increment, relying on safe and dead zones to make
     this work, and requiring RCU protection for the data structure in
     question. This not only scales better, it also introduces overflow
     protection.

     However, in contrast to generic rcuref, files require a memory
     barrier and thus cannot rely on *_relaxed() atomic operations, and
     they also need to be built on atomic_long_t, as massive reference
     counts aren't unheard of, even if only as an attack. This adds a
     file-specific variant instead of making it a generic library.

     This has been tested by various people and it gives consistent
     improvements of up to 3-5% on workloads with loads of threads.

   - Add a fastpath to find_next_fd(): skip the two-level search via
     find_next_zero_bit() when there is a free slot in the word that
     contains the next fd. This improves pts/blogbench-1.1.0 read by 8%
     and write by 4% on Intel ICX 160.

   - Conditionally clear full_fds_bits, since it is very likely that a
     bit in full_fds_bits has already been cleared during
     __clear_open_fds(). This improves pts/blogbench-1.1.0 read by up
     to 13% and write by up to 5% on Intel ICX 160.

   - Get rid of all lookup_*_fdget_rcu() variants. They were used to
     look up files without taking a reference count. That became
     invalid once files were switched to SLAB_TYPESAFE_BY_RCU, and now
     a reference count is always taken. Switch to an already existing
     helper and remove the legacy variants.

   - Remove pointless includes of <linux/fdtable.h>.

   - Avoid cmpxchg() in close_files(), as nobody else has a reference
     to the files_struct at that point.

   - Move close_range() into fs/file.c and fold __close_range() into it.

   - Clean up the calling conventions of alloc_fdtable() and
     expand_files().

   - Merge __{set,clear}_close_on_exec() into one helper.

   - Make __set_open_fd() set the cloexec state as well, instead of
     doing it in two separate steps"

* tag 'vfs-6.13.file' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  selftests: add file SLAB_TYPESAFE_BY_RCU recycling stressor
  fs: port files to file_ref
  fs: add file_ref
  expand_files(): simplify calling conventions
  make __set_open_fd() set cloexec state as well
  fs: protect backing files with rcu
  file.c: merge __{set,clear}_close_on_exec()
  alloc_fdtable(): change calling conventions.
  fs/file.c: add fast path in find_next_fd()
  fs/file.c: conditionally clear full_fds
  fs/file.c: remove sanity_check and add likely/unlikely in alloc_fd()
  move close_range(2) into fs/file.c, fold __close_range() into it
  close_files(): don't bother with xchg()
  remove pointless includes of <linux/fdtable.h>
  get rid of ...lookup...fdget_rcu() family
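The file_ref mechanism described in the first item above can be illustrated with a small, self-contained sketch. This is only a model of the idea (an unconditional increment combined with a large dead zone) using made-up names such as sketch_ref and sketch_ref_get; the real implementation lives in include/linux/file_ref.h and additionally handles saturation, the required memory barriers, and the race between a final put and concurrent gets.

/*
 * Hedged sketch of the unconditional-increment scheme, C11 userspace.
 * Not the kernel's file_ref; names and constants are illustrative only.
 */
#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>

/* Far enough below zero that racing increments can never make it live again. */
#define SKETCH_REF_DEAD   (LONG_MIN / 2)

struct sketch_ref {
        atomic_long cnt;        /* cnt == references - 1; negative means dead */
};

static void sketch_ref_init(struct sketch_ref *r, long refs)
{
        atomic_init(&r->cnt, refs - 1);
}

/*
 * One unconditional atomic add instead of an inc_not_zero() cmpxchg loop,
 * so N contending acquirers do N increments rather than O(N^2) retries.
 * A dead object is detected after the fact from the old counter value.
 */
static bool sketch_ref_get(struct sketch_ref *r)
{
        return atomic_fetch_add(&r->cnt, 1) >= 0;
}

/* Returns true when the caller dropped the last reference and must free. */
static bool sketch_ref_put(struct sketch_ref *r)
{
        if (atomic_fetch_sub(&r->cnt, 1) == 0) {
                /*
                 * Last reference: park the counter deep in the dead zone.
                 * The real file_ref_put() also resolves the race with
                 * concurrent gets at this point; omitted for brevity.
                 */
                atomic_store(&r->cnt, SKETCH_REF_DEAD);
                return true;
        }
        return false;
}

The kernel variant is built on atomic_long_t and avoids *_relaxed() atomics for the reasons given above; file_ref_init() and file_ref_put() are visible in the file_table.c hunks below.

The find_next_fd() fastpath amounts to scanning only the word of open_fds that contains the next candidate fd before falling back to the two-level search (full_fds_bits, then open_fds). A rough sketch, assuming the usual fdtable layout and that max_fds is a multiple of BITS_PER_LONG (the actual code is in fs/file.c, not in the diff shown on this page):

/* Sketch of the fastpath; not necessarily the exact upstream code. */
static unsigned int find_next_fd_sketch(struct fdtable *fdt, unsigned int start)
{
        unsigned int maxfd = fdt->max_fds;
        unsigned int maxbit = maxfd / BITS_PER_LONG;
        unsigned int bitbit = start / BITS_PER_LONG;
        unsigned int bit;

        /*
         * Fast path: look only at the word holding 'start'.  If it has a
         * free slot there is no need to consult full_fds_bits at all.
         */
        bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG,
                                 start & (BITS_PER_LONG - 1));
        if (bit < BITS_PER_LONG)
                return bit + bitbit * BITS_PER_LONG;

        /* Slow path: the pre-existing two-level search. */
        bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
        if (bitbit >= maxfd)
                return maxfd;
        if (bitbit > start)
                start = bitbit;
        return find_next_zero_bit(fdt->open_fds, maxfd, start);
}

The conditional full_fds_bits clear is in the same spirit: test the bit before writing it, so that releasing an fd in a word that was never full does not dirty the full_fds_bits cacheline. A sketch, again assuming the helper keeps its familiar shape in fs/file.c:

static inline void __clear_open_fd_sketch(unsigned int fd, struct fdtable *fdt)
{
        __clear_bit(fd, fdt->open_fds);
        fd /= BITS_PER_LONG;
        /* Only touch full_fds_bits if the word was actually marked full. */
        if (test_bit(fd, fdt->full_fds_bits))
                __clear_bit(fd, fdt->full_fds_bits);
}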
Diffstat (limited to 'fs/file_table.c')
-rw-r--r--  fs/file_table.c  |  50
1 file changed, 37 insertions, 13 deletions
diff --git a/fs/file_table.c b/fs/file_table.c
index eed5ffad9997..976736be47cb 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -9,7 +9,6 @@
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/file.h>
-#include <linux/fdtable.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
@@ -40,13 +39,17 @@ static struct files_stat_struct files_stat = {
/* SLAB cache for file structures */
static struct kmem_cache *filp_cachep __ro_after_init;
+static struct kmem_cache *bfilp_cachep __ro_after_init;
static struct percpu_counter nr_files __cacheline_aligned_in_smp;
/* Container for backing file with optional user path */
struct backing_file {
struct file file;
- struct path user_path;
+ union {
+ struct path user_path;
+ freeptr_t bf_freeptr;
+ };
};
static inline struct backing_file *backing_file(struct file *f)
@@ -68,7 +71,7 @@ static inline void file_free(struct file *f)
put_cred(f->f_cred);
if (unlikely(f->f_mode & FMODE_BACKING)) {
path_put(backing_file_user_path(f));
- kfree(backing_file(f));
+ kmem_cache_free(bfilp_cachep, backing_file(f));
} else {
kmem_cache_free(filp_cachep, f);
}
@@ -165,16 +168,32 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
* the respective member when opening the file.
*/
mutex_init(&f->f_pos_lock);
- f->f_flags = flags;
- f->f_mode = OPEN_FMODE(flags);
- /* f->f_version: 0 */
+ memset(&f->f_path, 0, sizeof(f->f_path));
+ memset(&f->f_ra, 0, sizeof(f->f_ra));
+
+ f->f_flags = flags;
+ f->f_mode = OPEN_FMODE(flags);
+
+ f->f_op = NULL;
+ f->f_mapping = NULL;
+ f->private_data = NULL;
+ f->f_inode = NULL;
+ f->f_owner = NULL;
+#ifdef CONFIG_EPOLL
+ f->f_ep = NULL;
+#endif
+
+ f->f_iocb_flags = 0;
+ f->f_pos = 0;
+ f->f_wb_err = 0;
+ f->f_sb_err = 0;
/*
* We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
* fget-rcu pattern users need to be able to handle spurious
* refcount bumps we should reinitialize the reused file first.
*/
- atomic_long_set(&f->f_count, 1);
+ file_ref_init(&f->f_ref, 1);
return 0;
}
@@ -206,7 +225,7 @@ struct file *alloc_empty_file(int flags, const struct cred *cred)
goto over;
}
- f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
+ f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
if (unlikely(!f))
return ERR_PTR(-ENOMEM);
@@ -240,7 +259,7 @@ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
struct file *f;
int error;
- f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
+ f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
if (unlikely(!f))
return ERR_PTR(-ENOMEM);
@@ -267,13 +286,13 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
struct backing_file *ff;
int error;
- ff = kzalloc(sizeof(struct backing_file), GFP_KERNEL);
+ ff = kmem_cache_alloc(bfilp_cachep, GFP_KERNEL);
if (unlikely(!ff))
return ERR_PTR(-ENOMEM);
error = init_file(&ff->file, flags, cred);
if (unlikely(error)) {
- kfree(ff);
+ kmem_cache_free(bfilp_cachep, ff);
return ERR_PTR(error);
}
@@ -479,7 +498,7 @@ static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
void fput(struct file *file)
{
- if (atomic_long_dec_and_test(&file->f_count)) {
+ if (file_ref_put(&file->f_ref)) {
struct task_struct *task = current;
if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
@@ -512,7 +531,7 @@ void fput(struct file *file)
*/
void __fput_sync(struct file *file)
{
- if (atomic_long_dec_and_test(&file->f_count))
+ if (file_ref_put(&file->f_ref))
__fput(file);
}
@@ -529,6 +548,11 @@ void __init files_init(void)
filp_cachep = kmem_cache_create("filp", sizeof(struct file), &args,
SLAB_HWCACHE_ALIGN | SLAB_PANIC |
SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
+
+ args.freeptr_offset = offsetof(struct backing_file, bf_freeptr);
+ bfilp_cachep = kmem_cache_create("bfilp", sizeof(struct backing_file),
+ &args, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
+ SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}