diff options
Diffstat (limited to 'fs/file_table.c')
| -rw-r--r-- | fs/file_table.c | 369 |
1 files changed, 279 insertions, 90 deletions
diff --git a/fs/file_table.c b/fs/file_table.c index dd88701e54a9..cd4a3db4659a 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -9,10 +9,10 @@ #include <linux/string.h> #include <linux/slab.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/init.h> #include <linux/module.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/eventpoll.h> @@ -25,7 +25,6 @@ #include <linux/percpu_counter.h> #include <linux/percpu.h> #include <linux/task_work.h> -#include <linux/ima.h> #include <linux/swap.h> #include <linux/kmemleak.h> @@ -39,24 +38,46 @@ static struct files_stat_struct files_stat = { }; /* SLAB cache for file structures */ -static struct kmem_cache *filp_cachep __read_mostly; +static struct kmem_cache *filp_cachep __ro_after_init; +static struct kmem_cache *bfilp_cachep __ro_after_init; static struct percpu_counter nr_files __cacheline_aligned_in_smp; -static void file_free_rcu(struct rcu_head *head) +/* Container for backing file with optional user path */ +struct backing_file { + struct file file; + union { + struct path user_path; + freeptr_t bf_freeptr; + }; +}; + +#define backing_file(f) container_of(f, struct backing_file, file) + +const struct path *backing_file_user_path(const struct file *f) { - struct file *f = container_of(head, struct file, f_rcuhead); + return &backing_file(f)->user_path; +} +EXPORT_SYMBOL_GPL(backing_file_user_path); - put_cred(f->f_cred); - kmem_cache_free(filp_cachep, f); +void backing_file_set_user_path(struct file *f, const struct path *path) +{ + backing_file(f)->user_path = *path; } +EXPORT_SYMBOL_GPL(backing_file_set_user_path); static inline void file_free(struct file *f) { security_file_free(f); - if (!(f->f_mode & FMODE_NOACCOUNT)) + if (likely(!(f->f_mode & FMODE_NOACCOUNT))) percpu_counter_dec(&nr_files); - call_rcu(&f->f_rcuhead, file_free_rcu); + put_cred(f->f_cred); + if (unlikely(f->f_mode & FMODE_BACKING)) { + path_put(backing_file_user_path(f)); + kmem_cache_free(bfilp_cachep, backing_file(f)); + } else { + kmem_cache_free(filp_cachep, f); + } } /* @@ -81,14 +102,14 @@ EXPORT_SYMBOL_GPL(get_max_files); /* * Handle nr_files sysctl */ -static int proc_nr_files(struct ctl_table *table, int write, void *buffer, +static int proc_nr_files(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - files_stat.nr_files = get_nr_files(); + files_stat.nr_files = percpu_counter_sum_positive(&nr_files); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } -static struct ctl_table fs_stat_sysctls[] = { +static const struct ctl_table fs_stat_sysctls[] = { { .procname = "file-nr", .data = &files_stat, @@ -110,11 +131,10 @@ static struct ctl_table fs_stat_sysctls[] = { .data = &sysctl_nr_open, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_douintvec_minmax, .extra1 = &sysctl_nr_open_min, .extra2 = &sysctl_nr_open_max, }, - { } }; static int __init init_fs_stat_sysctls(void) @@ -122,6 +142,7 @@ static int __init init_fs_stat_sysctls(void) register_sysctl_init("fs", fs_stat_sysctls); if (IS_ENABLED(CONFIG_BINFMT_MISC)) { struct ctl_table_header *hdr; + hdr = register_sysctl_mount_point("fs/binfmt_misc"); kmemleak_not_leak(hdr); } @@ -130,31 +151,58 @@ static int __init init_fs_stat_sysctls(void) fs_initcall(init_fs_stat_sysctls); #endif -static struct file *__alloc_file(int flags, const struct cred *cred) +static int init_file(struct file *f, int flags, const struct cred *cred) { - struct file *f; int error; - f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); - if (unlikely(!f)) - return ERR_PTR(-ENOMEM); - f->f_cred = get_cred(cred); error = security_file_alloc(f); if (unlikely(error)) { - file_free_rcu(&f->f_rcuhead); - return ERR_PTR(error); + put_cred(f->f_cred); + return error; } - atomic_long_set(&f->f_count, 1); - rwlock_init(&f->f_owner.lock); spin_lock_init(&f->f_lock); + /* + * Note that f_pos_lock is only used for files raising + * FMODE_ATOMIC_POS and directories. Other files such as pipes + * don't need it and since f_pos_lock is in a union may reuse + * the space for other purposes. They are expected to initialize + * the respective member when opening the file. + */ mutex_init(&f->f_pos_lock); - f->f_flags = flags; - f->f_mode = OPEN_FMODE(flags); - /* f->f_version: 0 */ + memset(&f->__f_path, 0, sizeof(f->f_path)); + memset(&f->f_ra, 0, sizeof(f->f_ra)); + + f->f_flags = flags; + f->f_mode = OPEN_FMODE(flags); + + f->f_op = NULL; + f->f_mapping = NULL; + f->private_data = NULL; + f->f_inode = NULL; + f->f_owner = NULL; +#ifdef CONFIG_EPOLL + f->f_ep = NULL; +#endif - return f; + f->f_iocb_flags = 0; + f->f_pos = 0; + f->f_wb_err = 0; + f->f_sb_err = 0; + + /* + * We're SLAB_TYPESAFE_BY_RCU so initialize f_ref last. While + * fget-rcu pattern users need to be able to handle spurious + * refcount bumps we should reinitialize the reused file first. + */ + file_ref_init(&f->f_ref, 1); + /* + * Disable permission and pre-content events for all files by default. + * They may be enabled later by fsnotify_open_perm_and_set_mode(). + */ + file_set_fsnotify_mode(f, FMODE_NONOTIFY_PERM); + return 0; } /* Find an unused file structure and return a pointer to it. @@ -171,11 +219,13 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) { static long old_max; struct file *f; + int error; /* * Privileged users can go above max_files */ - if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { + if (unlikely(get_nr_files() >= files_stat.max_files) && + !capable(CAP_SYS_ADMIN)) { /* * percpu_counters are inaccurate. Do an expensive check before * we go and fail. @@ -184,9 +234,17 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) goto over; } - f = __alloc_file(flags, cred); - if (!IS_ERR(f)) - percpu_counter_inc(&nr_files); + f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); + if (unlikely(!f)) + return ERR_PTR(-ENOMEM); + + error = init_file(f, flags, cred); + if (unlikely(error)) { + kmem_cache_free(filp_cachep, f); + return ERR_PTR(error); + } + + percpu_counter_inc(&nr_files); return f; @@ -202,35 +260,66 @@ over: /* * Variant of alloc_empty_file() that doesn't check and modify nr_files. * - * Should not be used unless there's a very good reason to do so. + * This is only for kernel internal use, and the allocate file must not be + * installed into file tables or such. */ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred) { - struct file *f = __alloc_file(flags, cred); + struct file *f; + int error; - if (!IS_ERR(f)) - f->f_mode |= FMODE_NOACCOUNT; + f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); + if (unlikely(!f)) + return ERR_PTR(-ENOMEM); + + error = init_file(f, flags, cred); + if (unlikely(error)) { + kmem_cache_free(filp_cachep, f); + return ERR_PTR(error); + } + + f->f_mode |= FMODE_NOACCOUNT; return f; } +/* + * Variant of alloc_empty_file() that allocates a backing_file container + * and doesn't check and modify nr_files. + * + * This is only for kernel internal use, and the allocate file must not be + * installed into file tables or such. + */ +struct file *alloc_empty_backing_file(int flags, const struct cred *cred) +{ + struct backing_file *ff; + int error; + + ff = kmem_cache_alloc(bfilp_cachep, GFP_KERNEL); + if (unlikely(!ff)) + return ERR_PTR(-ENOMEM); + + error = init_file(&ff->file, flags, cred); + if (unlikely(error)) { + kmem_cache_free(bfilp_cachep, ff); + return ERR_PTR(error); + } + + ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT; + return &ff->file; +} + /** - * alloc_file - allocate and initialize a 'struct file' + * file_init_path - initialize a 'struct file' based on path * + * @file: the file to set up * @path: the (dentry, vfsmount) pair for the new file - * @flags: O_... flags with which the new file will be opened * @fop: the 'struct file_operations' for the new file */ -static struct file *alloc_file(const struct path *path, int flags, - const struct file_operations *fop) +static void file_init_path(struct file *file, const struct path *path, + const struct file_operations *fop) { - struct file *file; - - file = alloc_empty_file(flags, current_cred()); - if (IS_ERR(file)) - return file; - - file->f_path = *path; + file->__f_path = *path; file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; file->f_wb_err = filemap_sample_wb_err(file->f_mapping); @@ -248,40 +337,99 @@ static struct file *alloc_file(const struct path *path, int flags, file->f_op = fop; if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(path->dentry->d_inode); +} + +/** + * alloc_file - allocate and initialize a 'struct file' + * + * @path: the (dentry, vfsmount) pair for the new file + * @flags: O_... flags with which the new file will be opened + * @fop: the 'struct file_operations' for the new file + */ +static struct file *alloc_file(const struct path *path, int flags, + const struct file_operations *fop) +{ + struct file *file; + + file = alloc_empty_file(flags, current_cred()); + if (!IS_ERR(file)) + file_init_path(file, path, fop); return file; } +static inline int alloc_path_pseudo(const char *name, struct inode *inode, + struct vfsmount *mnt, struct path *path) +{ + path->dentry = d_alloc_pseudo(mnt->mnt_sb, &QSTR(name)); + if (!path->dentry) + return -ENOMEM; + path->mnt = mntget(mnt); + d_instantiate(path->dentry, inode); + return 0; +} + struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, - const char *name, int flags, - const struct file_operations *fops) + const char *name, int flags, + const struct file_operations *fops) { - static const struct dentry_operations anon_ops = { - .d_dname = simple_dname - }; - struct qstr this = QSTR_INIT(name, strlen(name)); + int ret; struct path path; struct file *file; - path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this); - if (!path.dentry) - return ERR_PTR(-ENOMEM); - if (!mnt->mnt_sb->s_d_op) - d_set_d_op(path.dentry, &anon_ops); - path.mnt = mntget(mnt); - d_instantiate(path.dentry, inode); + ret = alloc_path_pseudo(name, inode, mnt, &path); + if (ret) + return ERR_PTR(ret); + file = alloc_file(&path, flags, fops); if (IS_ERR(file)) { ihold(inode); path_put(&path); + return file; } + /* + * Disable all fsnotify events for pseudo files by default. + * They may be enabled by caller with file_set_fsnotify_mode(). + */ + file_set_fsnotify_mode(file, FMODE_NONOTIFY); return file; } EXPORT_SYMBOL(alloc_file_pseudo); +struct file *alloc_file_pseudo_noaccount(struct inode *inode, + struct vfsmount *mnt, const char *name, + int flags, + const struct file_operations *fops) +{ + int ret; + struct path path; + struct file *file; + + ret = alloc_path_pseudo(name, inode, mnt, &path); + if (ret) + return ERR_PTR(ret); + + file = alloc_empty_file_noaccount(flags, current_cred()); + if (IS_ERR(file)) { + ihold(inode); + path_put(&path); + return file; + } + file_init_path(file, &path, fops); + /* + * Disable all fsnotify events for pseudo files by default. + * They may be enabled by caller with file_set_fsnotify_mode(). + */ + file_set_fsnotify_mode(file, FMODE_NONOTIFY); + return file; +} +EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount); + struct file *alloc_file_clone(struct file *base, int flags, const struct file_operations *fops) { - struct file *f = alloc_file(&base->f_path, flags, fops); + struct file *f; + + f = alloc_file(&base->f_path, flags, fops); if (!IS_ERR(f)) { path_get(&f->f_path); f->f_mapping = base->f_mapping; @@ -311,7 +459,7 @@ static void __fput(struct file *file) eventpoll_release(file); locks_remove_file(file); - ima_file_free(file); + security_file_release(file); if (unlikely(file->f_flags & FASYNC)) { if (file->f_op->fasync) file->f_op->fasync(-1, file, 0); @@ -323,7 +471,7 @@ static void __fput(struct file *file) cdev_put(inode->i_cdev); } fops_put(file->f_op); - put_pid(file->f_owner.pid); + file_f_owner_release(file); put_file_access(file); dput(dentry); if (unlikely(mode & FMODE_NEED_UNMOUNT)) @@ -345,9 +493,11 @@ static void delayed_fput(struct work_struct *unused) static void ____fput(struct callback_head *work) { - __fput(container_of(work, struct file, f_rcuhead)); + __fput(container_of(work, struct file, f_task_work)); } +static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); + /* * If kernel thread really needs to have the final fput() it has done * to complete, call this. The only user right now is the boot - we @@ -361,31 +511,40 @@ static void ____fput(struct callback_head *work) void flush_delayed_fput(void) { delayed_fput(NULL); + flush_delayed_work(&delayed_fput_work); } EXPORT_SYMBOL_GPL(flush_delayed_fput); -static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); +static void __fput_deferred(struct file *file) +{ + struct task_struct *task = current; + + if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) { + file_free(file); + return; + } + + if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { + init_task_work(&file->f_task_work, ____fput); + if (!task_work_add(task, &file->f_task_work, TWA_RESUME)) + return; + /* + * After this task has run exit_task_work(), + * task_work_add() will fail. Fall through to delayed + * fput to avoid leaking *file. + */ + } + + if (llist_add(&file->f_llist, &delayed_fput_list)) + schedule_delayed_work(&delayed_fput_work, 1); +} void fput(struct file *file) { - if (atomic_long_dec_and_test(&file->f_count)) { - struct task_struct *task = current; - - if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { - init_task_work(&file->f_rcuhead, ____fput); - if (!task_work_add(task, &file->f_rcuhead, TWA_RESUME)) - return; - /* - * After this task has run exit_task_work(), - * task_work_add() will fail. Fall through to delayed - * fput to avoid leaking *file. - */ - } - - if (llist_add(&file->f_llist, &delayed_fput_list)) - schedule_delayed_work(&delayed_fput_work, 1); - } + if (unlikely(file_ref_put(&file->f_ref))) + __fput_deferred(file); } +EXPORT_SYMBOL(fput); /* * synchronous analog of fput(); for kernel threads that might be needed @@ -397,20 +556,50 @@ void fput(struct file *file) */ void __fput_sync(struct file *file) { - if (atomic_long_dec_and_test(&file->f_count)) { - struct task_struct *task = current; - BUG_ON(!(task->flags & PF_KTHREAD)); + if (file_ref_put(&file->f_ref)) __fput(file); - } } - -EXPORT_SYMBOL(fput); EXPORT_SYMBOL(__fput_sync); +/* + * Equivalent to __fput_sync(), but optimized for being called with the last + * reference. + * + * See file_ref_put_close() for details. + */ +void fput_close_sync(struct file *file) +{ + if (likely(file_ref_put_close(&file->f_ref))) + __fput(file); +} + +/* + * Equivalent to fput(), but optimized for being called with the last + * reference. + * + * See file_ref_put_close() for details. + */ +void fput_close(struct file *file) +{ + if (file_ref_put_close(&file->f_ref)) + __fput_deferred(file); +} + void __init files_init(void) { - filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); + struct kmem_cache_args args = { + .use_freeptr_offset = true, + .freeptr_offset = offsetof(struct file, f_freeptr), + }; + + filp_cachep = kmem_cache_create("filp", sizeof(struct file), &args, + SLAB_HWCACHE_ALIGN | SLAB_PANIC | + SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); + + args.freeptr_offset = offsetof(struct backing_file, bf_freeptr); + bfilp_cachep = kmem_cache_create("bfilp", sizeof(struct backing_file), + &args, SLAB_HWCACHE_ALIGN | SLAB_PANIC | + SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); percpu_counter_init(&nr_files, 0, GFP_KERNEL); } |
