Diffstat (limited to 'fs/file.c')
| -rw-r--r-- | fs/file.c | 1288 |
1 file changed, 936 insertions, 352 deletions
diff --git a/fs/file.c b/fs/file.c index 4a78f981557a..0a4f3bdb2dec 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/file.c * @@ -9,47 +10,100 @@ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/fs.h> +#include <linux/kernel.h> #include <linux/mm.h> -#include <linux/mmzone.h> -#include <linux/time.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/slab.h> -#include <linux/vmalloc.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/bitops.h> -#include <linux/interrupt.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> -#include <linux/workqueue.h> +#include <linux/close_range.h> +#include <linux/file_ref.h> +#include <net/sock.h> +#include <linux/init_task.h> -int sysctl_nr_open __read_mostly = 1024*1024; -int sysctl_nr_open_min = BITS_PER_LONG; -int sysctl_nr_open_max = 1024 * 1024; /* raised later */ +#include "internal.h" -static void *alloc_fdmem(size_t size) +static noinline bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt) { /* - * Very large allocations can stress page reclaim, so fall back to - * vmalloc() if the allocation size will be considered "large" by the VM. + * If the reference count was already in the dead zone, then this + * put() operation is imbalanced. Warn, put the reference count back to + * DEAD and tell the caller to not deconstruct the object. */ - if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN); - if (data != NULL) - return data; + if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) { + atomic_long_set(&ref->refcnt, FILE_REF_DEAD); + return false; } - return vmalloc(size); + + /* + * This is a put() operation on a saturated refcount. Restore the + * mean saturation value and tell the caller to not deconstruct the + * object. + */ + if (cnt > FILE_REF_MAXREF) + atomic_long_set(&ref->refcnt, FILE_REF_SATURATED); + return false; } -static void free_fdmem(void *ptr) +/** + * __file_ref_put - Slowpath of file_ref_put() + * @ref: Pointer to the reference count + * @cnt: Current reference count + * + * Invoked when the reference count is outside of the valid zone. + * + * Return: + * True if this was the last reference with no future references + * possible. This signals the caller that it can safely schedule the + * object, which is protected by the reference counter, for + * deconstruction. + * + * False if there are still active references or the put() raced + * with a concurrent get()/put() pair. Caller is not allowed to + * deconstruct the protected object. + */ +bool __file_ref_put(file_ref_t *ref, unsigned long cnt) { - is_vmalloc_addr(ptr) ? vfree(ptr) : kfree(ptr); + /* Did this drop the last reference? */ + if (likely(cnt == FILE_REF_NOREF)) { + /* + * Carefully try to set the reference count to FILE_REF_DEAD. + * + * This can fail if a concurrent get() operation has + * elevated it again or the corresponding put() even marked + * it dead already. Both are valid situations and do not + * require a retry. If this fails the caller is not + * allowed to deconstruct the object. + */ + if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD)) + return false; + + /* + * The caller can safely schedule the object for + * deconstruction. Provide acquire ordering. 
+ */ + smp_acquire__after_ctrl_dep(); + return true; + } + + return __file_ref_put_badval(ref, cnt); } +EXPORT_SYMBOL_GPL(__file_ref_put); + +unsigned int sysctl_nr_open __read_mostly = 1024*1024; +unsigned int sysctl_nr_open_min = BITS_PER_LONG; +/* our min() is unusable in constant expressions ;-/ */ +#define __const_min(x, y) ((x) < (y) ? (x) : (y)) +unsigned int sysctl_nr_open_max = + __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; static void __free_fdtable(struct fdtable *fdt) { - free_fdmem(fdt->fd); - free_fdmem(fdt->open_fds); + kvfree(fdt->fd); + kvfree(fdt->open_fds); kfree(fdt); } @@ -58,32 +112,57 @@ static void free_fdtable_rcu(struct rcu_head *rcu) __free_fdtable(container_of(rcu, struct fdtable, rcu)); } +#define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr)) +#define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long)) + +#define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds /* - * Expand the fdset in the files_struct. Called with the files spinlock - * held for write. + * Copy 'count' fd bits from the old table to the new table and clear the extra + * space if any. This does not copy the file pointers. Called with the files + * spinlock held for write. + */ +static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, + unsigned int copy_words) +{ + unsigned int nwords = fdt_words(nfdt); + + bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds, + copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); + bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec, + copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); + bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits, + copy_words, nwords); +} + +/* + * Copy all file descriptors from the old table to the new, expanded table and + * clear the extra space. Called with the files spinlock held for write. */ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) { - unsigned int cpy, set; + size_t cpy, set; BUG_ON(nfdt->max_fds < ofdt->max_fds); cpy = ofdt->max_fds * sizeof(struct file *); set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); memcpy(nfdt->fd, ofdt->fd, cpy); - memset((char *)(nfdt->fd) + cpy, 0, set); + memset((char *)nfdt->fd + cpy, 0, set); - cpy = ofdt->max_fds / BITS_PER_BYTE; - set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE; - memcpy(nfdt->open_fds, ofdt->open_fds, cpy); - memset((char *)(nfdt->open_fds) + cpy, 0, set); - memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy); - memset((char *)(nfdt->close_on_exec) + cpy, 0, set); + copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt)); } -static struct fdtable * alloc_fdtable(unsigned int nr) +/* + * Note how the fdtable bitmap allocations very much have to be a multiple of + * BITS_PER_LONG. This is not only because we walk those things in chunks of + * 'unsigned long' in some places, but simply because that is how the Linux + * kernel bitmaps are defined to work: they are not "bits in an array of bytes", + * they are very much "bits in an array of unsigned long". + */ +static struct fdtable *alloc_fdtable(unsigned int slots_wanted) { struct fdtable *fdt; + unsigned int nr; void *data; /* @@ -91,183 +170,243 @@ static struct fdtable * alloc_fdtable(unsigned int nr) * Allocation steps are keyed to the size of the fdarray, since it * grows far faster than any of the other dynamic data. We try to fit * the fdarray into comfortable page-tuned chunks: starting at 1024B - * and growing in powers of two from there on. + * and growing in powers of two from there on. 
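The zone-based counter behind __file_ref_put() above can be modeled outside the kernel. Below is a minimal C11 sketch, assuming a 64-bit unsigned long and using the same zone layout as the kernel's file_ref.h (ONEREF at zero, a valid zone up to MAXREF, then SATURATED/RELEASED/DEAD above it); the names are local to this sketch and the get() side is simplified to the "caller already holds a reference" case:

#include <stdatomic.h>
#include <stdbool.h>

/* Zone layout as in file_ref.h; the values assume a 64-bit long. */
#define REF_ONEREF     0x0000000000000000UL
#define REF_MAXREF     0x7FFFFFFFFFFFFFFFUL
#define REF_SATURATED  0xA000000000000000UL
#define REF_RELEASED   0xC000000000000000UL
#define REF_DEAD       0xE000000000000000UL
#define REF_NOREF      0xFFFFFFFFFFFFFFFFUL   /* ONEREF - 1: no refs left */

typedef struct { atomic_ulong refcnt; } ref_t;

/* get(): caller must already hold a reference (like get_file()).  A plain
 * increment, with overflow pinned to the saturation zone. */
static void ref_get(ref_t *r)
{
    if (atomic_fetch_add_explicit(&r->refcnt, 1,
                                  memory_order_relaxed) >= REF_MAXREF)
        atomic_store(&r->refcnt, REF_SATURATED);
}

static bool ref_put_slowpath(ref_t *r, unsigned long cnt)
{
    if (cnt == REF_NOREF) {
        /* Last reference dropped: try to mark the object DEAD.  A racing
         * get()/put() makes the cmpxchg fail, and the caller then must
         * not free the object. */
        return atomic_compare_exchange_strong_explicit(&r->refcnt, &cnt,
                        REF_DEAD, memory_order_release, memory_order_relaxed);
    }
    if (cnt >= REF_RELEASED) {
        /* Imbalanced put(): freeze the counter at DEAD, never free. */
        atomic_store(&r->refcnt, REF_DEAD);
        return false;
    }
    /* put() on a saturated counter: restore the saturation value. */
    atomic_store(&r->refcnt, REF_SATURATED);
    return false;
}

/* Returns true only for the final put(), when freeing is safe. */
static bool ref_put(ref_t *r)
{
    unsigned long cnt = atomic_fetch_sub_explicit(&r->refcnt, 1,
                                memory_order_release) - 1;
    if (cnt <= REF_MAXREF)      /* fast path: still referenced */
        return false;
    return ref_put_slowpath(r, cnt);
}

The point of the layout is that the common-case put() is one decrement plus one unsigned comparison; only counts that leave the valid zone take the slowpath, exactly as __file_ref_put() does above.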
Since we called only + * with slots_wanted > BITS_PER_LONG (embedded instance in files->fdtab + * already gives BITS_PER_LONG slots), the above boils down to + * 1. use the smallest power of two large enough to give us that many + * slots. + * 2. on 32bit skip 64 and 128 - the minimal capacity we want there is + * 256 slots (i.e. 1Kb fd array). + * 3. on 64bit don't skip anything, 1Kb fd array means 128 slots there + * and we are never going to be asked for 64 or less. */ - nr /= (1024 / sizeof(struct file *)); - nr = roundup_pow_of_two(nr + 1); - nr *= (1024 / sizeof(struct file *)); + if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256) + nr = 256; + else + nr = roundup_pow_of_two(slots_wanted); /* * Note that this can drive nr *below* what we had passed if sysctl_nr_open - * had been set lower between the check in expand_files() and here. Deal - * with that in caller, it's cheaper that way. + * had been set lower between the check in expand_files() and here. * * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise * bitmaps handling below becomes unpleasant, to put it mildly... */ - if (unlikely(nr > sysctl_nr_open)) - nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; + if (unlikely(nr > sysctl_nr_open)) { + nr = round_down(sysctl_nr_open, BITS_PER_LONG); + if (nr < slots_wanted) + return ERR_PTR(-EMFILE); + } + + /* + * Check if the allocation size would exceed INT_MAX. kvmalloc_array() + * and kvmalloc() will warn if the allocation size is greater than + * INT_MAX, as filp_cachep objects are not __GFP_NOWARN. + * + * This can happen when sysctl_nr_open is set to a very high value and + * a process tries to use a file descriptor near that limit. For example, + * if sysctl_nr_open is set to 1073741816 (0x3ffffff8) - which is what + * systemd typically sets it to - then trying to use a file descriptor + * close to that value will require allocating a file descriptor table + * that exceeds 8GB in size. + */ + if (unlikely(nr > INT_MAX / sizeof(struct file *))) + return ERR_PTR(-EMFILE); - fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL); + fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); if (!fdt) goto out; fdt->max_fds = nr; - data = alloc_fdmem(nr * sizeof(struct file *)); + data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT); if (!data) goto out_fdt; fdt->fd = data; - data = alloc_fdmem(max_t(size_t, - 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES)); + data = kvmalloc(max_t(size_t, + 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES), + GFP_KERNEL_ACCOUNT); if (!data) goto out_arr; fdt->open_fds = data; data += nr / BITS_PER_BYTE; fdt->close_on_exec = data; + data += nr / BITS_PER_BYTE; + fdt->full_fds_bits = data; return fdt; out_arr: - free_fdmem(fdt->fd); + kvfree(fdt->fd); out_fdt: kfree(fdt); out: - return NULL; + return ERR_PTR(-ENOMEM); } /* * Expand the file descriptor table. * This function will allocate a new fdtable and both fd array and fdset, of * the given size. - * Return <0 error code on error; 1 on successful completion. + * Return <0 error code on error; 0 on successful completion. * The files->file_lock should be held on entry, and will be held on exit. 
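The sizing rules spelled out in the comment above reduce to a few lines of arithmetic. A runnable sketch of that computation, with nr_open standing in for sysctl_nr_open and a sizeof(void *) test standing in for IS_ENABLED(CONFIG_32BIT):

#include <errno.h>
#include <limits.h>
#include <stdio.h>

#define BITS_PER_LONG (8 * (int)sizeof(long))

/* Round v up to the next power of two (v >= 1). */
static unsigned long roundup_pow_of_two(unsigned long v)
{
    unsigned long n = 1;
    while (n < v)
        n <<= 1;
    return n;
}

/* Mirror of the slot-count computation in alloc_fdtable().
 * Returns the table size in slots, or -EMFILE. */
static long fdtable_slots(unsigned int slots_wanted, unsigned int nr_open)
{
    unsigned long nr;

    if (sizeof(void *) == 4 && slots_wanted < 256)
        nr = 256;               /* 32-bit: never below a 1KiB fd array */
    else
        nr = roundup_pow_of_two(slots_wanted);

    if (nr > nr_open) {
        /* keep nr a multiple of BITS_PER_LONG for the bitmaps */
        nr = nr_open / BITS_PER_LONG * BITS_PER_LONG;
        if (nr < slots_wanted)
            return -EMFILE;
    }

    /* refuse tables whose fd array would exceed INT_MAX bytes */
    if (nr > INT_MAX / sizeof(void *))
        return -EMFILE;

    return (long)nr;
}

int main(void)
{
    printf("%ld\n", fdtable_slots(100, 1024 * 1024)); /* 128 on 64-bit */
    printf("%ld\n", fdtable_slots(5000, 4096));       /* -EMFILE */
    return 0;
}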
*/ -static int expand_fdtable(struct files_struct *files, int nr) +static int expand_fdtable(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { struct fdtable *new_fdt, *cur_fdt; spin_unlock(&files->file_lock); - new_fdt = alloc_fdtable(nr); - spin_lock(&files->file_lock); - if (!new_fdt) - return -ENOMEM; - /* - * extremely unlikely race - sysctl_nr_open decreased between the check in - * caller and alloc_fdtable(). Cheaper to catch it here... - */ - if (unlikely(new_fdt->max_fds <= nr)) { - __free_fdtable(new_fdt); - return -EMFILE; - } - /* - * Check again since another task may have expanded the fd table while - * we dropped the lock + new_fdt = alloc_fdtable(nr + 1); + + /* make sure all fd_install() have seen resize_in_progress + * or have finished their rcu_read_lock_sched() section. */ + if (atomic_read(&files->count) > 1) + synchronize_rcu(); + + spin_lock(&files->file_lock); + if (IS_ERR(new_fdt)) + return PTR_ERR(new_fdt); cur_fdt = files_fdtable(files); - if (nr >= cur_fdt->max_fds) { - /* Continue as planned */ - copy_fdtable(new_fdt, cur_fdt); - rcu_assign_pointer(files->fdt, new_fdt); - if (cur_fdt != &files->fdtab) - call_rcu(&cur_fdt->rcu, free_fdtable_rcu); - } else { - /* Somebody else expanded, so undo our attempt */ - __free_fdtable(new_fdt); - } - return 1; + BUG_ON(nr < cur_fdt->max_fds); + copy_fdtable(new_fdt, cur_fdt); + rcu_assign_pointer(files->fdt, new_fdt); + if (cur_fdt != &files->fdtab) + call_rcu(&cur_fdt->rcu, free_fdtable_rcu); + /* coupled with smp_rmb() in fd_install() */ + smp_wmb(); + return 0; } /* * Expand files. * This function will expand the file structures, if the requested size exceeds * the current capacity and there is room for expansion. - * Return <0 error code on error; 0 when nothing done; 1 when files were - * expanded and execution may have blocked. + * Return <0 error code on error; 0 on success. * The files->file_lock should be held on entry, and will be held on exit. */ -static int expand_files(struct files_struct *files, int nr) +static int expand_files(struct files_struct *files, unsigned int nr) + __releases(files->file_lock) + __acquires(files->file_lock) { struct fdtable *fdt; + int error; +repeat: fdt = files_fdtable(files); /* Do we need to expand? */ if (nr < fdt->max_fds) return 0; + if (unlikely(files->resize_in_progress)) { + spin_unlock(&files->file_lock); + wait_event(files->resize_wait, !files->resize_in_progress); + spin_lock(&files->file_lock); + goto repeat; + } + /* Can we expand? 
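The resize_in_progress / resize_wait handshake in expand_files() is a classic "single resizer, everyone else sleeps and retries" protocol. A userspace model of the same shape, assuming a pthread condvar in place of the kernel waitqueue and a trivial stand-in for the real reallocation:

#include <pthread.h>
#include <stdbool.h>

struct table {
    pthread_mutex_t lock;        /* files->file_lock */
    pthread_cond_t  resize_wait; /* files->resize_wait */
    bool            resize_in_progress;
    unsigned int    max_fds;
};

/* The kernel drops the lock around the allocation and the RCU grace
 * period; this stand-in just records the new capacity. */
static int expand(struct table *t, unsigned int nr)
{
    t->max_fds = nr + 1;
    return 0;
}

/* Called with t->lock held; returns with it held. */
static int ensure_capacity(struct table *t, unsigned int nr)
{
    int err;
repeat:
    if (nr < t->max_fds)
        return 0;                /* no expansion needed */

    if (t->resize_in_progress) {
        /* Someone else is resizing: sleep, then re-check from scratch -
         * they may already have grown the table enough for us. */
        while (t->resize_in_progress)
            pthread_cond_wait(&t->resize_wait, &t->lock);
        goto repeat;
    }

    t->resize_in_progress = true;               /* we are the one resizer */
    err = expand(t, nr);
    t->resize_in_progress = false;
    pthread_cond_broadcast(&t->resize_wait);    /* wake_up_all() */
    return err;
}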
*/ - if (nr >= sysctl_nr_open) + if (unlikely(nr >= sysctl_nr_open)) return -EMFILE; /* All good, so we try */ - return expand_fdtable(files, nr); -} + files->resize_in_progress = true; + error = expand_fdtable(files, nr); + files->resize_in_progress = false; -static inline void __set_close_on_exec(int fd, struct fdtable *fdt) -{ - __set_bit(fd, fdt->close_on_exec); + wake_up_all(&files->resize_wait); + return error; } -static inline void __clear_close_on_exec(int fd, struct fdtable *fdt) +static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt, + bool set) { - __clear_bit(fd, fdt->close_on_exec); + if (set) { + __set_bit(fd, fdt->close_on_exec); + } else { + if (test_bit(fd, fdt->close_on_exec)) + __clear_bit(fd, fdt->close_on_exec); + } } -static inline void __set_open_fd(int fd, struct fdtable *fdt) +static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set) { __set_bit(fd, fdt->open_fds); + __set_close_on_exec(fd, fdt, set); + fd /= BITS_PER_LONG; + if (!~fdt->open_fds[fd]) + __set_bit(fd, fdt->full_fds_bits); } -static inline void __clear_open_fd(int fd, struct fdtable *fdt) +static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) { __clear_bit(fd, fdt->open_fds); + fd /= BITS_PER_LONG; + if (test_bit(fd, fdt->full_fds_bits)) + __clear_bit(fd, fdt->full_fds_bits); } -static int count_open_files(struct fdtable *fdt) +static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt) { - int size = fdt->max_fds; - int i; + return test_bit(fd, fdt->open_fds); +} - /* Find the last open fd */ - for (i = size / BITS_PER_LONG; i > 0; ) { - if (fdt->open_fds[--i]) - break; +/* + * Note that a sane fdtable size always has to be a multiple of + * BITS_PER_LONG, since we have bitmaps that are sized by this. + * + * punch_hole is optional - when close_range() is asked to unshare + * and close, we don't need to copy descriptors in that range, so + * a smaller cloned descriptor table might suffice if the last + * currently opened descriptor falls into that range. + */ +static unsigned int sane_fdtable_size(struct fdtable *fdt, struct fd_range *punch_hole) +{ + unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds); + + if (last == fdt->max_fds) + return NR_OPEN_DEFAULT; + if (punch_hole && punch_hole->to >= last && punch_hole->from <= last) { + last = find_last_bit(fdt->open_fds, punch_hole->from); + if (last == punch_hole->from) + return NR_OPEN_DEFAULT; } - i = (i + 1) * BITS_PER_LONG; - return i; + return ALIGN(last + 1, BITS_PER_LONG); } /* - * Allocate a new files structure and copy contents from the - * passed in files structure. - * errorp will be valid only when the returned files_struct is NULL. + * Allocate a new descriptor table and copy contents from the passed in + * instance. Returns a pointer to cloned table on success, ERR_PTR() + * on failure. For 'punch_hole' see sane_fdtable_size(). 
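A runnable model of sane_fdtable_size() and its punch-hole logic, with a naive find_last_bit() in place of the kernel's optimized one:

#define BITS_PER_LONG   (8 * (unsigned int)sizeof(long))
#define NR_OPEN_DEFAULT BITS_PER_LONG
#define ALIGN_UP(x, a)  (((x) + (a) - 1) / (a) * (a))

/* Index of the highest set bit below 'max', or 'max' if none is set. */
static unsigned int find_last_bit(const unsigned long *map, unsigned int max)
{
    for (unsigned int i = max; i-- > 0; )
        if (map[i / BITS_PER_LONG] & (1UL << (i % BITS_PER_LONG)))
            return i;
    return max;
}

/* Mirror of sane_fdtable_size(): size the clone to the last open fd,
 * optionally ignoring fds in [from, to] that close_range() will drop. */
static unsigned int sane_size(const unsigned long *open_fds,
                              unsigned int max_fds,
                              unsigned int from, unsigned int to, int punch)
{
    unsigned int last = find_last_bit(open_fds, max_fds);

    if (last == max_fds)
        return NR_OPEN_DEFAULT;         /* nothing open at all */
    if (punch && to >= last && from <= last) {
        last = find_last_bit(open_fds, from);
        if (last == from)
            return NR_OPEN_DEFAULT;     /* nothing open below the hole */
    }
    return ALIGN_UP(last + 1, BITS_PER_LONG);
}

For example, with fds 0-2 and 100 open, a punch hole of [90, 200] lets the clone shrink to 64 slots on 64-bit instead of the 128 it would need to cover fd 100.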
*/ -struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) +struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_hole) { struct files_struct *newf; struct file **old_fds, **new_fds; - int open_files, size, i; + unsigned int open_files, i; struct fdtable *old_fdt, *new_fdt; - *errorp = -ENOMEM; newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); if (!newf) - goto out; + return ERR_PTR(-ENOMEM); atomic_set(&newf->count, 1); spin_lock_init(&newf->file_lock); + newf->resize_in_progress = false; + init_waitqueue_head(&newf->resize_wait); newf->next_fd = 0; new_fdt = &newf->fdtab; new_fdt->max_fds = NR_OPEN_DEFAULT; new_fdt->close_on_exec = newf->close_on_exec_init; new_fdt->open_fds = newf->open_fds_init; + new_fdt->full_fds_bits = newf->full_fds_bits_init; new_fdt->fd = &newf->fd_array[0]; spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); - open_files = count_open_files(old_fdt); + open_files = sane_fdtable_size(old_fdt, punch_hole); /* * Check whether we need to allocate a larger fd array and fd set. @@ -278,17 +417,10 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) if (new_fdt != &newf->fdtab) __free_fdtable(new_fdt); - new_fdt = alloc_fdtable(open_files - 1); - if (!new_fdt) { - *errorp = -ENOMEM; - goto out_release; - } - - /* beyond sysctl_nr_open; nothing to do */ - if (unlikely(new_fdt->max_fds < open_files)) { - __free_fdtable(new_fdt); - *errorp = -EMFILE; - goto out_release; + new_fdt = alloc_fdtable(open_files); + if (IS_ERR(new_fdt)) { + kmem_cache_free(files_cachep, newf); + return ERR_CAST(new_fdt); } /* @@ -298,71 +430,57 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) */ spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); - open_files = count_open_files(old_fdt); + open_files = sane_fdtable_size(old_fdt, punch_hole); } + copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG); + old_fds = old_fdt->fd; new_fds = new_fdt->fd; - memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8); - memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8); - + /* + * We may be racing against fd allocation from other threads using this + * files_struct, despite holding ->file_lock. + * + * alloc_fd() might have already claimed a slot, while fd_install() + * did not populate it yet. Note the latter operates locklessly, so + * the file can show up as we are walking the array below. + * + * At the same time we know no files will disappear as all other + * operations take the lock. + * + * Instead of trying to placate userspace racing with itself, we + * ref the file if we see it and mark the fd slot as unused otherwise. + */ for (i = open_files; i != 0; i--) { - struct file *f = *old_fds++; + struct file *f = rcu_dereference_raw(*old_fds++); if (f) { get_file(f); } else { - /* - * The fd may be claimed in the fd bitmap but not yet - * instantiated in the files array if a sibling thread - * is partway through open(). So make sure that this - * fd is available to the new process. 
- */ __clear_open_fd(open_files - i, new_fdt); } rcu_assign_pointer(*new_fds++, f); } spin_unlock(&oldf->file_lock); - /* compute the remainder to be cleared */ - size = (new_fdt->max_fds - open_files) * sizeof(struct file *); - - /* This is long word aligned thus could use a optimized version */ - memset(new_fds, 0, size); - - if (new_fdt->max_fds > open_files) { - int left = (new_fdt->max_fds - open_files) / 8; - int start = open_files / BITS_PER_LONG; - - memset(&new_fdt->open_fds[start], 0, left); - memset(&new_fdt->close_on_exec[start], 0, left); - } + /* clear the remainder */ + memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *)); rcu_assign_pointer(newf->fdt, new_fdt); return newf; - -out_release: - kmem_cache_free(files_cachep, newf); -out: - return NULL; } -static void close_files(struct files_struct * files) +static struct fdtable *close_files(struct files_struct * files) { - int i, j; - struct fdtable *fdt; - - j = 0; - /* * It is safe to dereference the fd table without RCU or * ->file_lock because this is the last reference to the - * files structure. But use RCU to shut RCU-lockdep up. + * files structure. */ - rcu_read_lock(); - fdt = files_fdtable(files); - rcu_read_unlock(); + struct fdtable *fdt = rcu_dereference_raw(files->fdt); + unsigned int i, j = 0; + for (;;) { unsigned long set; i = j * BITS_PER_LONG; @@ -371,7 +489,7 @@ static void close_files(struct files_struct * files) set = fdt->open_fds[j++]; while (set) { if (set & 1) { - struct file * file = xchg(&fdt->fd[i], NULL); + struct file *file = fdt->fd[i]; if (file) { filp_close(file, files); cond_resched(); @@ -381,31 +499,15 @@ static void close_files(struct files_struct * files) set >>= 1; } } -} - -struct files_struct *get_files_struct(struct task_struct *task) -{ - struct files_struct *files; - - task_lock(task); - files = task->files; - if (files) - atomic_inc(&files->count); - task_unlock(task); - return files; + return fdt; } void put_files_struct(struct files_struct *files) { - struct fdtable *fdt; - if (atomic_dec_and_test(&files->count)) { - close_files(files); - /* not really needed, since nobody can see us */ - rcu_read_lock(); - fdt = files_fdtable(files); - rcu_read_unlock(); + struct fdtable *fdt = close_files(files); + /* free the arrays if they are not embedded */ if (fdt != &files->fdtab) __free_fdtable(fdt); @@ -413,18 +515,6 @@ void put_files_struct(struct files_struct *files) } } -void reset_files_struct(struct files_struct *files) -{ - struct task_struct *tsk = current; - struct files_struct *old; - - old = tsk->files; - task_lock(tsk); - tsk->files = files; - task_unlock(tsk); - put_files_struct(old); -} - void exit_files(struct task_struct *tsk) { struct files_struct * files = tsk->files; @@ -437,12 +527,6 @@ void exit_files(struct task_struct *tsk) } } -void __init files_defer_init(void) -{ - sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) & - -BITS_PER_LONG; -} - struct files_struct init_files = { .count = ATOMIC_INIT(1), .fdt = &init_files.fdtab, @@ -451,16 +535,41 @@ struct files_struct init_files = { .fd = &init_files.fd_array[0], .close_on_exec = init_files.close_on_exec_init, .open_fds = init_files.open_fds_init, + .full_fds_bits = init_files.full_fds_bits_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), + .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait), }; +static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) +{ + unsigned int maxfd = fdt->max_fds; /* always multiple of 
BITS_PER_LONG */ + unsigned int maxbit = maxfd / BITS_PER_LONG; + unsigned int bitbit = start / BITS_PER_LONG; + unsigned int bit; + + /* + * Try to avoid looking at the second level bitmap + */ + bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG, + start & (BITS_PER_LONG - 1)); + if (bit < BITS_PER_LONG) + return bit + bitbit * BITS_PER_LONG; + + bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; + if (bitbit >= maxfd) + return maxfd; + if (bitbit > start) + start = bitbit; + return find_next_zero_bit(fdt->open_fds, maxfd, start); +} + /* * allocate a file descriptor, mark it busy. */ -int __alloc_fd(struct files_struct *files, - unsigned start, unsigned end, unsigned flags) +static int alloc_fd(unsigned start, unsigned end, unsigned flags) { + struct files_struct *files = current->files; unsigned int fd; int error; struct fdtable *fdt; @@ -472,58 +581,45 @@ repeat: if (fd < files->next_fd) fd = files->next_fd; - if (fd < fdt->max_fds) - fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd); + if (likely(fd < fdt->max_fds)) + fd = find_next_fd(fdt, fd); /* * N.B. For clone tasks sharing a files structure, this test * will limit the total number of files that can be opened. */ error = -EMFILE; - if (fd >= end) + if (unlikely(fd >= end)) goto out; - error = expand_files(files, fd); - if (error < 0) - goto out; + if (unlikely(fd >= fdt->max_fds)) { + error = expand_files(files, fd); + if (error < 0) + goto out; - /* - * If we needed to expand the fs array we - * might have blocked - try again. - */ - if (error) goto repeat; + } if (start <= files->next_fd) files->next_fd = fd + 1; - __set_open_fd(fd, fdt); - if (flags & O_CLOEXEC) - __set_close_on_exec(fd, fdt); - else - __clear_close_on_exec(fd, fdt); + __set_open_fd(fd, fdt, flags & O_CLOEXEC); error = fd; -#if 1 - /* Sanity check */ - if (rcu_dereference_raw(fdt->fd[fd]) != NULL) { - printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); - rcu_assign_pointer(fdt->fd[fd], NULL); - } -#endif + VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); out: spin_unlock(&files->file_lock); return error; } -static int alloc_fd(unsigned start, unsigned flags) +int __get_unused_fd_flags(unsigned flags, unsigned long nofile) { - return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags); + return alloc_fd(0, nofile, flags); } int get_unused_fd_flags(unsigned flags) { - return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags); + return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE)); } EXPORT_SYMBOL(get_unused_fd_flags); @@ -546,67 +642,244 @@ void put_unused_fd(unsigned int fd) EXPORT_SYMBOL(put_unused_fd); /* - * Install a file pointer in the fd array. - * - * The VFS is full of places where we drop the files lock between - * setting the open_fds bitmap and installing the file in the file - * array. At any such point, we are vulnerable to a dup2() race - * installing a file in the array before us. We need to detect this and - * fput() the struct file we are about to overwrite in this case. - * - * It should never happen - if we allow dup2() do it, _really_ bad things - * will follow. - * - * NOTE: __fd_install() variant is really, really low-level; don't - * use it unless you are forced to by truly lousy API shoved down - * your throat. 'files' *MUST* be either current->files or obtained - * by get_files_struct(current) done by whoever had given it to you, - * or really bad things will happen. Normally you want to use - * fd_install() instead. 
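find_next_fd() above is a two-level search: full_fds_bits keeps one bit per word of open_fds, maintained by __set_open_fd()/__clear_open_fd() whenever a word fills up or reopens, so the scan can skip fully-occupied words without reading them. A simplified model of that search, with plain loops in place of find_next_zero_bit():

#define BITS_PER_LONG (8 * (unsigned int)sizeof(long))

struct bitmaps {
    const unsigned long *open_fds;      /* one bit per fd */
    const unsigned long *full_fds_bits; /* one bit per open_fds word */
    unsigned int max_fds;               /* multiple of BITS_PER_LONG */
};

static unsigned int next_free_fd(const struct bitmaps *b, unsigned int start)
{
    unsigned int word = start / BITS_PER_LONG;

    /* First look only inside the word that 'start' falls into. */
    for (unsigned int bit = start % BITS_PER_LONG; bit < BITS_PER_LONG; bit++)
        if (!(b->open_fds[word] & (1UL << bit)))
            return word * BITS_PER_LONG + bit;

    /* Then skip whole words known full via the second-level bitmap. */
    for (word++; word < b->max_fds / BITS_PER_LONG; word++) {
        if (b->full_fds_bits[word / BITS_PER_LONG] &
            (1UL << (word % BITS_PER_LONG)))
            continue;                   /* word is completely occupied */
        for (unsigned int bit = 0; bit < BITS_PER_LONG; bit++)
            if (!(b->open_fds[word] & (1UL << bit)))
                return word * BITS_PER_LONG + bit;
    }
    return b->max_fds;                  /* table is full */
}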
+ * Install a file pointer in the fd array while it is being resized. + * + * We need to make sure our update to the array does not get lost as the resizing + * thread can be copying the content as we modify it. + * + * We have two ways to do it: + * - go off CPU waiting for resize_in_progress to clear + * - take the spin lock + * + * The latter is trivial to implement and saves us from having to might_sleep() + * for debugging purposes. + * + * This is moved out of line from fd_install() to convince gcc to optimize that + * routine better. */ - -void __fd_install(struct files_struct *files, unsigned int fd, - struct file *file) +static void noinline fd_install_slowpath(unsigned int fd, struct file *file) { + struct files_struct *files = current->files; struct fdtable *fdt; + spin_lock(&files->file_lock); fdt = files_fdtable(files); - BUG_ON(fdt->fd[fd] != NULL); + VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); rcu_assign_pointer(fdt->fd[fd], file); spin_unlock(&files->file_lock); } +/** + * fd_install - install a file pointer in the fd array + * @fd: file descriptor to install the file in + * @file: the file to install + * + * This consumes the "file" refcount, so callers should treat it + * as if they had called fput(file). + */ void fd_install(unsigned int fd, struct file *file) { - __fd_install(current->files, fd, file); + struct files_struct *files = current->files; + struct fdtable *fdt; + + if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING))) + return; + + rcu_read_lock_sched(); + if (unlikely(files->resize_in_progress)) { + rcu_read_unlock_sched(); + fd_install_slowpath(fd, file); + return; + } + /* coupled with smp_wmb() in expand_fdtable() */ + smp_rmb(); + fdt = rcu_dereference_sched(files->fdt); + VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); + rcu_assign_pointer(fdt->fd[fd], file); + rcu_read_unlock_sched(); } EXPORT_SYMBOL(fd_install); -/* - * The same warnings as for __alloc_fd()/__fd_install() apply here... +/** + * file_close_fd_locked - return file associated with fd + * @files: file struct to retrieve file from + * @fd: file descriptor to retrieve file for + * + * Doesn't take a separate reference count. + * + * Context: files_lock must be held. + * + * Returns: The file associated with @fd (NULL if @fd is not open) */ -int __close_fd(struct files_struct *files, unsigned fd) +struct file *file_close_fd_locked(struct files_struct *files, unsigned fd) { + struct fdtable *fdt = files_fdtable(files); struct file *file; - struct fdtable *fdt; - spin_lock(&files->file_lock); - fdt = files_fdtable(files); + lockdep_assert_held(&files->file_lock); + if (fd >= fdt->max_fds) - goto out_unlock; - file = fdt->fd[fd]; - if (!file) - goto out_unlock; - rcu_assign_pointer(fdt->fd[fd], NULL); - __clear_close_on_exec(fd, fdt); - __put_unused_fd(files, fd); + return NULL; + + fd = array_index_nospec(fd, fdt->max_fds); + file = rcu_dereference_raw(fdt->fd[fd]); + if (file) { + rcu_assign_pointer(fdt->fd[fd], NULL); + __put_unused_fd(files, fd); + } + return file; +} + +int close_fd(unsigned fd) +{ + struct files_struct *files = current->files; + struct file *file; + + spin_lock(&files->file_lock); + file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); + if (!file) + return -EBADF; + return filp_close(file, files); +} +EXPORT_SYMBOL(close_fd); -out_unlock: +/** + * last_fd - return last valid index into fd table + * @fdt: File descriptor table. + * + * Context: Either rcu read lock or files_lock must be held. 
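Since fd_install() consumes the file reference and never fails, the canonical in-kernel usage pairs it with get_unused_fd_flags()/put_unused_fd(). A kernel-side sketch of that idiom - my_create_file() is a hypothetical stand-in for whatever produces the struct file (anon_inode_getfile(), dentry_open(), ...):

static int install_new_fd(void)
{
    struct file *file;
    int fd;

    fd = get_unused_fd_flags(O_CLOEXEC);  /* reserve a slot; ->fd[fd] == NULL */
    if (fd < 0)
        return fd;

    file = my_create_file();               /* hypothetical */
    if (IS_ERR(file)) {
        put_unused_fd(fd);                 /* hand the reserved slot back */
        return PTR_ERR(file);
    }

    fd_install(fd, file);                  /* publishes; consumes the ref */
    return fd;                             /* no fput() here - fd owns it */
}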
+ * + * Returns: Last valid index into fdtable. + */ +static inline unsigned last_fd(struct fdtable *fdt) +{ + return fdt->max_fds - 1; +} + +static inline void __range_cloexec(struct files_struct *cur_fds, + unsigned int fd, unsigned int max_fd) +{ + struct fdtable *fdt; + + /* make sure we're using the correct maximum value */ + spin_lock(&cur_fds->file_lock); + fdt = files_fdtable(cur_fds); + max_fd = min(last_fd(fdt), max_fd); + if (fd <= max_fd) + bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1); + spin_unlock(&cur_fds->file_lock); +} + +static inline void __range_close(struct files_struct *files, unsigned int fd, + unsigned int max_fd) +{ + struct file *file; + unsigned n; + + spin_lock(&files->file_lock); + n = last_fd(files_fdtable(files)); + max_fd = min(max_fd, n); + + for (; fd <= max_fd; fd++) { + file = file_close_fd_locked(files, fd); + if (file) { + spin_unlock(&files->file_lock); + filp_close(file, files); + cond_resched(); + spin_lock(&files->file_lock); + } else if (need_resched()) { + spin_unlock(&files->file_lock); + cond_resched(); + spin_lock(&files->file_lock); + } + } + spin_unlock(&files->file_lock); +} + +/** + * sys_close_range() - Close all file descriptors in a given range. + * + * @fd: starting file descriptor to close + * @max_fd: last file descriptor to close + * @flags: CLOSE_RANGE flags. + * + * This closes a range of file descriptors. All file descriptors + * from @fd up to and including @max_fd are closed. + * Currently, errors to close a given file descriptor are ignored. + */ +SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, + unsigned int, flags) +{ + struct task_struct *me = current; + struct files_struct *cur_fds = me->files, *fds = NULL; + + if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC)) + return -EINVAL; + + if (fd > max_fd) + return -EINVAL; + + if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) { + struct fd_range range = {fd, max_fd}, *punch_hole = ⦥ + + /* + * If the caller requested all fds to be made cloexec we always + * copy all of the file descriptors since they still want to + * use them. + */ + if (flags & CLOSE_RANGE_CLOEXEC) + punch_hole = NULL; + + fds = dup_fd(cur_fds, punch_hole); + if (IS_ERR(fds)) + return PTR_ERR(fds); + /* + * We used to share our file descriptor table, and have now + * created a private one, make sure we're using it below. + */ + swap(cur_fds, fds); + } + + if (flags & CLOSE_RANGE_CLOEXEC) + __range_cloexec(cur_fds, fd, max_fd); + else + __range_close(cur_fds, fd, max_fd); + + if (fds) { + /* + * We're done closing the files we were supposed to. Time to install + * the new file descriptor table and drop the old one. + */ + task_lock(me); + me->files = cur_fds; + task_unlock(me); + put_files_struct(fds); + } + + return 0; +} + +/** + * file_close_fd - return file associated with fd + * @fd: file descriptor to retrieve file for + * + * Doesn't take a separate reference count. 
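From userspace, the syscall defined above is reachable through glibc 2.34+ (older systems can use syscall(__NR_close_range, ...)). A small runnable example exercising both the plain close and the CLOSE_RANGE_CLOEXEC mode; error handling for the open() calls is elided:

#define _GNU_SOURCE
#include <fcntl.h>
#include <limits.h>
#include <linux/close_range.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    int keep  = open("/dev/null", O_RDONLY);
    int leak1 = open("/dev/null", O_RDONLY);
    int leak2 = open("/dev/null", O_RDONLY);

    /* Close every descriptor above 'keep' in one syscall. */
    if (close_range(keep + 1, UINT_MAX, 0) < 0)
        perror("close_range");

    /* Or mark a whole range close-on-exec before an execve(). */
    if (close_range(3, UINT_MAX, CLOSE_RANGE_CLOEXEC) < 0)
        perror("close_range(CLOSE_RANGE_CLOEXEC)");

    printf("kept fd %d; fds %d and %d are closed\n", keep, leak1, leak2);
    return 0;
}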
+ * + * Returns: The file associated with @fd (NULL if @fd is not open) + */ +struct file *file_close_fd(unsigned int fd) +{ + struct files_struct *files = current->files; + struct file *file; + + spin_lock(&files->file_lock); + file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); - return -EBADF; + + return file; } void do_close_on_exec(struct files_struct *files) @@ -645,45 +918,238 @@ void do_close_on_exec(struct files_struct *files) spin_unlock(&files->file_lock); } -struct file *fget(unsigned int fd) +static struct file *__get_file_rcu(struct file __rcu **f) { - struct file *file; - struct files_struct *files = current->files; + struct file __rcu *file; + struct file __rcu *file_reloaded; + struct file __rcu *file_reloaded_cmp; - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - /* File object ref couldn't be taken */ - if (file->f_mode & FMODE_PATH || - !atomic_long_inc_not_zero(&file->f_count)) - file = NULL; + file = rcu_dereference_raw(*f); + if (!file) + return NULL; + + if (unlikely(!file_ref_get(&file->f_ref))) + return ERR_PTR(-EAGAIN); + + file_reloaded = rcu_dereference_raw(*f); + + /* + * Ensure that all accesses have a dependency on the load from + * rcu_dereference_raw() above so we get correct ordering + * between reuse/allocation and the pointer check below. + */ + file_reloaded_cmp = file_reloaded; + OPTIMIZER_HIDE_VAR(file_reloaded_cmp); + + /* + * file_ref_get() above provided a full memory barrier when we + * acquired a reference. + * + * This is paired with the write barrier from assigning to the + * __rcu protected file pointer so that if that pointer still + * matches the current file, we know we have successfully + * acquired a reference to the right file. + * + * If the pointers don't match the file has been reallocated by + * SLAB_TYPESAFE_BY_RCU. + */ + if (file == file_reloaded_cmp) + return file_reloaded; + + fput(file); + return ERR_PTR(-EAGAIN); +} + +/** + * get_file_rcu - try go get a reference to a file under rcu + * @f: the file to get a reference on + * + * This function tries to get a reference on @f carefully verifying that + * @f hasn't been reused. + * + * This function should rarely have to be used and only by users who + * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. + * + * Return: Returns @f with the reference count increased or NULL. + */ +struct file *get_file_rcu(struct file __rcu **f) +{ + for (;;) { + struct file __rcu *file; + + file = __get_file_rcu(f); + if (!IS_ERR(file)) + return file; } - rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(get_file_rcu); + +/** + * get_file_active - try go get a reference to a file + * @f: the file to get a reference on + * + * In contast to get_file_rcu() the pointer itself isn't part of the + * reference counting. + * + * This function should rarely have to be used and only by users who + * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. + * + * Return: Returns @f with the reference count increased or NULL. 
+ */ +struct file *get_file_active(struct file **f) +{ + struct file __rcu *file; + rcu_read_lock(); + file = __get_file_rcu(f); + rcu_read_unlock(); + if (IS_ERR(file)) + file = NULL; return file; } +EXPORT_SYMBOL_GPL(get_file_active); -EXPORT_SYMBOL(fget); +static inline struct file *__fget_files_rcu(struct files_struct *files, + unsigned int fd, fmode_t mask) +{ + for (;;) { + struct file *file; + struct fdtable *fdt = rcu_dereference_raw(files->fdt); + struct file __rcu **fdentry; + unsigned long nospec_mask; -struct file *fget_raw(unsigned int fd) + /* Mask is a 0 for invalid fd's, ~0 for valid ones */ + nospec_mask = array_index_mask_nospec(fd, fdt->max_fds); + + /* + * fdentry points to the 'fd' offset, or fdt->fd[0]. + * Loading from fdt->fd[0] is always safe, because the + * array always exists. + */ + fdentry = fdt->fd + (fd & nospec_mask); + + /* Do the load, then mask any invalid result */ + file = rcu_dereference_raw(*fdentry); + file = (void *)(nospec_mask & (unsigned long)file); + if (unlikely(!file)) + return NULL; + + /* + * Ok, we have a file pointer that was valid at + * some point, but it might have become stale since. + * + * We need to confirm it by incrementing the refcount + * and then check the lookup again. + * + * file_ref_get() gives us a full memory barrier. We + * only really need an 'acquire' one to protect the + * loads below, but we don't have that. + */ + if (unlikely(!file_ref_get(&file->f_ref))) + continue; + + /* + * Such a race can take two forms: + * + * (a) the file ref already went down to zero and the + * file hasn't been reused yet or the file count + * isn't zero but the file has already been reused. + * + * (b) the file table entry has changed under us. + * Note that we don't need to re-check the 'fdt->fd' + * pointer having changed, because it always goes + * hand-in-hand with 'fdt'. + * + * If so, we need to put our ref and try again. + */ + if (unlikely(file != rcu_dereference_raw(*fdentry)) || + unlikely(rcu_dereference_raw(files->fdt) != fdt)) { + fput(file); + continue; + } + + /* + * This isn't the file we're looking for or we're not + * allowed to get a reference to it. + */ + if (unlikely(file->f_mode & mask)) { + fput(file); + return NULL; + } + + /* + * Ok, we have a ref to the file, and checked that it + * still exists. 
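The re-check dance in __get_file_rcu() and __fget_files_rcu() exists because struct file lives in a SLAB_TYPESAFE_BY_RCU cache: a slot can be freed and reused while we look at it, so pinning the refcount only proves we pinned *some* object, and re-reading the slot proves it is still the one published there. A userspace model of that lookup, using acquire loads where the kernel relies on RCU dependency ordering plus the full barrier in file_ref_get():

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct obj {
    atomic_ulong ref;    /* 0 means dying; cannot be revived */
};

/* Like file_ref_get()/atomic_long_inc_not_zero(): only succeeds while
 * somebody else still holds a reference. */
static bool obj_tryget(struct obj *o)
{
    unsigned long c = atomic_load_explicit(&o->ref, memory_order_relaxed);

    while (c != 0)
        if (atomic_compare_exchange_weak_explicit(&o->ref, &c, c + 1,
                        memory_order_acquire, memory_order_relaxed))
            return true;
    return false;
}

static void obj_put(struct obj *o)
{
    if (atomic_fetch_sub_explicit(&o->ref, 1, memory_order_release) == 1) {
        /* last reference: a real cache would free or recycle 'o' here */
    }
}

/* Lookup mirroring __get_file_rcu(): tryget, then re-read and compare. */
static struct obj *slot_get(struct obj *_Atomic *slot)
{
    for (;;) {
        struct obj *o = atomic_load_explicit(slot, memory_order_acquire);

        if (!o)
            return NULL;
        if (!obj_tryget(o))
            continue;       /* object died under us: look again */
        if (o == atomic_load_explicit(slot, memory_order_relaxed))
            return o;       /* still published: the reference is valid */
        obj_put(o);         /* slot was recycled or replaced: retry */
    }
}

Like get_file_rcu() above, slot_get() simply loops until it either observes an empty slot or wins the race.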
+ */ + return file; + } +} + +static struct file *__fget_files(struct files_struct *files, unsigned int fd, + fmode_t mask) { struct file *file; - struct files_struct *files = current->files; rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - /* File object ref couldn't be taken */ - if (!atomic_long_inc_not_zero(&file->f_count)) - file = NULL; - } + file = __fget_files_rcu(files, fd, mask); rcu_read_unlock(); return file; } +static inline struct file *__fget(unsigned int fd, fmode_t mask) +{ + return __fget_files(current->files, fd, mask); +} + +struct file *fget(unsigned int fd) +{ + return __fget(fd, FMODE_PATH); +} +EXPORT_SYMBOL(fget); + +struct file *fget_raw(unsigned int fd) +{ + return __fget(fd, 0); +} EXPORT_SYMBOL(fget_raw); +struct file *fget_task(struct task_struct *task, unsigned int fd) +{ + struct file *file = NULL; + + task_lock(task); + if (task->files) + file = __fget_files(task->files, fd, 0); + task_unlock(task); + + return file; +} + +struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd) +{ + /* Must be called with rcu_read_lock held */ + struct files_struct *files; + unsigned int fd = *ret_fd; + struct file *file = NULL; + + task_lock(task); + files = task->files; + if (files) { + rcu_read_lock(); + for (; fd < files_fdtable(files)->max_fds; fd++) { + file = __fget_files_rcu(files, fd, 0); + if (file) + break; + } + rcu_read_unlock(); + } + task_unlock(task); + *ret_fd = fd; + return file; +} +EXPORT_SYMBOL(fget_task_next); + /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. * @@ -699,115 +1165,167 @@ EXPORT_SYMBOL(fget_raw); * * The fput_needed flag returned by fget_light should be passed to the * corresponding fput_light. + * + * (As an exception to rule 2, you can call filp_close between fget_light and + * fput_light provided that you capture a real refcount with get_file before + * the call to filp_close, and ensure that this real refcount is fput *after* + * the fput_light call.) + * + * See also the documentation in rust/kernel/file.rs. */ -struct file *fget_light(unsigned int fd, int *fput_needed) +static inline struct fd __fget_light(unsigned int fd, fmode_t mask) { - struct file *file; struct files_struct *files = current->files; + struct file *file; - *fput_needed = 0; - if (atomic_read(&files->count) == 1) { - file = fcheck_files(files, fd); - if (file && (file->f_mode & FMODE_PATH)) - file = NULL; + /* + * If another thread is concurrently calling close_fd() followed + * by put_files_struct(), we must not observe the old table + * entry combined with the new refcount - otherwise we could + * return a file that is concurrently being freed. + * + * atomic_read_acquire() pairs with atomic_dec_and_test() in + * put_files_struct(). 
+ */ + if (likely(atomic_read_acquire(&files->count) == 1)) { + file = files_lookup_fd_raw(files, fd); + if (!file || unlikely(file->f_mode & mask)) + return EMPTY_FD; + return BORROWED_FD(file); } else { - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - if (!(file->f_mode & FMODE_PATH) && - atomic_long_inc_not_zero(&file->f_count)) - *fput_needed = 1; - else - /* Didn't get the reference, someone's freed */ - file = NULL; - } - rcu_read_unlock(); + file = __fget_files(files, fd, mask); + if (!file) + return EMPTY_FD; + return CLONED_FD(file); } +} +struct fd fdget(unsigned int fd) +{ + return __fget_light(fd, FMODE_PATH); +} +EXPORT_SYMBOL(fdget); - return file; +struct fd fdget_raw(unsigned int fd) +{ + return __fget_light(fd, 0); +} + +/* + * Try to avoid f_pos locking. We only need it if the + * file is marked for FMODE_ATOMIC_POS, and it can be + * accessed multiple ways. + * + * Always do it for directories, because pidfd_getfd() + * can make a file accessible even if it otherwise would + * not be, and for directories this is a correctness + * issue, not a "POSIX requirement". + */ +static inline bool file_needs_f_pos_lock(struct file *file) +{ + if (!(file->f_mode & FMODE_ATOMIC_POS)) + return false; + if (__file_ref_read_raw(&file->f_ref) != FILE_REF_ONEREF) + return true; + if (file->f_op->iterate_shared) + return true; + return false; } -EXPORT_SYMBOL(fget_light); -struct file *fget_raw_light(unsigned int fd, int *fput_needed) +bool file_seek_cur_needs_f_lock(struct file *file) { - struct file *file; - struct files_struct *files = current->files; + if (!(file->f_mode & FMODE_ATOMIC_POS) && !file->f_op->iterate_shared) + return false; - *fput_needed = 0; - if (atomic_read(&files->count) == 1) { - file = fcheck_files(files, fd); - } else { - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - if (atomic_long_inc_not_zero(&file->f_count)) - *fput_needed = 1; - else - /* Didn't get the reference, someone's freed */ - file = NULL; - } - rcu_read_unlock(); + /* + * Note that we are not guaranteed to be called after fdget_pos() on + * this file obj, in which case the caller is expected to provide the + * appropriate locking. + */ + + return true; +} + +struct fd fdget_pos(unsigned int fd) +{ + struct fd f = fdget(fd); + struct file *file = fd_file(f); + + if (likely(file) && file_needs_f_pos_lock(file)) { + f.word |= FDPUT_POS_UNLOCK; + mutex_lock(&file->f_pos_lock); } + return f; +} - return file; +void __f_unlock_pos(struct file *f) +{ + mutex_unlock(&f->f_pos_lock); } +/* + * We only lock f_pos if we have threads or if the file might be + * shared with another process. In both cases we'll have an elevated + * file count (done either by fdget() or by fork()). 
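The "shared with another process" case mentioned above is easy to observe from userspace: after fork(), parent and child share one open file description and therefore one f_pos. A runnable demonstration, assuming /etc/passwd is readable and at least 5 bytes long:

#include <fcntl.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    char buf[5];
    int fd = open("/etc/passwd", O_RDONLY);

    if (fd < 0)
        return 1;

    if (fork() == 0) {
        /* child advances the offset on the shared file description */
        if (read(fd, buf, sizeof(buf)) < 0)
            _exit(1);
        _exit(0);
    }
    wait(NULL);

    /* Parent observes the child's read: one struct file, one f_pos -
     * the situation file_needs_f_pos_lock() guards against. */
    printf("offset after child's read: %ld\n", (long)lseek(fd, 0, SEEK_CUR));
    return 0;
}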
+ */ + void set_close_on_exec(unsigned int fd, int flag) { struct files_struct *files = current->files; - struct fdtable *fdt; spin_lock(&files->file_lock); - fdt = files_fdtable(files); - if (flag) - __set_close_on_exec(fd, fdt); - else - __clear_close_on_exec(fd, fdt); + __set_close_on_exec(fd, files_fdtable(files), flag); spin_unlock(&files->file_lock); } bool get_close_on_exec(unsigned int fd) { - struct files_struct *files = current->files; - struct fdtable *fdt; bool res; rcu_read_lock(); - fdt = files_fdtable(files); - res = close_on_exec(fd, fdt); + res = close_on_exec(fd, current->files); rcu_read_unlock(); return res; } static int do_dup2(struct files_struct *files, struct file *file, unsigned fd, unsigned flags) +__releases(&files->file_lock) { struct file *tofree; struct fdtable *fdt; /* - * We need to detect attempts to do dup2() over allocated but still - * not finished descriptor. NB: OpenBSD avoids that at the price of - * extra work in their equivalent of fget() - they insert struct - * file immediately after grabbing descriptor, mark it larval if - * more work (e.g. actual opening) is needed and make sure that - * fget() treats larval files as absent. Potentially interesting, - * but while extra work in fget() is trivial, locking implications - * and amount of surgery on open()-related paths in VFS are not. - * FreeBSD fails with -EBADF in the same situation, NetBSD "solution" - * deadlocks in rather amusing ways, AFAICS. All of that is out of - * scope of POSIX or SUS, since neither considers shared descriptor - * tables and this condition does not arise without those. + * dup2() is expected to close the file installed in the target fd slot + * (if any). However, userspace hand-picking a fd may be racing against + * its own threads which happened to allocate it in open() et al but did + * not populate it yet. + * + * Broadly speaking we may be racing against the following: + * fd = get_unused_fd_flags(); // fd slot reserved, ->fd[fd] == NULL + * file = hard_work_goes_here(); + * fd_install(fd, file); // only now ->fd[fd] == file + * + * It is an invariant that a successfully allocated fd has a NULL entry + * in the array until the matching fd_install(). + * + * If we fit the window, we have the fd to populate, yet no target file + * to close. Trying to ignore it and install our new file would violate + * the invariant and make fd_install() overwrite our file. + * + * Things can be done(tm) to handle this. However, the issue does not + * concern legitimate programs and we only need to make sure the kernel + * does not trip over it. + * + * The simplest way out is to return an error if we find ourselves here. + * + * POSIX is silent on the issue, we return -EBUSY. 
*/ fdt = files_fdtable(files); - tofree = fdt->fd[fd]; + fd = array_index_nospec(fd, fdt->max_fds); + tofree = rcu_dereference_raw(fdt->fd[fd]); if (!tofree && fd_is_open(fd, fdt)) goto Ebusy; get_file(file); rcu_assign_pointer(fdt->fd[fd], file); - __set_open_fd(fd, fdt); - if (flags & O_CLOEXEC) - __set_close_on_exec(fd, fdt); - else - __clear_close_on_exec(fd, fdt); + __set_open_fd(fd, fdt, flags & O_CLOEXEC); spin_unlock(&files->file_lock); if (tofree) @@ -826,7 +1344,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags) struct files_struct *files = current->files; if (!file) - return __close_fd(files, fd); + return close_fd(fd); if (fd >= rlimit(RLIMIT_NOFILE)) return -EBADF; @@ -835,14 +1353,70 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags) err = expand_files(files, fd); if (unlikely(err < 0)) goto out_unlock; - return do_dup2(files, file, fd, flags); + err = do_dup2(files, file, fd, flags); + if (err < 0) + return err; + return 0; out_unlock: spin_unlock(&files->file_lock); return err; } -SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) +/** + * receive_fd() - Install received file into file descriptor table + * @file: struct file that was received from another process + * @ufd: __user pointer to write new fd number to + * @o_flags: the O_* flags to apply to the new fd entry + * + * Installs a received file into the file descriptor table, with appropriate + * checks and count updates. Optionally writes the fd number to userspace, if + * @ufd is non-NULL. + * + * This helper handles its own reference counting of the incoming + * struct file. + * + * Returns newly install fd or -ve on error. + */ +int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) +{ + int error; + + error = security_file_receive(file); + if (error) + return error; + + FD_PREPARE(fdf, o_flags, file); + if (fdf.err) + return fdf.err; + get_file(file); + + if (ufd) { + error = put_user(fd_prepare_fd(fdf), ufd); + if (error) + return error; + } + + __receive_sock(fd_prepare_file(fdf)); + return fd_publish(fdf); +} +EXPORT_SYMBOL_GPL(receive_fd); + +int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) +{ + int error; + + error = security_file_receive(file); + if (error) + return error; + error = replace_fd(new_fd, file, o_flags); + if (error) + return error; + __receive_sock(file); + return new_fd; +} + +static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; struct file *file; @@ -859,7 +1433,7 @@ SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) spin_lock(&files->file_lock); err = expand_files(files, newfd); - file = fcheck(oldfd); + file = files_lookup_fd_locked(files, oldfd); if (unlikely(!file)) goto Ebadf; if (unlikely(err < 0)) { @@ -876,19 +1450,28 @@ out_unlock: return err; } +SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) +{ + return ksys_dup3(oldfd, newfd, flags); +} + SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) { if (unlikely(newfd == oldfd)) { /* corner case */ struct files_struct *files = current->files; + struct file *f; int retval = oldfd; rcu_read_lock(); - if (!fcheck_files(files, oldfd)) + f = __fget_files_rcu(files, oldfd, 0); + if (!f) retval = -EBADF; rcu_read_unlock(); + if (f) + fput(f); return retval; } - return sys_dup3(oldfd, newfd, 0); + return ksys_dup3(oldfd, newfd, 0); } SYSCALL_DEFINE1(dup, unsigned int, fildes) @@ -897,7 +1480,7 @@ SYSCALL_DEFINE1(dup, unsigned int, fildes) 
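The dup2() corner case handled above (newfd == oldfd validates the descriptor but installs nothing) is visible from userspace, and contrasts with dup3(), which rejects equal descriptors outright. A runnable check, assuming fd 999 is not in use:

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    int fd = open("/dev/null", O_RDONLY);

    /* dup2(fd, fd) is a no-op returning fd: nothing closed, no new ref. */
    printf("dup2(fd, fd)   = %d\n", dup2(fd, fd));

    /* The corner case still validates the descriptor. */
    errno = 0;
    printf("dup2(999, 999) = %d, errno %d (EBADF)\n", dup2(999, 999), errno);

    /* dup3() has no corner case: equal descriptors are rejected. */
    errno = 0;
    printf("dup3(fd, fd)   = %d, errno %d (EINVAL)\n", dup3(fd, fd, 0), errno);
    return 0;
}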
struct file *file = fget_raw(fildes); if (file) { - ret = get_unused_fd(); + ret = get_unused_fd_flags(0); if (ret >= 0) fd_install(ret, file); else @@ -908,10 +1491,11 @@ SYSCALL_DEFINE1(dup, unsigned int, fildes) int f_dupfd(unsigned int from, struct file *file, unsigned flags) { + unsigned long nofile = rlimit(RLIMIT_NOFILE); int err; - if (from >= rlimit(RLIMIT_NOFILE)) + if (from >= nofile) return -EINVAL; - err = alloc_fd(from, flags); + err = alloc_fd(from, nofile, flags); if (err >= 0) { get_file(file); fd_install(err, file); |
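f_dupfd() above is the backend of fcntl(F_DUPFD) and F_DUPFD_CLOEXEC, which is why a starting point at or above RLIMIT_NOFILE fails with EINVAL rather than EMFILE. A runnable example, assuming the soft limit fits in an int:

#include <fcntl.h>
#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

int main(void)
{
    struct rlimit rl;
    int fd = open("/dev/null", O_RDONLY);

    getrlimit(RLIMIT_NOFILE, &rl);

    /* F_DUPFD: duplicate to the lowest free fd >= 10 - served by f_dupfd(). */
    printf("F_DUPFD >= 10 -> fd %d\n", fcntl(fd, F_DUPFD, 10));

    /* A starting point at or above RLIMIT_NOFILE trips the
     * 'from >= nofile' check above and fails with EINVAL, not EMFILE. */
    if (fcntl(fd, F_DUPFD, (int)rl.rlim_cur) < 0)
        perror("F_DUPFD at RLIMIT_NOFILE");
    return 0;
}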