diff options
Diffstat (limited to 'fs/file.c')
| -rw-r--r-- | fs/file.c | 853 |
1 files changed, 531 insertions, 322 deletions
diff --git a/fs/file.c b/fs/file.c index 35c62b54c9d6..0a4f3bdb2dec 100644 --- a/fs/file.c +++ b/fs/file.c @@ -20,10 +20,79 @@ #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/close_range.h> +#include <linux/file_ref.h> #include <net/sock.h> +#include <linux/init_task.h> #include "internal.h" +static noinline bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt) +{ + /* + * If the reference count was already in the dead zone, then this + * put() operation is imbalanced. Warn, put the reference count back to + * DEAD and tell the caller to not deconstruct the object. + */ + if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) { + atomic_long_set(&ref->refcnt, FILE_REF_DEAD); + return false; + } + + /* + * This is a put() operation on a saturated refcount. Restore the + * mean saturation value and tell the caller to not deconstruct the + * object. + */ + if (cnt > FILE_REF_MAXREF) + atomic_long_set(&ref->refcnt, FILE_REF_SATURATED); + return false; +} + +/** + * __file_ref_put - Slowpath of file_ref_put() + * @ref: Pointer to the reference count + * @cnt: Current reference count + * + * Invoked when the reference count is outside of the valid zone. + * + * Return: + * True if this was the last reference with no future references + * possible. This signals the caller that it can safely schedule the + * object, which is protected by the reference counter, for + * deconstruction. + * + * False if there are still active references or the put() raced + * with a concurrent get()/put() pair. Caller is not allowed to + * deconstruct the protected object. + */ +bool __file_ref_put(file_ref_t *ref, unsigned long cnt) +{ + /* Did this drop the last reference? */ + if (likely(cnt == FILE_REF_NOREF)) { + /* + * Carefully try to set the reference count to FILE_REF_DEAD. + * + * This can fail if a concurrent get() operation has + * elevated it again or the corresponding put() even marked + * it dead already. Both are valid situations and do not + * require a retry. If this fails the caller is not + * allowed to deconstruct the object. + */ + if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD)) + return false; + + /* + * The caller can safely schedule the object for + * deconstruction. Provide acquire ordering. + */ + smp_acquire__after_ctrl_dep(); + return true; + } + + return __file_ref_put_badval(ref, cnt); +} +EXPORT_SYMBOL_GPL(__file_ref_put); + unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; /* our min() is unusable in constant expressions ;-/ */ @@ -46,27 +115,23 @@ static void free_fdtable_rcu(struct rcu_head *rcu) #define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr)) #define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long)) +#define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds /* * Copy 'count' fd bits from the old table to the new table and clear the extra * space if any. This does not copy the file pointers. Called with the files * spinlock held for write. */ -static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, - unsigned int count) +static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, + unsigned int copy_words) { - unsigned int cpy, set; - - cpy = count / BITS_PER_BYTE; - set = (nfdt->max_fds - count) / BITS_PER_BYTE; - memcpy(nfdt->open_fds, ofdt->open_fds, cpy); - memset((char *)nfdt->open_fds + cpy, 0, set); - memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy); - memset((char *)nfdt->close_on_exec + cpy, 0, set); - - cpy = BITBIT_SIZE(count); - set = BITBIT_SIZE(nfdt->max_fds) - cpy; - memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy); - memset((char *)nfdt->full_fds_bits + cpy, 0, set); + unsigned int nwords = fdt_words(nfdt); + + bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds, + copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); + bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec, + copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); + bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits, + copy_words, nwords); } /* @@ -84,7 +149,7 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) memcpy(nfdt->fd, ofdt->fd, cpy); memset((char *)nfdt->fd + cpy, 0, set); - copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds); + copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt)); } /* @@ -93,18 +158,11 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) * 'unsigned long' in some places, but simply because that is how the Linux * kernel bitmaps are defined to work: they are not "bits in an array of bytes", * they are very much "bits in an array of unsigned long". - * - * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied - * by that "1024/sizeof(ptr)" before, we already know there are sufficient - * clear low bits. Clang seems to realize that, gcc ends up being confused. - * - * On a 128-bit machine, the ALIGN() would actually matter. In the meantime, - * let's consider it documentation (and maybe a test-case for gcc to improve - * its code generation ;) */ -static struct fdtable * alloc_fdtable(unsigned int nr) +static struct fdtable *alloc_fdtable(unsigned int slots_wanted) { struct fdtable *fdt; + unsigned int nr; void *data; /* @@ -112,22 +170,47 @@ static struct fdtable * alloc_fdtable(unsigned int nr) * Allocation steps are keyed to the size of the fdarray, since it * grows far faster than any of the other dynamic data. We try to fit * the fdarray into comfortable page-tuned chunks: starting at 1024B - * and growing in powers of two from there on. + * and growing in powers of two from there on. Since we called only + * with slots_wanted > BITS_PER_LONG (embedded instance in files->fdtab + * already gives BITS_PER_LONG slots), the above boils down to + * 1. use the smallest power of two large enough to give us that many + * slots. + * 2. on 32bit skip 64 and 128 - the minimal capacity we want there is + * 256 slots (i.e. 1Kb fd array). + * 3. on 64bit don't skip anything, 1Kb fd array means 128 slots there + * and we are never going to be asked for 64 or less. */ - nr /= (1024 / sizeof(struct file *)); - nr = roundup_pow_of_two(nr + 1); - nr *= (1024 / sizeof(struct file *)); - nr = ALIGN(nr, BITS_PER_LONG); + if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256) + nr = 256; + else + nr = roundup_pow_of_two(slots_wanted); /* * Note that this can drive nr *below* what we had passed if sysctl_nr_open - * had been set lower between the check in expand_files() and here. Deal - * with that in caller, it's cheaper that way. + * had been set lower between the check in expand_files() and here. * * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise * bitmaps handling below becomes unpleasant, to put it mildly... */ - if (unlikely(nr > sysctl_nr_open)) - nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; + if (unlikely(nr > sysctl_nr_open)) { + nr = round_down(sysctl_nr_open, BITS_PER_LONG); + if (nr < slots_wanted) + return ERR_PTR(-EMFILE); + } + + /* + * Check if the allocation size would exceed INT_MAX. kvmalloc_array() + * and kvmalloc() will warn if the allocation size is greater than + * INT_MAX, as filp_cachep objects are not __GFP_NOWARN. + * + * This can happen when sysctl_nr_open is set to a very high value and + * a process tries to use a file descriptor near that limit. For example, + * if sysctl_nr_open is set to 1073741816 (0x3ffffff8) - which is what + * systemd typically sets it to - then trying to use a file descriptor + * close to that value will require allocating a file descriptor table + * that exceeds 8GB in size. + */ + if (unlikely(nr > INT_MAX / sizeof(struct file *))) + return ERR_PTR(-EMFILE); fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); if (!fdt) @@ -156,14 +239,14 @@ out_arr: out_fdt: kfree(fdt); out: - return NULL; + return ERR_PTR(-ENOMEM); } /* * Expand the file descriptor table. * This function will allocate a new fdtable and both fd array and fdset, of * the given size. - * Return <0 error code on error; 1 on successful completion. + * Return <0 error code on error; 0 on successful completion. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_fdtable(struct files_struct *files, unsigned int nr) @@ -173,7 +256,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr) struct fdtable *new_fdt, *cur_fdt; spin_unlock(&files->file_lock); - new_fdt = alloc_fdtable(nr); + new_fdt = alloc_fdtable(nr + 1); /* make sure all fd_install() have seen resize_in_progress * or have finished their rcu_read_lock_sched() section. @@ -182,16 +265,8 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr) synchronize_rcu(); spin_lock(&files->file_lock); - if (!new_fdt) - return -ENOMEM; - /* - * extremely unlikely race - sysctl_nr_open decreased between the check in - * caller and alloc_fdtable(). Cheaper to catch it here... - */ - if (unlikely(new_fdt->max_fds <= nr)) { - __free_fdtable(new_fdt); - return -EMFILE; - } + if (IS_ERR(new_fdt)) + return PTR_ERR(new_fdt); cur_fdt = files_fdtable(files); BUG_ON(nr < cur_fdt->max_fds); copy_fdtable(new_fdt, cur_fdt); @@ -200,15 +275,14 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr) call_rcu(&cur_fdt->rcu, free_fdtable_rcu); /* coupled with smp_rmb() in fd_install() */ smp_wmb(); - return 1; + return 0; } /* * Expand files. * This function will expand the file structures, if the requested size exceeds * the current capacity and there is room for expansion. - * Return <0 error code on error; 0 when nothing done; 1 when files were - * expanded and execution may have blocked. + * Return <0 error code on error; 0 on success. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_files(struct files_struct *files, unsigned int nr) @@ -216,50 +290,50 @@ static int expand_files(struct files_struct *files, unsigned int nr) __acquires(files->file_lock) { struct fdtable *fdt; - int expanded = 0; + int error; repeat: fdt = files_fdtable(files); /* Do we need to expand? */ if (nr < fdt->max_fds) - return expanded; - - /* Can we expand? */ - if (nr >= sysctl_nr_open) - return -EMFILE; + return 0; if (unlikely(files->resize_in_progress)) { spin_unlock(&files->file_lock); - expanded = 1; wait_event(files->resize_wait, !files->resize_in_progress); spin_lock(&files->file_lock); goto repeat; } + /* Can we expand? */ + if (unlikely(nr >= sysctl_nr_open)) + return -EMFILE; + /* All good, so we try */ files->resize_in_progress = true; - expanded = expand_fdtable(files, nr); + error = expand_fdtable(files, nr); files->resize_in_progress = false; wake_up_all(&files->resize_wait); - return expanded; -} - -static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt) -{ - __set_bit(fd, fdt->close_on_exec); + return error; } -static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt) +static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt, + bool set) { - if (test_bit(fd, fdt->close_on_exec)) - __clear_bit(fd, fdt->close_on_exec); + if (set) { + __set_bit(fd, fdt->close_on_exec); + } else { + if (test_bit(fd, fdt->close_on_exec)) + __clear_bit(fd, fdt->close_on_exec); + } } -static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt) +static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set) { __set_bit(fd, fdt->open_fds); + __set_close_on_exec(fd, fdt, set); fd /= BITS_PER_LONG; if (!~fdt->open_fds[fd]) __set_bit(fd, fdt->full_fds_bits); @@ -268,62 +342,54 @@ static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt) static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) { __clear_bit(fd, fdt->open_fds); - __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits); + fd /= BITS_PER_LONG; + if (test_bit(fd, fdt->full_fds_bits)) + __clear_bit(fd, fdt->full_fds_bits); } -static unsigned int count_open_files(struct fdtable *fdt) +static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt) { - unsigned int size = fdt->max_fds; - unsigned int i; - - /* Find the last open fd */ - for (i = size / BITS_PER_LONG; i > 0; ) { - if (fdt->open_fds[--i]) - break; - } - i = (i + 1) * BITS_PER_LONG; - return i; + return test_bit(fd, fdt->open_fds); } /* * Note that a sane fdtable size always has to be a multiple of * BITS_PER_LONG, since we have bitmaps that are sized by this. * - * 'max_fds' will normally already be properly aligned, but it - * turns out that in the close_range() -> __close_range() -> - * unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end - * up having a 'max_fds' value that isn't already aligned. - * - * Rather than make close_range() have to worry about this, - * just make that BITS_PER_LONG alignment be part of a sane - * fdtable size. Becuase that's really what it is. + * punch_hole is optional - when close_range() is asked to unshare + * and close, we don't need to copy descriptors in that range, so + * a smaller cloned descriptor table might suffice if the last + * currently opened descriptor falls into that range. */ -static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds) +static unsigned int sane_fdtable_size(struct fdtable *fdt, struct fd_range *punch_hole) { - unsigned int count; - - count = count_open_files(fdt); - if (max_fds < NR_OPEN_DEFAULT) - max_fds = NR_OPEN_DEFAULT; - return ALIGN(min(count, max_fds), BITS_PER_LONG); + unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds); + + if (last == fdt->max_fds) + return NR_OPEN_DEFAULT; + if (punch_hole && punch_hole->to >= last && punch_hole->from <= last) { + last = find_last_bit(fdt->open_fds, punch_hole->from); + if (last == punch_hole->from) + return NR_OPEN_DEFAULT; + } + return ALIGN(last + 1, BITS_PER_LONG); } /* - * Allocate a new files structure and copy contents from the - * passed in files structure. - * errorp will be valid only when the returned files_struct is NULL. + * Allocate a new descriptor table and copy contents from the passed in + * instance. Returns a pointer to cloned table on success, ERR_PTR() + * on failure. For 'punch_hole' see sane_fdtable_size(). */ -struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp) +struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_hole) { struct files_struct *newf; struct file **old_fds, **new_fds; unsigned int open_files, i; struct fdtable *old_fdt, *new_fdt; - *errorp = -ENOMEM; newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); if (!newf) - goto out; + return ERR_PTR(-ENOMEM); atomic_set(&newf->count, 1); @@ -340,7 +406,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); - open_files = sane_fdtable_size(old_fdt, max_fds); + open_files = sane_fdtable_size(old_fdt, punch_hole); /* * Check whether we need to allocate a larger fd array and fd set. @@ -351,17 +417,10 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int if (new_fdt != &newf->fdtab) __free_fdtable(new_fdt); - new_fdt = alloc_fdtable(open_files - 1); - if (!new_fdt) { - *errorp = -ENOMEM; - goto out_release; - } - - /* beyond sysctl_nr_open; nothing to do */ - if (unlikely(new_fdt->max_fds < open_files)) { - __free_fdtable(new_fdt); - *errorp = -EMFILE; - goto out_release; + new_fdt = alloc_fdtable(open_files); + if (IS_ERR(new_fdt)) { + kmem_cache_free(files_cachep, newf); + return ERR_CAST(new_fdt); } /* @@ -371,25 +430,33 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int */ spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); - open_files = sane_fdtable_size(old_fdt, max_fds); + open_files = sane_fdtable_size(old_fdt, punch_hole); } - copy_fd_bitmaps(new_fdt, old_fdt, open_files); + copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG); old_fds = old_fdt->fd; new_fds = new_fdt->fd; + /* + * We may be racing against fd allocation from other threads using this + * files_struct, despite holding ->file_lock. + * + * alloc_fd() might have already claimed a slot, while fd_install() + * did not populate it yet. Note the latter operates locklessly, so + * the file can show up as we are walking the array below. + * + * At the same time we know no files will disappear as all other + * operations take the lock. + * + * Instead of trying to placate userspace racing with itself, we + * ref the file if we see it and mark the fd slot as unused otherwise. + */ for (i = open_files; i != 0; i--) { - struct file *f = *old_fds++; + struct file *f = rcu_dereference_raw(*old_fds++); if (f) { get_file(f); } else { - /* - * The fd may be claimed in the fd bitmap but not yet - * instantiated in the files array if a sibling thread - * is partway through open(). So make sure that this - * fd is available to the new process. - */ __clear_open_fd(open_files - i, new_fdt); } rcu_assign_pointer(*new_fds++, f); @@ -402,11 +469,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int rcu_assign_pointer(newf->fdt, new_fdt); return newf; - -out_release: - kmem_cache_free(files_cachep, newf); -out: - return NULL; } static struct fdtable *close_files(struct files_struct * files) @@ -427,7 +489,7 @@ static struct fdtable *close_files(struct files_struct * files) set = fdt->open_fds[j++]; while (set) { if (set & 1) { - struct file * file = xchg(&fdt->fd[i], NULL); + struct file *file = fdt->fd[i]; if (file) { filp_close(file, files); cond_resched(); @@ -481,12 +543,21 @@ struct files_struct init_files = { static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) { - unsigned int maxfd = fdt->max_fds; + unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */ unsigned int maxbit = maxfd / BITS_PER_LONG; unsigned int bitbit = start / BITS_PER_LONG; + unsigned int bit; + + /* + * Try to avoid looking at the second level bitmap + */ + bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG, + start & (BITS_PER_LONG - 1)); + if (bit < BITS_PER_LONG) + return bit + bitbit * BITS_PER_LONG; bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; - if (bitbit > maxfd) + if (bitbit >= maxfd) return maxfd; if (bitbit > start) start = bitbit; @@ -510,7 +581,7 @@ repeat: if (fd < files->next_fd) fd = files->next_fd; - if (fd < fdt->max_fds) + if (likely(fd < fdt->max_fds)) fd = find_next_fd(fdt, fd); /* @@ -518,36 +589,23 @@ repeat: * will limit the total number of files that can be opened. */ error = -EMFILE; - if (fd >= end) + if (unlikely(fd >= end)) goto out; - error = expand_files(files, fd); - if (error < 0) - goto out; + if (unlikely(fd >= fdt->max_fds)) { + error = expand_files(files, fd); + if (error < 0) + goto out; - /* - * If we needed to expand the fs array we - * might have blocked - try again. - */ - if (error) goto repeat; + } if (start <= files->next_fd) files->next_fd = fd + 1; - __set_open_fd(fd, fdt); - if (flags & O_CLOEXEC) - __set_close_on_exec(fd, fdt); - else - __clear_close_on_exec(fd, fdt); + __set_open_fd(fd, fdt, flags & O_CLOEXEC); error = fd; -#if 1 - /* Sanity check */ - if (rcu_access_pointer(fdt->fd[fd]) != NULL) { - printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); - rcu_assign_pointer(fdt->fd[fd], NULL); - } -#endif + VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); out: spin_unlock(&files->file_lock); @@ -584,41 +642,59 @@ void put_unused_fd(unsigned int fd) EXPORT_SYMBOL(put_unused_fd); /* - * Install a file pointer in the fd array. + * Install a file pointer in the fd array while it is being resized. + * + * We need to make sure our update to the array does not get lost as the resizing + * thread can be copying the content as we modify it. + * + * We have two ways to do it: + * - go off CPU waiting for resize_in_progress to clear + * - take the spin lock * - * The VFS is full of places where we drop the files lock between - * setting the open_fds bitmap and installing the file in the file - * array. At any such point, we are vulnerable to a dup2() race - * installing a file in the array before us. We need to detect this and - * fput() the struct file we are about to overwrite in this case. + * The latter is trivial to implement and saves us from having to might_sleep() + * for debugging purposes. * - * It should never happen - if we allow dup2() do it, _really_ bad things - * will follow. + * This is moved out of line from fd_install() to convince gcc to optimize that + * routine better. + */ +static void noinline fd_install_slowpath(unsigned int fd, struct file *file) +{ + struct files_struct *files = current->files; + struct fdtable *fdt; + + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); + rcu_assign_pointer(fdt->fd[fd], file); + spin_unlock(&files->file_lock); +} + +/** + * fd_install - install a file pointer in the fd array + * @fd: file descriptor to install the file in + * @file: the file to install * * This consumes the "file" refcount, so callers should treat it * as if they had called fput(file). */ - void fd_install(unsigned int fd, struct file *file) { struct files_struct *files = current->files; struct fdtable *fdt; - rcu_read_lock_sched(); + if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING))) + return; + rcu_read_lock_sched(); if (unlikely(files->resize_in_progress)) { rcu_read_unlock_sched(); - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - BUG_ON(fdt->fd[fd] != NULL); - rcu_assign_pointer(fdt->fd[fd], file); - spin_unlock(&files->file_lock); + fd_install_slowpath(fd, file); return; } /* coupled with smp_wmb() in expand_fdtable() */ smp_rmb(); fdt = rcu_dereference_sched(files->fdt); - BUG_ON(fdt->fd[fd] != NULL); + VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); rcu_assign_pointer(fdt->fd[fd], file); rcu_read_unlock_sched(); } @@ -626,24 +702,28 @@ void fd_install(unsigned int fd, struct file *file) EXPORT_SYMBOL(fd_install); /** - * pick_file - return file associatd with fd + * file_close_fd_locked - return file associated with fd * @files: file struct to retrieve file from * @fd: file descriptor to retrieve file for * + * Doesn't take a separate reference count. + * * Context: files_lock must be held. * * Returns: The file associated with @fd (NULL if @fd is not open) */ -static struct file *pick_file(struct files_struct *files, unsigned fd) +struct file *file_close_fd_locked(struct files_struct *files, unsigned fd) { struct fdtable *fdt = files_fdtable(files); struct file *file; + lockdep_assert_held(&files->file_lock); + if (fd >= fdt->max_fds) return NULL; fd = array_index_nospec(fd, fdt->max_fds); - file = fdt->fd[fd]; + file = rcu_dereference_raw(fdt->fd[fd]); if (file) { rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); @@ -657,18 +737,18 @@ int close_fd(unsigned fd) struct file *file; spin_lock(&files->file_lock); - file = pick_file(files, fd); + file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); if (!file) return -EBADF; return filp_close(file, files); } -EXPORT_SYMBOL(close_fd); /* for ksys_close() */ +EXPORT_SYMBOL(close_fd); /** * last_fd - return last valid index into fd table - * @cur_fds: files struct + * @fdt: File descriptor table. * * Context: Either rcu read lock or files_lock must be held. * @@ -693,41 +773,45 @@ static inline void __range_cloexec(struct files_struct *cur_fds, spin_unlock(&cur_fds->file_lock); } -static inline void __range_close(struct files_struct *cur_fds, unsigned int fd, +static inline void __range_close(struct files_struct *files, unsigned int fd, unsigned int max_fd) { + struct file *file; unsigned n; - rcu_read_lock(); - n = last_fd(files_fdtable(cur_fds)); - rcu_read_unlock(); + spin_lock(&files->file_lock); + n = last_fd(files_fdtable(files)); max_fd = min(max_fd, n); - while (fd <= max_fd) { - struct file *file; - - spin_lock(&cur_fds->file_lock); - file = pick_file(cur_fds, fd++); - spin_unlock(&cur_fds->file_lock); - + for (; fd <= max_fd; fd++) { + file = file_close_fd_locked(files, fd); if (file) { - /* found a valid file to close */ - filp_close(file, cur_fds); + spin_unlock(&files->file_lock); + filp_close(file, files); cond_resched(); + spin_lock(&files->file_lock); + } else if (need_resched()) { + spin_unlock(&files->file_lock); + cond_resched(); + spin_lock(&files->file_lock); } } + spin_unlock(&files->file_lock); } /** - * __close_range() - Close all file descriptors in a given range. + * sys_close_range() - Close all file descriptors in a given range. * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close + * @flags: CLOSE_RANGE flags. * * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. + * Currently, errors to close a given file descriptor are ignored. */ -int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) +SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, + unsigned int, flags) { struct task_struct *me = current; struct files_struct *cur_fds = me->files, *fds = NULL; @@ -738,37 +822,25 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) if (fd > max_fd) return -EINVAL; - if (flags & CLOSE_RANGE_UNSHARE) { - int ret; - unsigned int max_unshare_fds = NR_OPEN_MAX; + if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) { + struct fd_range range = {fd, max_fd}, *punch_hole = ⦥ /* * If the caller requested all fds to be made cloexec we always * copy all of the file descriptors since they still want to * use them. */ - if (!(flags & CLOSE_RANGE_CLOEXEC)) { - /* - * If the requested range is greater than the current - * maximum, we're closing everything so only copy all - * file descriptors beneath the lowest file descriptor. - */ - rcu_read_lock(); - if (max_fd >= last_fd(files_fdtable(cur_fds))) - max_unshare_fds = fd; - rcu_read_unlock(); - } - - ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds); - if (ret) - return ret; + if (flags & CLOSE_RANGE_CLOEXEC) + punch_hole = NULL; + fds = dup_fd(cur_fds, punch_hole); + if (IS_ERR(fds)) + return PTR_ERR(fds); /* * We used to share our file descriptor table, and have now * created a private one, make sure we're using it below. */ - if (fds) - swap(cur_fds, fds); + swap(cur_fds, fds); } if (flags & CLOSE_RANGE_CLOEXEC) @@ -790,26 +862,21 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) return 0; } -/* - * See close_fd_get_file() below, this variant assumes current->files->file_lock - * is held. - */ -struct file *__close_fd_get_file(unsigned int fd) -{ - return pick_file(current->files, fd); -} - -/* - * variant of close_fd that gets a ref on the file for later fput. - * The caller must ensure that filp_close() called on the file. +/** + * file_close_fd - return file associated with fd + * @fd: file descriptor to retrieve file for + * + * Doesn't take a separate reference count. + * + * Returns: The file associated with @fd (NULL if @fd is not open) */ -struct file *close_fd_get_file(unsigned int fd) +struct file *file_close_fd(unsigned int fd) { struct files_struct *files = current->files; struct file *file; spin_lock(&files->file_lock); - file = pick_file(files, fd); + file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); return file; @@ -851,39 +918,143 @@ void do_close_on_exec(struct files_struct *files) spin_unlock(&files->file_lock); } +static struct file *__get_file_rcu(struct file __rcu **f) +{ + struct file __rcu *file; + struct file __rcu *file_reloaded; + struct file __rcu *file_reloaded_cmp; + + file = rcu_dereference_raw(*f); + if (!file) + return NULL; + + if (unlikely(!file_ref_get(&file->f_ref))) + return ERR_PTR(-EAGAIN); + + file_reloaded = rcu_dereference_raw(*f); + + /* + * Ensure that all accesses have a dependency on the load from + * rcu_dereference_raw() above so we get correct ordering + * between reuse/allocation and the pointer check below. + */ + file_reloaded_cmp = file_reloaded; + OPTIMIZER_HIDE_VAR(file_reloaded_cmp); + + /* + * file_ref_get() above provided a full memory barrier when we + * acquired a reference. + * + * This is paired with the write barrier from assigning to the + * __rcu protected file pointer so that if that pointer still + * matches the current file, we know we have successfully + * acquired a reference to the right file. + * + * If the pointers don't match the file has been reallocated by + * SLAB_TYPESAFE_BY_RCU. + */ + if (file == file_reloaded_cmp) + return file_reloaded; + + fput(file); + return ERR_PTR(-EAGAIN); +} + +/** + * get_file_rcu - try go get a reference to a file under rcu + * @f: the file to get a reference on + * + * This function tries to get a reference on @f carefully verifying that + * @f hasn't been reused. + * + * This function should rarely have to be used and only by users who + * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. + * + * Return: Returns @f with the reference count increased or NULL. + */ +struct file *get_file_rcu(struct file __rcu **f) +{ + for (;;) { + struct file __rcu *file; + + file = __get_file_rcu(f); + if (!IS_ERR(file)) + return file; + } +} +EXPORT_SYMBOL_GPL(get_file_rcu); + +/** + * get_file_active - try go get a reference to a file + * @f: the file to get a reference on + * + * In contast to get_file_rcu() the pointer itself isn't part of the + * reference counting. + * + * This function should rarely have to be used and only by users who + * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. + * + * Return: Returns @f with the reference count increased or NULL. + */ +struct file *get_file_active(struct file **f) +{ + struct file __rcu *file; + + rcu_read_lock(); + file = __get_file_rcu(f); + rcu_read_unlock(); + if (IS_ERR(file)) + file = NULL; + return file; +} +EXPORT_SYMBOL_GPL(get_file_active); + static inline struct file *__fget_files_rcu(struct files_struct *files, - unsigned int fd, fmode_t mask) + unsigned int fd, fmode_t mask) { for (;;) { struct file *file; struct fdtable *fdt = rcu_dereference_raw(files->fdt); struct file __rcu **fdentry; + unsigned long nospec_mask; - if (unlikely(fd >= fdt->max_fds)) - return NULL; + /* Mask is a 0 for invalid fd's, ~0 for valid ones */ + nospec_mask = array_index_mask_nospec(fd, fdt->max_fds); - fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds); + /* + * fdentry points to the 'fd' offset, or fdt->fd[0]. + * Loading from fdt->fd[0] is always safe, because the + * array always exists. + */ + fdentry = fdt->fd + (fd & nospec_mask); + + /* Do the load, then mask any invalid result */ file = rcu_dereference_raw(*fdentry); + file = (void *)(nospec_mask & (unsigned long)file); if (unlikely(!file)) return NULL; - if (unlikely(file->f_mode & mask)) - return NULL; - /* - * Ok, we have a file pointer. However, because we do - * this all locklessly under RCU, we may be racing with - * that file being closed. + * Ok, we have a file pointer that was valid at + * some point, but it might have become stale since. * - * Such a race can take two forms: + * We need to confirm it by incrementing the refcount + * and then check the lookup again. * - * (a) the file ref already went down to zero, - * and get_file_rcu() fails. Just try again: + * file_ref_get() gives us a full memory barrier. We + * only really need an 'acquire' one to protect the + * loads below, but we don't have that. */ - if (unlikely(!get_file_rcu(file))) + if (unlikely(!file_ref_get(&file->f_ref))) continue; /* + * Such a race can take two forms: + * + * (a) the file ref already went down to zero and the + * file hasn't been reused yet or the file count + * isn't zero but the file has already been reused. + * * (b) the file table entry has changed under us. * Note that we don't need to re-check the 'fdt->fd' * pointer having changed, because it always goes @@ -891,13 +1062,22 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, * * If so, we need to put our ref and try again. */ - if (unlikely(rcu_dereference_raw(files->fdt) != fdt) || - unlikely(rcu_dereference_raw(*fdentry) != file)) { + if (unlikely(file != rcu_dereference_raw(*fdentry)) || + unlikely(rcu_dereference_raw(files->fdt) != fdt)) { fput(file); continue; } /* + * This isn't the file we're looking for or we're not + * allowed to get a reference to it. + */ + if (unlikely(file->f_mode & mask)) { + fput(file); + return NULL; + } + + /* * Ok, we have a ref to the file, and checked that it * still exists. */ @@ -946,22 +1126,7 @@ struct file *fget_task(struct task_struct *task, unsigned int fd) return file; } -struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd) -{ - /* Must be called with rcu_read_lock held */ - struct files_struct *files; - struct file *file = NULL; - - task_lock(task); - files = task->files; - if (files) - file = files_lookup_fd_rcu(files, fd); - task_unlock(task); - - return file; -} - -struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd) +struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; @@ -971,17 +1136,19 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret task_lock(task); files = task->files; if (files) { + rcu_read_lock(); for (; fd < files_fdtable(files)->max_fds; fd++) { - file = files_lookup_fd_rcu(files, fd); + file = __fget_files_rcu(files, fd, 0); if (file) break; } + rcu_read_unlock(); } task_unlock(task); *ret_fd = fd; return file; } -EXPORT_SYMBOL(task_lookup_next_fd_rcu); +EXPORT_SYMBOL(fget_task_next); /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. @@ -998,8 +1165,15 @@ EXPORT_SYMBOL(task_lookup_next_fd_rcu); * * The fput_needed flag returned by fget_light should be passed to the * corresponding fput_light. + * + * (As an exception to rule 2, you can call filp_close between fget_light and + * fput_light provided that you capture a real refcount with get_file before + * the call to filp_close, and ensure that this real refcount is fput *after* + * the fput_light call.) + * + * See also the documentation in rust/kernel/file.rs. */ -static unsigned long __fget_light(unsigned int fd, fmode_t mask) +static inline struct fd __fget_light(unsigned int fd, fmode_t mask) { struct files_struct *files = current->files; struct file *file; @@ -1013,39 +1187,74 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask) * atomic_read_acquire() pairs with atomic_dec_and_test() in * put_files_struct(). */ - if (atomic_read_acquire(&files->count) == 1) { + if (likely(atomic_read_acquire(&files->count) == 1)) { file = files_lookup_fd_raw(files, fd); if (!file || unlikely(file->f_mode & mask)) - return 0; - return (unsigned long)file; + return EMPTY_FD; + return BORROWED_FD(file); } else { - file = __fget(fd, mask); + file = __fget_files(files, fd, mask); if (!file) - return 0; - return FDPUT_FPUT | (unsigned long)file; + return EMPTY_FD; + return CLONED_FD(file); } } -unsigned long __fdget(unsigned int fd) +struct fd fdget(unsigned int fd) { return __fget_light(fd, FMODE_PATH); } -EXPORT_SYMBOL(__fdget); +EXPORT_SYMBOL(fdget); -unsigned long __fdget_raw(unsigned int fd) +struct fd fdget_raw(unsigned int fd) { return __fget_light(fd, 0); } -unsigned long __fdget_pos(unsigned int fd) +/* + * Try to avoid f_pos locking. We only need it if the + * file is marked for FMODE_ATOMIC_POS, and it can be + * accessed multiple ways. + * + * Always do it for directories, because pidfd_getfd() + * can make a file accessible even if it otherwise would + * not be, and for directories this is a correctness + * issue, not a "POSIX requirement". + */ +static inline bool file_needs_f_pos_lock(struct file *file) +{ + if (!(file->f_mode & FMODE_ATOMIC_POS)) + return false; + if (__file_ref_read_raw(&file->f_ref) != FILE_REF_ONEREF) + return true; + if (file->f_op->iterate_shared) + return true; + return false; +} + +bool file_seek_cur_needs_f_lock(struct file *file) +{ + if (!(file->f_mode & FMODE_ATOMIC_POS) && !file->f_op->iterate_shared) + return false; + + /* + * Note that we are not guaranteed to be called after fdget_pos() on + * this file obj, in which case the caller is expected to provide the + * appropriate locking. + */ + + return true; +} + +struct fd fdget_pos(unsigned int fd) { - unsigned long v = __fdget(fd); - struct file *file = (struct file *)(v & ~3); + struct fd f = fdget(fd); + struct file *file = fd_file(f); - if (file && (file->f_mode & FMODE_ATOMIC_POS)) { - v |= FDPUT_POS_UNLOCK; + if (likely(file) && file_needs_f_pos_lock(file)) { + f.word |= FDPUT_POS_UNLOCK; mutex_lock(&file->f_pos_lock); } - return v; + return f; } void __f_unlock_pos(struct file *f) @@ -1062,24 +1271,16 @@ void __f_unlock_pos(struct file *f) void set_close_on_exec(unsigned int fd, int flag) { struct files_struct *files = current->files; - struct fdtable *fdt; spin_lock(&files->file_lock); - fdt = files_fdtable(files); - if (flag) - __set_close_on_exec(fd, fdt); - else - __clear_close_on_exec(fd, fdt); + __set_close_on_exec(fd, files_fdtable(files), flag); spin_unlock(&files->file_lock); } bool get_close_on_exec(unsigned int fd) { - struct files_struct *files = current->files; - struct fdtable *fdt; bool res; rcu_read_lock(); - fdt = files_fdtable(files); - res = close_on_exec(fd, fdt); + res = close_on_exec(fd, current->files); rcu_read_unlock(); return res; } @@ -1092,30 +1293,39 @@ __releases(&files->file_lock) struct fdtable *fdt; /* - * We need to detect attempts to do dup2() over allocated but still - * not finished descriptor. NB: OpenBSD avoids that at the price of - * extra work in their equivalent of fget() - they insert struct - * file immediately after grabbing descriptor, mark it larval if - * more work (e.g. actual opening) is needed and make sure that - * fget() treats larval files as absent. Potentially interesting, - * but while extra work in fget() is trivial, locking implications - * and amount of surgery on open()-related paths in VFS are not. - * FreeBSD fails with -EBADF in the same situation, NetBSD "solution" - * deadlocks in rather amusing ways, AFAICS. All of that is out of - * scope of POSIX or SUS, since neither considers shared descriptor - * tables and this condition does not arise without those. + * dup2() is expected to close the file installed in the target fd slot + * (if any). However, userspace hand-picking a fd may be racing against + * its own threads which happened to allocate it in open() et al but did + * not populate it yet. + * + * Broadly speaking we may be racing against the following: + * fd = get_unused_fd_flags(); // fd slot reserved, ->fd[fd] == NULL + * file = hard_work_goes_here(); + * fd_install(fd, file); // only now ->fd[fd] == file + * + * It is an invariant that a successfully allocated fd has a NULL entry + * in the array until the matching fd_install(). + * + * If we fit the window, we have the fd to populate, yet no target file + * to close. Trying to ignore it and install our new file would violate + * the invariant and make fd_install() overwrite our file. + * + * Things can be done(tm) to handle this. However, the issue does not + * concern legitimate programs and we only need to make sure the kernel + * does not trip over it. + * + * The simplest way out is to return an error if we find ourselves here. + * + * POSIX is silent on the issue, we return -EBUSY. */ fdt = files_fdtable(files); - tofree = fdt->fd[fd]; + fd = array_index_nospec(fd, fdt->max_fds); + tofree = rcu_dereference_raw(fdt->fd[fd]); if (!tofree && fd_is_open(fd, fdt)) goto Ebusy; get_file(file); rcu_assign_pointer(fdt->fd[fd], file); - __set_open_fd(fd, fdt); - if (flags & O_CLOEXEC) - __set_close_on_exec(fd, fdt); - else - __clear_close_on_exec(fd, fdt); + __set_open_fd(fd, fdt, flags & O_CLOEXEC); spin_unlock(&files->file_lock); if (tofree) @@ -1143,7 +1353,10 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags) err = expand_files(files, fd); if (unlikely(err < 0)) goto out_unlock; - return do_dup2(files, file, fd, flags); + err = do_dup2(files, file, fd, flags); + if (err < 0) + return err; + return 0; out_unlock: spin_unlock(&files->file_lock); @@ -1151,7 +1364,7 @@ out_unlock: } /** - * __receive_fd() - Install received file into file descriptor table + * receive_fd() - Install received file into file descriptor table * @file: struct file that was received from another process * @ufd: __user pointer to write new fd number to * @o_flags: the O_* flags to apply to the new fd entry @@ -1165,31 +1378,29 @@ out_unlock: * * Returns newly install fd or -ve on error. */ -int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) +int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) { - int new_fd; int error; error = security_file_receive(file); if (error) return error; - new_fd = get_unused_fd_flags(o_flags); - if (new_fd < 0) - return new_fd; + FD_PREPARE(fdf, o_flags, file); + if (fdf.err) + return fdf.err; + get_file(file); if (ufd) { - error = put_user(new_fd, ufd); - if (error) { - put_unused_fd(new_fd); + error = put_user(fd_prepare_fd(fdf), ufd); + if (error) return error; - } } - fd_install(new_fd, get_file(file)); - __receive_sock(file); - return new_fd; + __receive_sock(fd_prepare_file(fdf)); + return fd_publish(fdf); } +EXPORT_SYMBOL_GPL(receive_fd); int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) { @@ -1205,12 +1416,6 @@ int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) return new_fd; } -int receive_fd(struct file *file, unsigned int o_flags) -{ - return __receive_fd(file, NULL, o_flags); -} -EXPORT_SYMBOL_GPL(receive_fd); - static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; @@ -1254,12 +1459,16 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) { if (unlikely(newfd == oldfd)) { /* corner case */ struct files_struct *files = current->files; + struct file *f; int retval = oldfd; rcu_read_lock(); - if (!files_lookup_fd_rcu(files, oldfd)) + f = __fget_files_rcu(files, oldfd, 0); + if (!f) retval = -EBADF; rcu_read_unlock(); + if (f) + fput(f); return retval; } return ksys_dup3(oldfd, newfd, 0); |
