summaryrefslogtreecommitdiff
path: root/fs/file.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/file.c')
-rw-r--r--fs/file.c567
1 files changed, 295 insertions, 272 deletions
diff --git a/fs/file.c b/fs/file.c
index 3b683b9101d8..3a3146664cf3 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -20,10 +20,79 @@
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/close_range.h>
+#include <linux/file_ref.h>
#include <net/sock.h>
+#include <linux/init_task.h>
#include "internal.h"
+static noinline bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt)
+{
+ /*
+ * If the reference count was already in the dead zone, then this
+ * put() operation is imbalanced. Warn, put the reference count back to
+ * DEAD and tell the caller to not deconstruct the object.
+ */
+ if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) {
+ atomic_long_set(&ref->refcnt, FILE_REF_DEAD);
+ return false;
+ }
+
+ /*
+ * This is a put() operation on a saturated refcount. Restore the
+ * mean saturation value and tell the caller to not deconstruct the
+ * object.
+ */
+ if (cnt > FILE_REF_MAXREF)
+ atomic_long_set(&ref->refcnt, FILE_REF_SATURATED);
+ return false;
+}
+
+/**
+ * __file_ref_put - Slowpath of file_ref_put()
+ * @ref: Pointer to the reference count
+ * @cnt: Current reference count
+ *
+ * Invoked when the reference count is outside of the valid zone.
+ *
+ * Return:
+ * True if this was the last reference with no future references
+ * possible. This signals the caller that it can safely schedule the
+ * object, which is protected by the reference counter, for
+ * deconstruction.
+ *
+ * False if there are still active references or the put() raced
+ * with a concurrent get()/put() pair. Caller is not allowed to
+ * deconstruct the protected object.
+ */
+bool __file_ref_put(file_ref_t *ref, unsigned long cnt)
+{
+ /* Did this drop the last reference? */
+ if (likely(cnt == FILE_REF_NOREF)) {
+ /*
+ * Carefully try to set the reference count to FILE_REF_DEAD.
+ *
+ * This can fail if a concurrent get() operation has
+ * elevated it again or the corresponding put() even marked
+ * it dead already. Both are valid situations and do not
+ * require a retry. If this fails the caller is not
+ * allowed to deconstruct the object.
+ */
+ if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD))
+ return false;
+
+ /*
+ * The caller can safely schedule the object for
+ * deconstruction. Provide acquire ordering.
+ */
+ smp_acquire__after_ctrl_dep();
+ return true;
+ }
+
+ return __file_ref_put_badval(ref, cnt);
+}
+EXPORT_SYMBOL_GPL(__file_ref_put);
+
unsigned int sysctl_nr_open __read_mostly = 1024*1024;
unsigned int sysctl_nr_open_min = BITS_PER_LONG;
/* our min() is unusable in constant expressions ;-/ */
@@ -46,27 +115,23 @@ static void free_fdtable_rcu(struct rcu_head *rcu)
#define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr))
#define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long))
+#define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds
/*
* Copy 'count' fd bits from the old table to the new table and clear the extra
* space if any. This does not copy the file pointers. Called with the files
* spinlock held for write.
*/
-static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
- unsigned int count)
+static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
+ unsigned int copy_words)
{
- unsigned int cpy, set;
-
- cpy = count / BITS_PER_BYTE;
- set = (nfdt->max_fds - count) / BITS_PER_BYTE;
- memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
- memset((char *)nfdt->open_fds + cpy, 0, set);
- memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
- memset((char *)nfdt->close_on_exec + cpy, 0, set);
-
- cpy = BITBIT_SIZE(count);
- set = BITBIT_SIZE(nfdt->max_fds) - cpy;
- memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy);
- memset((char *)nfdt->full_fds_bits + cpy, 0, set);
+ unsigned int nwords = fdt_words(nfdt);
+
+ bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds,
+ copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG);
+ bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec,
+ copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG);
+ bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits,
+ copy_words, nwords);
}
/*
@@ -84,7 +149,7 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
memcpy(nfdt->fd, ofdt->fd, cpy);
memset((char *)nfdt->fd + cpy, 0, set);
- copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
+ copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt));
}
/*
@@ -93,18 +158,11 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
* 'unsigned long' in some places, but simply because that is how the Linux
* kernel bitmaps are defined to work: they are not "bits in an array of bytes",
* they are very much "bits in an array of unsigned long".
- *
- * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied
- * by that "1024/sizeof(ptr)" before, we already know there are sufficient
- * clear low bits. Clang seems to realize that, gcc ends up being confused.
- *
- * On a 128-bit machine, the ALIGN() would actually matter. In the meantime,
- * let's consider it documentation (and maybe a test-case for gcc to improve
- * its code generation ;)
*/
-static struct fdtable * alloc_fdtable(unsigned int nr)
+static struct fdtable *alloc_fdtable(unsigned int slots_wanted)
{
struct fdtable *fdt;
+ unsigned int nr;
void *data;
/*
@@ -112,22 +170,32 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
* Allocation steps are keyed to the size of the fdarray, since it
* grows far faster than any of the other dynamic data. We try to fit
* the fdarray into comfortable page-tuned chunks: starting at 1024B
- * and growing in powers of two from there on.
+ * and growing in powers of two from there on. Since we called only
+ * with slots_wanted > BITS_PER_LONG (embedded instance in files->fdtab
+ * already gives BITS_PER_LONG slots), the above boils down to
+ * 1. use the smallest power of two large enough to give us that many
+ * slots.
+ * 2. on 32bit skip 64 and 128 - the minimal capacity we want there is
+ * 256 slots (i.e. 1Kb fd array).
+ * 3. on 64bit don't skip anything, 1Kb fd array means 128 slots there
+ * and we are never going to be asked for 64 or less.
*/
- nr /= (1024 / sizeof(struct file *));
- nr = roundup_pow_of_two(nr + 1);
- nr *= (1024 / sizeof(struct file *));
- nr = ALIGN(nr, BITS_PER_LONG);
+ if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256)
+ nr = 256;
+ else
+ nr = roundup_pow_of_two(slots_wanted);
/*
* Note that this can drive nr *below* what we had passed if sysctl_nr_open
- * had been set lower between the check in expand_files() and here. Deal
- * with that in caller, it's cheaper that way.
+ * had been set lower between the check in expand_files() and here.
*
* We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
* bitmaps handling below becomes unpleasant, to put it mildly...
*/
- if (unlikely(nr > sysctl_nr_open))
- nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
+ if (unlikely(nr > sysctl_nr_open)) {
+ nr = round_down(sysctl_nr_open, BITS_PER_LONG);
+ if (nr < slots_wanted)
+ return ERR_PTR(-EMFILE);
+ }
fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
if (!fdt)
@@ -156,14 +224,14 @@ out_arr:
out_fdt:
kfree(fdt);
out:
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
/*
* Expand the file descriptor table.
* This function will allocate a new fdtable and both fd array and fdset, of
* the given size.
- * Return <0 error code on error; 1 on successful completion.
+ * Return <0 error code on error; 0 on successful completion.
* The files->file_lock should be held on entry, and will be held on exit.
*/
static int expand_fdtable(struct files_struct *files, unsigned int nr)
@@ -173,7 +241,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr)
struct fdtable *new_fdt, *cur_fdt;
spin_unlock(&files->file_lock);
- new_fdt = alloc_fdtable(nr);
+ new_fdt = alloc_fdtable(nr + 1);
/* make sure all fd_install() have seen resize_in_progress
* or have finished their rcu_read_lock_sched() section.
@@ -182,16 +250,8 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr)
synchronize_rcu();
spin_lock(&files->file_lock);
- if (!new_fdt)
- return -ENOMEM;
- /*
- * extremely unlikely race - sysctl_nr_open decreased between the check in
- * caller and alloc_fdtable(). Cheaper to catch it here...
- */
- if (unlikely(new_fdt->max_fds <= nr)) {
- __free_fdtable(new_fdt);
- return -EMFILE;
- }
+ if (IS_ERR(new_fdt))
+ return PTR_ERR(new_fdt);
cur_fdt = files_fdtable(files);
BUG_ON(nr < cur_fdt->max_fds);
copy_fdtable(new_fdt, cur_fdt);
@@ -200,15 +260,14 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr)
call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
/* coupled with smp_rmb() in fd_install() */
smp_wmb();
- return 1;
+ return 0;
}
/*
* Expand files.
* This function will expand the file structures, if the requested size exceeds
* the current capacity and there is room for expansion.
- * Return <0 error code on error; 0 when nothing done; 1 when files were
- * expanded and execution may have blocked.
+ * Return <0 error code on error; 0 on success.
* The files->file_lock should be held on entry, and will be held on exit.
*/
static int expand_files(struct files_struct *files, unsigned int nr)
@@ -216,50 +275,50 @@ static int expand_files(struct files_struct *files, unsigned int nr)
__acquires(files->file_lock)
{
struct fdtable *fdt;
- int expanded = 0;
+ int error;
repeat:
fdt = files_fdtable(files);
/* Do we need to expand? */
if (nr < fdt->max_fds)
- return expanded;
-
- /* Can we expand? */
- if (nr >= sysctl_nr_open)
- return -EMFILE;
+ return 0;
if (unlikely(files->resize_in_progress)) {
spin_unlock(&files->file_lock);
- expanded = 1;
wait_event(files->resize_wait, !files->resize_in_progress);
spin_lock(&files->file_lock);
goto repeat;
}
+ /* Can we expand? */
+ if (unlikely(nr >= sysctl_nr_open))
+ return -EMFILE;
+
/* All good, so we try */
files->resize_in_progress = true;
- expanded = expand_fdtable(files, nr);
+ error = expand_fdtable(files, nr);
files->resize_in_progress = false;
wake_up_all(&files->resize_wait);
- return expanded;
-}
-
-static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt)
-{
- __set_bit(fd, fdt->close_on_exec);
+ return error;
}
-static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt)
+static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt,
+ bool set)
{
- if (test_bit(fd, fdt->close_on_exec))
- __clear_bit(fd, fdt->close_on_exec);
+ if (set) {
+ __set_bit(fd, fdt->close_on_exec);
+ } else {
+ if (test_bit(fd, fdt->close_on_exec))
+ __clear_bit(fd, fdt->close_on_exec);
+ }
}
-static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
+static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set)
{
__set_bit(fd, fdt->open_fds);
+ __set_close_on_exec(fd, fdt, set);
fd /= BITS_PER_LONG;
if (!~fdt->open_fds[fd])
__set_bit(fd, fdt->full_fds_bits);
@@ -268,62 +327,54 @@ static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
{
__clear_bit(fd, fdt->open_fds);
- __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);
+ fd /= BITS_PER_LONG;
+ if (test_bit(fd, fdt->full_fds_bits))
+ __clear_bit(fd, fdt->full_fds_bits);
}
-static unsigned int count_open_files(struct fdtable *fdt)
+static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
{
- unsigned int size = fdt->max_fds;
- unsigned int i;
-
- /* Find the last open fd */
- for (i = size / BITS_PER_LONG; i > 0; ) {
- if (fdt->open_fds[--i])
- break;
- }
- i = (i + 1) * BITS_PER_LONG;
- return i;
+ return test_bit(fd, fdt->open_fds);
}
/*
* Note that a sane fdtable size always has to be a multiple of
* BITS_PER_LONG, since we have bitmaps that are sized by this.
*
- * 'max_fds' will normally already be properly aligned, but it
- * turns out that in the close_range() -> __close_range() ->
- * unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end
- * up having a 'max_fds' value that isn't already aligned.
- *
- * Rather than make close_range() have to worry about this,
- * just make that BITS_PER_LONG alignment be part of a sane
- * fdtable size. Becuase that's really what it is.
+ * punch_hole is optional - when close_range() is asked to unshare
+ * and close, we don't need to copy descriptors in that range, so
+ * a smaller cloned descriptor table might suffice if the last
+ * currently opened descriptor falls into that range.
*/
-static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
+static unsigned int sane_fdtable_size(struct fdtable *fdt, struct fd_range *punch_hole)
{
- unsigned int count;
-
- count = count_open_files(fdt);
- if (max_fds < NR_OPEN_DEFAULT)
- max_fds = NR_OPEN_DEFAULT;
- return ALIGN(min(count, max_fds), BITS_PER_LONG);
+ unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds);
+
+ if (last == fdt->max_fds)
+ return NR_OPEN_DEFAULT;
+ if (punch_hole && punch_hole->to >= last && punch_hole->from <= last) {
+ last = find_last_bit(fdt->open_fds, punch_hole->from);
+ if (last == punch_hole->from)
+ return NR_OPEN_DEFAULT;
+ }
+ return ALIGN(last + 1, BITS_PER_LONG);
}
/*
- * Allocate a new files structure and copy contents from the
- * passed in files structure.
- * errorp will be valid only when the returned files_struct is NULL.
+ * Allocate a new descriptor table and copy contents from the passed in
+ * instance. Returns a pointer to cloned table on success, ERR_PTR()
+ * on failure. For 'punch_hole' see sane_fdtable_size().
*/
-struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp)
+struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_hole)
{
struct files_struct *newf;
struct file **old_fds, **new_fds;
unsigned int open_files, i;
struct fdtable *old_fdt, *new_fdt;
- *errorp = -ENOMEM;
newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
if (!newf)
- goto out;
+ return ERR_PTR(-ENOMEM);
atomic_set(&newf->count, 1);
@@ -340,7 +391,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
spin_lock(&oldf->file_lock);
old_fdt = files_fdtable(oldf);
- open_files = sane_fdtable_size(old_fdt, max_fds);
+ open_files = sane_fdtable_size(old_fdt, punch_hole);
/*
* Check whether we need to allocate a larger fd array and fd set.
@@ -351,17 +402,10 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
if (new_fdt != &newf->fdtab)
__free_fdtable(new_fdt);
- new_fdt = alloc_fdtable(open_files - 1);
- if (!new_fdt) {
- *errorp = -ENOMEM;
- goto out_release;
- }
-
- /* beyond sysctl_nr_open; nothing to do */
- if (unlikely(new_fdt->max_fds < open_files)) {
- __free_fdtable(new_fdt);
- *errorp = -EMFILE;
- goto out_release;
+ new_fdt = alloc_fdtable(open_files);
+ if (IS_ERR(new_fdt)) {
+ kmem_cache_free(files_cachep, newf);
+ return ERR_CAST(new_fdt);
}
/*
@@ -371,25 +415,33 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
*/
spin_lock(&oldf->file_lock);
old_fdt = files_fdtable(oldf);
- open_files = sane_fdtable_size(old_fdt, max_fds);
+ open_files = sane_fdtable_size(old_fdt, punch_hole);
}
- copy_fd_bitmaps(new_fdt, old_fdt, open_files);
+ copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG);
old_fds = old_fdt->fd;
new_fds = new_fdt->fd;
+ /*
+ * We may be racing against fd allocation from other threads using this
+ * files_struct, despite holding ->file_lock.
+ *
+ * alloc_fd() might have already claimed a slot, while fd_install()
+ * did not populate it yet. Note the latter operates locklessly, so
+ * the file can show up as we are walking the array below.
+ *
+ * At the same time we know no files will disappear as all other
+ * operations take the lock.
+ *
+ * Instead of trying to placate userspace racing with itself, we
+ * ref the file if we see it and mark the fd slot as unused otherwise.
+ */
for (i = open_files; i != 0; i--) {
- struct file *f = *old_fds++;
+ struct file *f = rcu_dereference_raw(*old_fds++);
if (f) {
get_file(f);
} else {
- /*
- * The fd may be claimed in the fd bitmap but not yet
- * instantiated in the files array if a sibling thread
- * is partway through open(). So make sure that this
- * fd is available to the new process.
- */
__clear_open_fd(open_files - i, new_fdt);
}
rcu_assign_pointer(*new_fds++, f);
@@ -402,11 +454,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
rcu_assign_pointer(newf->fdt, new_fdt);
return newf;
-
-out_release:
- kmem_cache_free(files_cachep, newf);
-out:
- return NULL;
}
static struct fdtable *close_files(struct files_struct * files)
@@ -427,7 +474,7 @@ static struct fdtable *close_files(struct files_struct * files)
set = fdt->open_fds[j++];
while (set) {
if (set & 1) {
- struct file * file = xchg(&fdt->fd[i], NULL);
+ struct file *file = fdt->fd[i];
if (file) {
filp_close(file, files);
cond_resched();
@@ -481,12 +528,21 @@ struct files_struct init_files = {
static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
{
- unsigned int maxfd = fdt->max_fds;
+ unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */
unsigned int maxbit = maxfd / BITS_PER_LONG;
unsigned int bitbit = start / BITS_PER_LONG;
+ unsigned int bit;
+
+ /*
+ * Try to avoid looking at the second level bitmap
+ */
+ bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG,
+ start & (BITS_PER_LONG - 1));
+ if (bit < BITS_PER_LONG)
+ return bit + bitbit * BITS_PER_LONG;
bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
- if (bitbit > maxfd)
+ if (bitbit >= maxfd)
return maxfd;
if (bitbit > start)
start = bitbit;
@@ -510,7 +566,7 @@ repeat:
if (fd < files->next_fd)
fd = files->next_fd;
- if (fd < fdt->max_fds)
+ if (likely(fd < fdt->max_fds))
fd = find_next_fd(fdt, fd);
/*
@@ -518,36 +574,23 @@ repeat:
* will limit the total number of files that can be opened.
*/
error = -EMFILE;
- if (fd >= end)
+ if (unlikely(fd >= end))
goto out;
- error = expand_files(files, fd);
- if (error < 0)
- goto out;
+ if (unlikely(fd >= fdt->max_fds)) {
+ error = expand_files(files, fd);
+ if (error < 0)
+ goto out;
- /*
- * If we needed to expand the fs array we
- * might have blocked - try again.
- */
- if (error)
goto repeat;
+ }
if (start <= files->next_fd)
files->next_fd = fd + 1;
- __set_open_fd(fd, fdt);
- if (flags & O_CLOEXEC)
- __set_close_on_exec(fd, fdt);
- else
- __clear_close_on_exec(fd, fdt);
+ __set_open_fd(fd, fdt, flags & O_CLOEXEC);
error = fd;
-#if 1
- /* Sanity check */
- if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
- printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
- rcu_assign_pointer(fdt->fd[fd], NULL);
- }
-#endif
+ VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
out:
spin_unlock(&files->file_lock);
@@ -583,22 +626,14 @@ void put_unused_fd(unsigned int fd)
EXPORT_SYMBOL(put_unused_fd);
-/*
- * Install a file pointer in the fd array.
- *
- * The VFS is full of places where we drop the files lock between
- * setting the open_fds bitmap and installing the file in the file
- * array. At any such point, we are vulnerable to a dup2() race
- * installing a file in the array before us. We need to detect this and
- * fput() the struct file we are about to overwrite in this case.
- *
- * It should never happen - if we allow dup2() do it, _really_ bad things
- * will follow.
+/**
+ * fd_install - install a file pointer in the fd array
+ * @fd: file descriptor to install the file in
+ * @file: the file to install
*
* This consumes the "file" refcount, so callers should treat it
* as if they had called fput(file).
*/
-
void fd_install(unsigned int fd, struct file *file)
{
struct files_struct *files = current->files;
@@ -613,7 +648,7 @@ void fd_install(unsigned int fd, struct file *file)
rcu_read_unlock_sched();
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
- BUG_ON(fdt->fd[fd] != NULL);
+ VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
rcu_assign_pointer(fdt->fd[fd], file);
spin_unlock(&files->file_lock);
return;
@@ -621,7 +656,7 @@ void fd_install(unsigned int fd, struct file *file)
/* coupled with smp_wmb() in expand_fdtable() */
smp_rmb();
fdt = rcu_dereference_sched(files->fdt);
- BUG_ON(fdt->fd[fd] != NULL);
+ VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
rcu_assign_pointer(fdt->fd[fd], file);
rcu_read_unlock_sched();
}
@@ -650,7 +685,7 @@ struct file *file_close_fd_locked(struct files_struct *files, unsigned fd)
return NULL;
fd = array_index_nospec(fd, fdt->max_fds);
- file = fdt->fd[fd];
+ file = rcu_dereference_raw(fdt->fd[fd]);
if (file) {
rcu_assign_pointer(fdt->fd[fd], NULL);
__put_unused_fd(files, fd);
@@ -671,7 +706,7 @@ int close_fd(unsigned fd)
return filp_close(file, files);
}
-EXPORT_SYMBOL(close_fd); /* for ksys_close() */
+EXPORT_SYMBOL(close_fd);
/**
* last_fd - return last valid index into fd table
@@ -727,7 +762,7 @@ static inline void __range_close(struct files_struct *files, unsigned int fd,
}
/**
- * __close_range() - Close all file descriptors in a given range.
+ * sys_close_range() - Close all file descriptors in a given range.
*
* @fd: starting file descriptor to close
* @max_fd: last file descriptor to close
@@ -735,8 +770,10 @@ static inline void __range_close(struct files_struct *files, unsigned int fd,
*
* This closes a range of file descriptors. All file descriptors
* from @fd up to and including @max_fd are closed.
+ * Currently, errors to close a given file descriptor are ignored.
*/
-int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
+SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
+ unsigned int, flags)
{
struct task_struct *me = current;
struct files_struct *cur_fds = me->files, *fds = NULL;
@@ -747,37 +784,25 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
if (fd > max_fd)
return -EINVAL;
- if (flags & CLOSE_RANGE_UNSHARE) {
- int ret;
- unsigned int max_unshare_fds = NR_OPEN_MAX;
+ if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) {
+ struct fd_range range = {fd, max_fd}, *punch_hole = &range;
/*
* If the caller requested all fds to be made cloexec we always
* copy all of the file descriptors since they still want to
* use them.
*/
- if (!(flags & CLOSE_RANGE_CLOEXEC)) {
- /*
- * If the requested range is greater than the current
- * maximum, we're closing everything so only copy all
- * file descriptors beneath the lowest file descriptor.
- */
- rcu_read_lock();
- if (max_fd >= last_fd(files_fdtable(cur_fds)))
- max_unshare_fds = fd;
- rcu_read_unlock();
- }
-
- ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds);
- if (ret)
- return ret;
+ if (flags & CLOSE_RANGE_CLOEXEC)
+ punch_hole = NULL;
+ fds = dup_fd(cur_fds, punch_hole);
+ if (IS_ERR(fds))
+ return PTR_ERR(fds);
/*
* We used to share our file descriptor table, and have now
* created a private one, make sure we're using it below.
*/
- if (fds)
- swap(cur_fds, fds);
+ swap(cur_fds, fds);
}
if (flags & CLOSE_RANGE_CLOEXEC)
@@ -865,7 +890,7 @@ static struct file *__get_file_rcu(struct file __rcu **f)
if (!file)
return NULL;
- if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
+ if (unlikely(!file_ref_get(&file->f_ref)))
return ERR_PTR(-EAGAIN);
file_reloaded = rcu_dereference_raw(*f);
@@ -879,8 +904,8 @@ static struct file *__get_file_rcu(struct file __rcu **f)
OPTIMIZER_HIDE_VAR(file_reloaded_cmp);
/*
- * atomic_long_inc_not_zero() above provided a full memory
- * barrier when we acquired a reference.
+ * file_ref_get() above provided a full memory barrier when we
+ * acquired a reference.
*
* This is paired with the write barrier from assigning to the
* __rcu protected file pointer so that if that pointer still
@@ -915,13 +940,8 @@ struct file *get_file_rcu(struct file __rcu **f)
struct file __rcu *file;
file = __get_file_rcu(f);
- if (unlikely(!file))
- return NULL;
-
- if (unlikely(IS_ERR(file)))
- continue;
-
- return file;
+ if (!IS_ERR(file))
+ return file;
}
}
EXPORT_SYMBOL_GPL(get_file_rcu);
@@ -983,11 +1003,11 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
* We need to confirm it by incrementing the refcount
* and then check the lookup again.
*
- * atomic_long_inc_not_zero() gives us a full memory
- * barrier. We only really need an 'acquire' one to
- * protect the loads below, but we don't have that.
+ * file_ref_get() gives us a full memory barrier. We
+ * only really need an 'acquire' one to protect the
+ * loads below, but we don't have that.
*/
- if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
+ if (unlikely(!file_ref_get(&file->f_ref)))
continue;
/*
@@ -1068,29 +1088,7 @@ struct file *fget_task(struct task_struct *task, unsigned int fd)
return file;
}
-struct file *lookup_fdget_rcu(unsigned int fd)
-{
- return __fget_files_rcu(current->files, fd, 0);
-
-}
-EXPORT_SYMBOL_GPL(lookup_fdget_rcu);
-
-struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd)
-{
- /* Must be called with rcu_read_lock held */
- struct files_struct *files;
- struct file *file = NULL;
-
- task_lock(task);
- files = task->files;
- if (files)
- file = __fget_files_rcu(files, fd, 0);
- task_unlock(task);
-
- return file;
-}
-
-struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd)
+struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd)
{
/* Must be called with rcu_read_lock held */
struct files_struct *files;
@@ -1100,17 +1098,19 @@ struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *
task_lock(task);
files = task->files;
if (files) {
+ rcu_read_lock();
for (; fd < files_fdtable(files)->max_fds; fd++) {
file = __fget_files_rcu(files, fd, 0);
if (file)
break;
}
+ rcu_read_unlock();
}
task_unlock(task);
*ret_fd = fd;
return file;
}
-EXPORT_SYMBOL(task_lookup_next_fdget_rcu);
+EXPORT_SYMBOL(fget_task_next);
/*
* Lightweight file lookup - no refcnt increment if fd table isn't shared.
@@ -1127,8 +1127,15 @@ EXPORT_SYMBOL(task_lookup_next_fdget_rcu);
*
* The fput_needed flag returned by fget_light should be passed to the
* corresponding fput_light.
+ *
+ * (As an exception to rule 2, you can call filp_close between fget_light and
+ * fput_light provided that you capture a real refcount with get_file before
+ * the call to filp_close, and ensure that this real refcount is fput *after*
+ * the fput_light call.)
+ *
+ * See also the documentation in rust/kernel/file.rs.
*/
-static unsigned long __fget_light(unsigned int fd, fmode_t mask)
+static inline struct fd __fget_light(unsigned int fd, fmode_t mask)
{
struct files_struct *files = current->files;
struct file *file;
@@ -1145,22 +1152,22 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask)
if (likely(atomic_read_acquire(&files->count) == 1)) {
file = files_lookup_fd_raw(files, fd);
if (!file || unlikely(file->f_mode & mask))
- return 0;
- return (unsigned long)file;
+ return EMPTY_FD;
+ return BORROWED_FD(file);
} else {
file = __fget_files(files, fd, mask);
if (!file)
- return 0;
- return FDPUT_FPUT | (unsigned long)file;
+ return EMPTY_FD;
+ return CLONED_FD(file);
}
}
-unsigned long __fdget(unsigned int fd)
+struct fd fdget(unsigned int fd)
{
return __fget_light(fd, FMODE_PATH);
}
-EXPORT_SYMBOL(__fdget);
+EXPORT_SYMBOL(fdget);
-unsigned long __fdget_raw(unsigned int fd)
+struct fd fdget_raw(unsigned int fd)
{
return __fget_light(fd, 0);
}
@@ -1177,20 +1184,35 @@ unsigned long __fdget_raw(unsigned int fd)
*/
static inline bool file_needs_f_pos_lock(struct file *file)
{
- return (file->f_mode & FMODE_ATOMIC_POS) &&
- (file_count(file) > 1 || file->f_op->iterate_shared);
+ if (!(file->f_mode & FMODE_ATOMIC_POS))
+ return false;
+ if (__file_ref_read_raw(&file->f_ref) != FILE_REF_ONEREF)
+ return true;
+ if (file->f_op->iterate_shared)
+ return true;
+ return false;
+}
+
+bool file_seek_cur_needs_f_lock(struct file *file)
+{
+ if (!(file->f_mode & FMODE_ATOMIC_POS) && !file->f_op->iterate_shared)
+ return false;
+
+ VFS_WARN_ON_ONCE((file_count(file) > 1) &&
+ !mutex_is_locked(&file->f_pos_lock));
+ return true;
}
-unsigned long __fdget_pos(unsigned int fd)
+struct fd fdget_pos(unsigned int fd)
{
- unsigned long v = __fdget(fd);
- struct file *file = (struct file *)(v & ~3);
+ struct fd f = fdget(fd);
+ struct file *file = fd_file(f);
- if (file && file_needs_f_pos_lock(file)) {
- v |= FDPUT_POS_UNLOCK;
+ if (likely(file) && file_needs_f_pos_lock(file)) {
+ f.word |= FDPUT_POS_UNLOCK;
mutex_lock(&file->f_pos_lock);
}
- return v;
+ return f;
}
void __f_unlock_pos(struct file *f)
@@ -1207,24 +1229,16 @@ void __f_unlock_pos(struct file *f)
void set_close_on_exec(unsigned int fd, int flag)
{
struct files_struct *files = current->files;
- struct fdtable *fdt;
spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
- if (flag)
- __set_close_on_exec(fd, fdt);
- else
- __clear_close_on_exec(fd, fdt);
+ __set_close_on_exec(fd, files_fdtable(files), flag);
spin_unlock(&files->file_lock);
}
bool get_close_on_exec(unsigned int fd)
{
- struct files_struct *files = current->files;
- struct fdtable *fdt;
bool res;
rcu_read_lock();
- fdt = files_fdtable(files);
- res = close_on_exec(fd, fdt);
+ res = close_on_exec(fd, current->files);
rcu_read_unlock();
return res;
}
@@ -1237,30 +1251,39 @@ __releases(&files->file_lock)
struct fdtable *fdt;
/*
- * We need to detect attempts to do dup2() over allocated but still
- * not finished descriptor. NB: OpenBSD avoids that at the price of
- * extra work in their equivalent of fget() - they insert struct
- * file immediately after grabbing descriptor, mark it larval if
- * more work (e.g. actual opening) is needed and make sure that
- * fget() treats larval files as absent. Potentially interesting,
- * but while extra work in fget() is trivial, locking implications
- * and amount of surgery on open()-related paths in VFS are not.
- * FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
- * deadlocks in rather amusing ways, AFAICS. All of that is out of
- * scope of POSIX or SUS, since neither considers shared descriptor
- * tables and this condition does not arise without those.
+ * dup2() is expected to close the file installed in the target fd slot
+ * (if any). However, userspace hand-picking a fd may be racing against
+ * its own threads which happened to allocate it in open() et al but did
+ * not populate it yet.
+ *
+ * Broadly speaking we may be racing against the following:
+ * fd = get_unused_fd_flags(); // fd slot reserved, ->fd[fd] == NULL
+ * file = hard_work_goes_here();
+ * fd_install(fd, file); // only now ->fd[fd] == file
+ *
+ * It is an invariant that a successfully allocated fd has a NULL entry
+ * in the array until the matching fd_install().
+ *
+ * If we fit the window, we have the fd to populate, yet no target file
+ * to close. Trying to ignore it and install our new file would violate
+ * the invariant and make fd_install() overwrite our file.
+ *
+ * Things can be done(tm) to handle this. However, the issue does not
+ * concern legitimate programs and we only need to make sure the kernel
+ * does not trip over it.
+ *
+ * The simplest way out is to return an error if we find ourselves here.
+ *
+ * POSIX is silent on the issue, we return -EBUSY.
*/
fdt = files_fdtable(files);
- tofree = fdt->fd[fd];
+ fd = array_index_nospec(fd, fdt->max_fds);
+ tofree = rcu_dereference_raw(fdt->fd[fd]);
if (!tofree && fd_is_open(fd, fdt))
goto Ebusy;
get_file(file);
rcu_assign_pointer(fdt->fd[fd], file);
- __set_open_fd(fd, fdt);
- if (flags & O_CLOEXEC)
- __set_close_on_exec(fd, fdt);
- else
- __clear_close_on_exec(fd, fdt);
+ __set_open_fd(fd, fdt, flags & O_CLOEXEC);
spin_unlock(&files->file_lock);
if (tofree)