diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-11-18 10:30:29 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-11-18 10:30:29 -0800 |
commit | 4c797b11a88297b9b0010b2c6645b191bac2350c (patch) | |
tree | 3eca58d86d27164682ee732cfb49e1404b315584 /include/linux/file_ref.h | |
parent | 8dcf44fcad5ef5c1ff915628255c19cbe91f2588 (diff) | |
parent | aab154a442f9ba2a08fc130dbc8d178a33e10345 (diff) |
Merge tag 'vfs-6.13.file' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs file updates from Christian Brauner:
"This contains changes the changes for files for this cycle:
- Introduce a new reference counting mechanism for files.
As atomic_inc_not_zero() is implemented with a try_cmpxchg() loop
it has O(N^2) behaviour under contention with N concurrent
operations and it is in a hot path in __fget_files_rcu().
The rcuref infrastructures remedies this problem by using an
unconditional increment relying on safe- and dead zones to make
this work and requiring rcu protection for the data structure in
question. This not just scales better it also introduces overflow
protection.
However, in contrast to generic rcuref, files require a memory
barrier and thus cannot rely on *_relaxed() atomic operations and
also require to be built on atomic_long_t as having massive amounts
of reference isn't unheard of even if it is just an attack.
This adds a file specific variant instead of making this a generic
library.
This has been tested by various people and it gives consistent
improvement up to 3-5% on workloads with loads of threads.
- Add a fastpath for find_next_zero_bit(). Skip 2-levels searching
via find_next_zero_bit() when there is a free slot in the word that
contains the next fd. This improves pts/blogbench-1.1.0 read by 8%
and write by 4% on Intel ICX 160.
- Conditionally clear full_fds_bits since it's very likely that a bit
in full_fds_bits has been cleared during __clear_open_fds(). This
improves pts/blogbench-1.1.0 read up to 13%, and write up to 5% on
Intel ICX 160.
- Get rid of all lookup_*_fdget_rcu() variants. They were used to
lookup files without taking a reference count. That became invalid
once files were switched to SLAB_TYPESAFE_BY_RCU and now we're
always taking a reference count. Switch to an already existing
helper and remove the legacy variants.
- Remove pointless includes of <linux/fdtable.h>.
- Avoid cmpxchg() in close_files() as nobody else has a reference to
the files_struct at that point.
- Move close_range() into fs/file.c and fold __close_range() into it.
- Cleanup calling conventions of alloc_fdtable() and expand_files().
- Merge __{set,clear}_close_on_exec() into one.
- Make __set_open_fd() set cloexec as well instead of doing it in two
separate steps"
* tag 'vfs-6.13.file' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
selftests: add file SLAB_TYPESAFE_BY_RCU recycling stressor
fs: port files to file_ref
fs: add file_ref
expand_files(): simplify calling conventions
make __set_open_fd() set cloexec state as well
fs: protect backing files with rcu
file.c: merge __{set,clear}_close_on_exec()
alloc_fdtable(): change calling conventions.
fs/file.c: add fast path in find_next_fd()
fs/file.c: conditionally clear full_fds
fs/file.c: remove sanity_check and add likely/unlikely in alloc_fd()
move close_range(2) into fs/file.c, fold __close_range() into it
close_files(): don't bother with xchg()
remove pointless includes of <linux/fdtable.h>
get rid of ...lookup...fdget_rcu() family
Diffstat (limited to 'include/linux/file_ref.h')
-rw-r--r-- | include/linux/file_ref.h | 177 |
1 files changed, 177 insertions, 0 deletions
diff --git a/include/linux/file_ref.h b/include/linux/file_ref.h new file mode 100644 index 000000000000..9b3a8d9b17ab --- /dev/null +++ b/include/linux/file_ref.h @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _LINUX_FILE_REF_H +#define _LINUX_FILE_REF_H + +#include <linux/atomic.h> +#include <linux/preempt.h> +#include <linux/types.h> + +/* + * file_ref is a reference count implementation specifically for use by + * files. It takes inspiration from rcuref but differs in key aspects + * such as support for SLAB_TYPESAFE_BY_RCU type caches. + * + * FILE_REF_ONEREF FILE_REF_MAXREF + * 0x0000000000000000UL 0x7FFFFFFFFFFFFFFFUL + * <-------------------valid -------------------> + * + * FILE_REF_SATURATED + * 0x8000000000000000UL 0xA000000000000000UL 0xBFFFFFFFFFFFFFFFUL + * <-----------------------saturation zone----------------------> + * + * FILE_REF_RELEASED FILE_REF_DEAD + * 0xC000000000000000UL 0xE000000000000000UL + * <-------------------dead zone-------------------> + * + * FILE_REF_NOREF + * 0xFFFFFFFFFFFFFFFFUL + */ + +#ifdef CONFIG_64BIT +#define FILE_REF_ONEREF 0x0000000000000000UL +#define FILE_REF_MAXREF 0x7FFFFFFFFFFFFFFFUL +#define FILE_REF_SATURATED 0xA000000000000000UL +#define FILE_REF_RELEASED 0xC000000000000000UL +#define FILE_REF_DEAD 0xE000000000000000UL +#define FILE_REF_NOREF 0xFFFFFFFFFFFFFFFFUL +#else +#define FILE_REF_ONEREF 0x00000000U +#define FILE_REF_MAXREF 0x7FFFFFFFU +#define FILE_REF_SATURATED 0xA0000000U +#define FILE_REF_RELEASED 0xC0000000U +#define FILE_REF_DEAD 0xE0000000U +#define FILE_REF_NOREF 0xFFFFFFFFU +#endif + +typedef struct { +#ifdef CONFIG_64BIT + atomic64_t refcnt; +#else + atomic_t refcnt; +#endif +} file_ref_t; + +/** + * file_ref_init - Initialize a file reference count + * @ref: Pointer to the reference count + * @cnt: The initial reference count typically '1' + */ +static inline void file_ref_init(file_ref_t *ref, unsigned long cnt) +{ + atomic_long_set(&ref->refcnt, cnt - 1); +} + +bool __file_ref_put(file_ref_t *ref, unsigned long cnt); + +/** + * file_ref_get - Acquire one reference on a file + * @ref: Pointer to the reference count + * + * Similar to atomic_inc_not_zero() but saturates at FILE_REF_MAXREF. + * + * Provides full memory ordering. + * + * Return: False if the attempt to acquire a reference failed. This happens + * when the last reference has been put already. True if a reference + * was successfully acquired + */ +static __always_inline __must_check bool file_ref_get(file_ref_t *ref) +{ + /* + * Unconditionally increase the reference count with full + * ordering. The saturation and dead zones provide enough + * tolerance for this. + * + * If this indicates negative the file in question the fail can + * be freed and immediately reused due to SLAB_TYPSAFE_BY_RCU. + * Hence, unconditionally altering the file reference count to + * e.g., reset the file reference count back to the middle of + * the deadzone risk end up marking someone else's file as dead + * behind their back. + * + * It would be possible to do a careful: + * + * cnt = atomic_long_inc_return(); + * if (likely(cnt >= 0)) + * return true; + * + * and then something like: + * + * if (cnt >= FILE_REF_RELEASE) + * atomic_long_try_cmpxchg(&ref->refcnt, &cnt, FILE_REF_DEAD), + * + * to set the value back to the middle of the deadzone. But it's + * practically impossible to go from FILE_REF_DEAD to + * FILE_REF_ONEREF. It would need 2305843009213693952/2^61 + * file_ref_get()s to resurrect such a dead file. + */ + return !atomic_long_add_negative(1, &ref->refcnt); +} + +/** + * file_ref_inc - Acquire one reference on a file + * @ref: Pointer to the reference count + * + * Acquire an additional reference on a file. Warns if the caller didn't + * already hold a reference. + */ +static __always_inline void file_ref_inc(file_ref_t *ref) +{ + long prior = atomic_long_fetch_inc_relaxed(&ref->refcnt); + WARN_ONCE(prior < 0, "file_ref_inc() on a released file reference"); +} + +/** + * file_ref_put -- Release a file reference + * @ref: Pointer to the reference count + * + * Provides release memory ordering, such that prior loads and stores + * are done before, and provides an acquire ordering on success such + * that free() must come after. + * + * Return: True if this was the last reference with no future references + * possible. This signals the caller that it can safely release + * the object which is protected by the reference counter. + * False if there are still active references or the put() raced + * with a concurrent get()/put() pair. Caller is not allowed to + * release the protected object. + */ +static __always_inline __must_check bool file_ref_put(file_ref_t *ref) +{ + long cnt; + + /* + * While files are SLAB_TYPESAFE_BY_RCU and thus file_ref_put() + * calls don't risk UAFs when a file is recyclyed, it is still + * vulnerable to UAFs caused by freeing the whole slab page once + * it becomes unused. Prevent file_ref_put() from being + * preempted protects against this. + */ + guard(preempt)(); + /* + * Unconditionally decrease the reference count. The saturation + * and dead zones provide enough tolerance for this. If this + * fails then we need to handle the last reference drop and + * cases inside the saturation and dead zones. + */ + cnt = atomic_long_dec_return(&ref->refcnt); + if (cnt >= 0) + return false; + return __file_ref_put(ref, cnt); +} + +/** + * file_ref_read - Read the number of file references + * @ref: Pointer to the reference count + * + * Return: The number of held references (0 ... N) + */ +static inline unsigned long file_ref_read(file_ref_t *ref) +{ + unsigned long c = atomic_long_read(&ref->refcnt); + + /* Return 0 if within the DEAD zone. */ + return c >= FILE_REF_RELEASED ? 0 : c + 1; +} + +#endif |