diff options
Diffstat (limited to 'lib/vdso')
| -rw-r--r-- | lib/vdso/Kconfig | 27 | ||||
| -rw-r--r-- | lib/vdso/Makefile | 3 | ||||
| -rw-r--r-- | lib/vdso/Makefile.include | 18 | ||||
| -rw-r--r-- | lib/vdso/datastore.c | 130 | ||||
| -rw-r--r-- | lib/vdso/getrandom.c | 264 | ||||
| -rw-r--r-- | lib/vdso/gettimeofday.c | 513 |
6 files changed, 955 insertions, 0 deletions
diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig new file mode 100644 index 000000000000..db87ba34ef19 --- /dev/null +++ b/lib/vdso/Kconfig @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: GPL-2.0 + +config HAVE_GENERIC_VDSO + bool + +if HAVE_GENERIC_VDSO + +config GENERIC_GETTIMEOFDAY + bool + help + This is a generic implementation of gettimeofday vdso. + Each architecture that enables this feature has to + provide the fallback implementation. + +config GENERIC_VDSO_OVERFLOW_PROTECT + bool + help + Select to add multiplication overflow protection to the VDSO + time getter functions for the price of an extra conditional + in the hotpath. + +config VDSO_GETRANDOM + bool + help + Selected by architectures that support vDSO getrandom(). + +endif diff --git a/lib/vdso/Makefile b/lib/vdso/Makefile new file mode 100644 index 000000000000..405f743253d7 --- /dev/null +++ b/lib/vdso/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_HAVE_GENERIC_VDSO) += datastore.o diff --git a/lib/vdso/Makefile.include b/lib/vdso/Makefile.include new file mode 100644 index 000000000000..cedbf15f8087 --- /dev/null +++ b/lib/vdso/Makefile.include @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0 + +GENERIC_VDSO_MK_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) +GENERIC_VDSO_DIR := $(dir $(GENERIC_VDSO_MK_PATH)) + +c-gettimeofday-$(CONFIG_GENERIC_GETTIMEOFDAY) := $(addprefix $(GENERIC_VDSO_DIR), gettimeofday.c) +c-getrandom-$(CONFIG_VDSO_GETRANDOM) := $(addprefix $(GENERIC_VDSO_DIR), getrandom.c) + +# This cmd checks that the vdso library does not contain dynamic relocations. +# It has to be called after the linking of the vdso library and requires it +# as a parameter. +# +# As a workaround for some GNU ld ports which produce unneeded R_*_NONE +# dynamic relocations, ignore R_*_NONE. +quiet_cmd_vdso_check = VDSOCHK $@ + cmd_vdso_check = if $(READELF) -rW $@ | grep -v _NONE | grep -q " R_\w*_"; \ + then (echo >&2 "$@: dynamic relocations are not supported"; \ + rm -f $@; /bin/false); fi diff --git a/lib/vdso/datastore.c b/lib/vdso/datastore.c new file mode 100644 index 000000000000..a565c30c71a0 --- /dev/null +++ b/lib/vdso/datastore.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/linkage.h> +#include <linux/mmap_lock.h> +#include <linux/mm.h> +#include <linux/time_namespace.h> +#include <linux/types.h> +#include <linux/vdso_datastore.h> +#include <vdso/datapage.h> + +/* + * The vDSO data page. + */ +#ifdef CONFIG_GENERIC_GETTIMEOFDAY +static union { + struct vdso_time_data data; + u8 page[PAGE_SIZE]; +} vdso_time_data_store __page_aligned_data; +struct vdso_time_data *vdso_k_time_data = &vdso_time_data_store.data; +static_assert(sizeof(vdso_time_data_store) == PAGE_SIZE); +#endif /* CONFIG_GENERIC_GETTIMEOFDAY */ + +#ifdef CONFIG_VDSO_GETRANDOM +static union { + struct vdso_rng_data data; + u8 page[PAGE_SIZE]; +} vdso_rng_data_store __page_aligned_data; +struct vdso_rng_data *vdso_k_rng_data = &vdso_rng_data_store.data; +static_assert(sizeof(vdso_rng_data_store) == PAGE_SIZE); +#endif /* CONFIG_VDSO_GETRANDOM */ + +#ifdef CONFIG_ARCH_HAS_VDSO_ARCH_DATA +static union { + struct vdso_arch_data data; + u8 page[VDSO_ARCH_DATA_SIZE]; +} vdso_arch_data_store __page_aligned_data; +struct vdso_arch_data *vdso_k_arch_data = &vdso_arch_data_store.data; +#endif /* CONFIG_ARCH_HAS_VDSO_ARCH_DATA */ + +static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *timens_page = find_timens_vvar_page(vma); + unsigned long addr, pfn; + vm_fault_t err; + + switch (vmf->pgoff) { + case VDSO_TIME_PAGE_OFFSET: + if (!IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)) + return VM_FAULT_SIGBUS; + pfn = __phys_to_pfn(__pa_symbol(vdso_k_time_data)); + if (timens_page) { + /* + * Fault in VVAR page too, since it will be accessed + * to get clock data anyway. + */ + addr = vmf->address + VDSO_TIMENS_PAGE_OFFSET * PAGE_SIZE; + err = vmf_insert_pfn(vma, addr, pfn); + if (unlikely(err & VM_FAULT_ERROR)) + return err; + pfn = page_to_pfn(timens_page); + } + break; + case VDSO_TIMENS_PAGE_OFFSET: + /* + * If a task belongs to a time namespace then a namespace + * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and + * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET + * offset. + * See also the comment near timens_setup_vdso_data(). + */ + if (!IS_ENABLED(CONFIG_TIME_NS) || !timens_page) + return VM_FAULT_SIGBUS; + pfn = __phys_to_pfn(__pa_symbol(vdso_k_time_data)); + break; + case VDSO_RNG_PAGE_OFFSET: + if (!IS_ENABLED(CONFIG_VDSO_GETRANDOM)) + return VM_FAULT_SIGBUS; + pfn = __phys_to_pfn(__pa_symbol(vdso_k_rng_data)); + break; + case VDSO_ARCH_PAGES_START ... VDSO_ARCH_PAGES_END: + if (!IS_ENABLED(CONFIG_ARCH_HAS_VDSO_ARCH_DATA)) + return VM_FAULT_SIGBUS; + pfn = __phys_to_pfn(__pa_symbol(vdso_k_arch_data)) + + vmf->pgoff - VDSO_ARCH_PAGES_START; + break; + default: + return VM_FAULT_SIGBUS; + } + + return vmf_insert_pfn(vma, vmf->address, pfn); +} + +const struct vm_special_mapping vdso_vvar_mapping = { + .name = "[vvar]", + .fault = vvar_fault, +}; + +struct vm_area_struct *vdso_install_vvar_mapping(struct mm_struct *mm, unsigned long addr) +{ + return _install_special_mapping(mm, addr, VDSO_NR_PAGES * PAGE_SIZE, + VM_READ | VM_MAYREAD | VM_IO | VM_DONTDUMP | + VM_PFNMAP | VM_SEALED_SYSMAP, + &vdso_vvar_mapping); +} + +#ifdef CONFIG_TIME_NS +/* + * The vvar page layout depends on whether a task belongs to the root or + * non-root time namespace. Whenever a task changes its namespace, the VVAR + * page tables are cleared and then they will be re-faulted with a + * corresponding layout. + * See also the comment near timens_setup_vdso_clock_data() for details. + */ +int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) +{ + struct mm_struct *mm = task->mm; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); + + mmap_read_lock(mm); + for_each_vma(vmi, vma) { + if (vma_is_special_mapping(vma, &vdso_vvar_mapping)) + zap_vma_pages(vma); + } + mmap_read_unlock(mm); + + return 0; +} +#endif diff --git a/lib/vdso/getrandom.c b/lib/vdso/getrandom.c new file mode 100644 index 000000000000..440f8a6203a6 --- /dev/null +++ b/lib/vdso/getrandom.c @@ -0,0 +1,264 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + +#include <linux/array_size.h> +#include <linux/minmax.h> +#include <vdso/datapage.h> +#include <vdso/getrandom.h> +#include <vdso/unaligned.h> +#include <asm/vdso/getrandom.h> +#include <uapi/linux/mman.h> +#include <uapi/linux/random.h> + +/* Bring in default accessors */ +#include <vdso/vsyscall.h> + +#undef PAGE_SIZE +#undef PAGE_MASK +#define PAGE_SIZE (1UL << CONFIG_PAGE_SHIFT) +#define PAGE_MASK (~(PAGE_SIZE - 1)) + +#define MEMCPY_AND_ZERO_SRC(type, dst, src, len) do { \ + while (len >= sizeof(type)) { \ + __put_unaligned_t(type, __get_unaligned_t(type, src), dst); \ + __put_unaligned_t(type, 0, src); \ + dst += sizeof(type); \ + src += sizeof(type); \ + len -= sizeof(type); \ + } \ +} while (0) + +static void memcpy_and_zero_src(void *dst, void *src, size_t len) +{ + if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) { + if (IS_ENABLED(CONFIG_64BIT)) + MEMCPY_AND_ZERO_SRC(u64, dst, src, len); + MEMCPY_AND_ZERO_SRC(u32, dst, src, len); + MEMCPY_AND_ZERO_SRC(u16, dst, src, len); + } + MEMCPY_AND_ZERO_SRC(u8, dst, src, len); +} + +/** + * __cvdso_getrandom_data - Generic vDSO implementation of getrandom() syscall. + * @rng_info: Describes state of kernel RNG, memory shared with kernel. + * @buffer: Destination buffer to fill with random bytes. + * @len: Size of @buffer in bytes. + * @flags: Zero or more GRND_* flags. + * @opaque_state: Pointer to an opaque state area. + * @opaque_len: Length of opaque state area. + * + * This implements a "fast key erasure" RNG using ChaCha20, in the same way that the kernel's + * getrandom() syscall does. It periodically reseeds its key from the kernel's RNG, at the same + * schedule that the kernel's RNG is reseeded. If the kernel's RNG is not ready, then this always + * calls into the syscall. + * + * If @buffer, @len, and @flags are 0, and @opaque_len is ~0UL, then @opaque_state is populated + * with a struct vgetrandom_opaque_params and the function returns 0; if it does not return 0, + * this function should not be used. + * + * @opaque_state *must* be allocated by calling mmap(2) using the mmap_prot and mmap_flags fields + * from the struct vgetrandom_opaque_params, and states must not straddle pages. Unless external + * locking is used, one state must be allocated per thread, as it is not safe to call this function + * concurrently with the same @opaque_state. However, it is safe to call this using the same + * @opaque_state that is shared between main code and signal handling code, within the same thread. + * + * Returns: The number of random bytes written to @buffer, or a negative value indicating an error. + */ +static __always_inline ssize_t +__cvdso_getrandom_data(const struct vdso_rng_data *rng_info, void *buffer, size_t len, + unsigned int flags, void *opaque_state, size_t opaque_len) +{ + ssize_t ret = min_t(size_t, INT_MAX & PAGE_MASK /* = MAX_RW_COUNT */, len); + struct vgetrandom_state *state = opaque_state; + size_t batch_len, nblocks, orig_len = len; + bool in_use, have_retried = false; + void *orig_buffer = buffer; + u64 current_generation; + u32 counter[2] = { 0 }; + + if (unlikely(opaque_len == ~0UL && !buffer && !len && !flags)) { + struct vgetrandom_opaque_params *params = opaque_state; + params->size_of_opaque_state = sizeof(*state); + params->mmap_prot = PROT_READ | PROT_WRITE; + params->mmap_flags = MAP_DROPPABLE | MAP_ANONYMOUS; + for (size_t i = 0; i < ARRAY_SIZE(params->reserved); ++i) + params->reserved[i] = 0; + return 0; + } + + /* The state must not straddle a page, since pages can be zeroed at any time. */ + if (unlikely(((unsigned long)opaque_state & ~PAGE_MASK) + sizeof(*state) > PAGE_SIZE)) + return -EFAULT; + + /* Handle unexpected flags by falling back to the kernel. */ + if (unlikely(flags & ~(GRND_NONBLOCK | GRND_RANDOM | GRND_INSECURE))) + goto fallback_syscall; + + /* If the caller passes the wrong size, which might happen due to CRIU, fallback. */ + if (unlikely(opaque_len != sizeof(*state))) + goto fallback_syscall; + + /* + * If the kernel's RNG is not yet ready, then it's not possible to provide random bytes from + * userspace, because A) the various @flags require this to block, or not, depending on + * various factors unavailable to userspace, and B) the kernel's behavior before the RNG is + * ready is to reseed from the entropy pool at every invocation. + */ + if (unlikely(!READ_ONCE(rng_info->is_ready))) + goto fallback_syscall; + + /* + * This condition is checked after @rng_info->is_ready, because before the kernel's RNG is + * initialized, the @flags parameter may require this to block or return an error, even when + * len is zero. + */ + if (unlikely(!len)) + return 0; + + /* + * @state->in_use is basic reentrancy protection against this running in a signal handler + * with the same @opaque_state, but obviously not atomic wrt multiple CPUs or more than one + * level of reentrancy. If a signal interrupts this after reading @state->in_use, but before + * writing @state->in_use, there is still no race, because the signal handler will run to + * its completion before returning execution. + */ + in_use = READ_ONCE(state->in_use); + if (unlikely(in_use)) + /* The syscall simply fills the buffer and does not touch @state, so fallback. */ + goto fallback_syscall; + WRITE_ONCE(state->in_use, true); + +retry_generation: + /* + * @rng_info->generation must always be read here, as it serializes @state->key with the + * kernel's RNG reseeding schedule. + */ + current_generation = READ_ONCE(rng_info->generation); + + /* + * If @state->generation doesn't match the kernel RNG's generation, then it means the + * kernel's RNG has reseeded, and so @state->key is reseeded as well. + */ + if (unlikely(state->generation != current_generation)) { + /* + * Write the generation before filling the key, in case of fork. If there is a fork + * just after this line, the parent and child will get different random bytes from + * the syscall, which is good. However, were this line to occur after the getrandom + * syscall, then both child and parent could have the same bytes and the same + * generation counter, so the fork would not be detected. Therefore, write + * @state->generation before the call to the getrandom syscall. + */ + WRITE_ONCE(state->generation, current_generation); + + /* + * Prevent the syscall from being reordered wrt current_generation. Pairs with the + * smp_store_release(&vdso_k_rng_data->generation) in random.c. + */ + smp_rmb(); + + /* Reseed @state->key using fresh bytes from the kernel. */ + if (getrandom_syscall(state->key, sizeof(state->key), 0) != sizeof(state->key)) { + /* + * If the syscall failed to refresh the key, then @state->key is now + * invalid, so invalidate the generation so that it is not used again, and + * fallback to using the syscall entirely. + */ + WRITE_ONCE(state->generation, 0); + + /* + * Set @state->in_use to false only after the last write to @state in the + * line above. + */ + WRITE_ONCE(state->in_use, false); + + goto fallback_syscall; + } + + /* + * Set @state->pos to beyond the end of the batch, so that the batch is refilled + * using the new key. + */ + state->pos = sizeof(state->batch); + } + + /* Set len to the total amount of bytes that this function is allowed to read, ret. */ + len = ret; +more_batch: + /* + * First use bytes out of @state->batch, which may have been filled by the last call to this + * function. + */ + batch_len = min_t(size_t, sizeof(state->batch) - state->pos, len); + if (batch_len) { + /* Zeroing at the same time as memcpying helps preserve forward secrecy. */ + memcpy_and_zero_src(buffer, state->batch + state->pos, batch_len); + state->pos += batch_len; + buffer += batch_len; + len -= batch_len; + } + + if (!len) { + /* Prevent the loop from being reordered wrt ->generation. */ + barrier(); + + /* + * Since @rng_info->generation will never be 0, re-read @state->generation, rather + * than using the local current_generation variable, to learn whether a fork + * occurred or if @state was zeroed due to memory pressure. Primarily, though, this + * indicates whether the kernel's RNG has reseeded, in which case generate a new key + * and start over. + */ + if (unlikely(READ_ONCE(state->generation) != READ_ONCE(rng_info->generation))) { + /* + * Prevent this from looping forever in case of low memory or racing with a + * user force-reseeding the kernel's RNG using the ioctl. + */ + if (have_retried) { + WRITE_ONCE(state->in_use, false); + goto fallback_syscall; + } + + have_retried = true; + buffer = orig_buffer; + goto retry_generation; + } + + /* + * Set @state->in_use to false only when there will be no more reads or writes of + * @state. + */ + WRITE_ONCE(state->in_use, false); + return ret; + } + + /* Generate blocks of RNG output directly into @buffer while there's enough room left. */ + nblocks = len / CHACHA_BLOCK_SIZE; + if (nblocks) { + __arch_chacha20_blocks_nostack(buffer, state->key, counter, nblocks); + buffer += nblocks * CHACHA_BLOCK_SIZE; + len -= nblocks * CHACHA_BLOCK_SIZE; + } + + BUILD_BUG_ON(sizeof(state->batch_key) % CHACHA_BLOCK_SIZE != 0); + + /* Refill the batch and overwrite the key, in order to preserve forward secrecy. */ + __arch_chacha20_blocks_nostack(state->batch_key, state->key, counter, + sizeof(state->batch_key) / CHACHA_BLOCK_SIZE); + + /* Since the batch was just refilled, set the position back to 0 to indicate a full batch. */ + state->pos = 0; + goto more_batch; + +fallback_syscall: + return getrandom_syscall(orig_buffer, orig_len, flags); +} + +static __always_inline ssize_t +__cvdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len) +{ + return __cvdso_getrandom_data(__arch_get_vdso_u_rng_data(), buffer, len, flags, + opaque_state, opaque_len); +} diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c new file mode 100644 index 000000000000..95df0153f05a --- /dev/null +++ b/lib/vdso/gettimeofday.c @@ -0,0 +1,513 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Generic userspace implementations of gettimeofday() and similar. + */ +#include <vdso/auxclock.h> +#include <vdso/datapage.h> +#include <vdso/helpers.h> + +/* Bring in default accessors */ +#include <vdso/vsyscall.h> + +#ifndef vdso_calc_ns + +#ifdef VDSO_DELTA_NOMASK +# define VDSO_DELTA_MASK(vd) ULLONG_MAX +#else +# define VDSO_DELTA_MASK(vd) (vd->mask) +#endif + +#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT +static __always_inline bool vdso_delta_ok(const struct vdso_clock *vc, u64 delta) +{ + return delta < vc->max_cycles; +} +#else +static __always_inline bool vdso_delta_ok(const struct vdso_clock *vc, u64 delta) +{ + return true; +} +#endif + +#ifndef vdso_shift_ns +static __always_inline u64 vdso_shift_ns(u64 ns, u32 shift) +{ + return ns >> shift; +} +#endif + +/* + * Default implementation which works for all sane clocksources. That + * obviously excludes x86/TSC. + */ +static __always_inline u64 vdso_calc_ns(const struct vdso_clock *vc, u64 cycles, u64 base) +{ + u64 delta = (cycles - vc->cycle_last) & VDSO_DELTA_MASK(vc); + + if (likely(vdso_delta_ok(vc, delta))) + return vdso_shift_ns((delta * vc->mult) + base, vc->shift); + + return mul_u64_u32_add_u64_shr(delta, vc->mult, base, vc->shift); +} +#endif /* vdso_calc_ns */ + +#ifndef __arch_vdso_hres_capable +static inline bool __arch_vdso_hres_capable(void) +{ + return true; +} +#endif + +#ifndef vdso_clocksource_ok +static inline bool vdso_clocksource_ok(const struct vdso_clock *vc) +{ + return vc->clock_mode != VDSO_CLOCKMODE_NONE; +} +#endif + +#ifndef vdso_cycles_ok +static inline bool vdso_cycles_ok(u64 cycles) +{ + return true; +} +#endif + +static __always_inline bool vdso_clockid_valid(clockid_t clock) +{ + /* Check for negative values or invalid clocks */ + return likely((u32) clock <= CLOCK_AUX_LAST); +} + +/* + * Must not be invoked within the sequence read section as a race inside + * that loop could result in __iter_div_u64_rem() being extremely slow. + */ +static __always_inline void vdso_set_timespec(struct __kernel_timespec *ts, u64 sec, u64 ns) +{ + ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; +} + +static __always_inline +bool vdso_get_timestamp(const struct vdso_time_data *vd, const struct vdso_clock *vc, + unsigned int clkidx, u64 *sec, u64 *ns) +{ + const struct vdso_timestamp *vdso_ts = &vc->basetime[clkidx]; + u64 cycles; + + if (unlikely(!vdso_clocksource_ok(vc))) + return false; + + cycles = __arch_get_hw_counter(vc->clock_mode, vd); + if (unlikely(!vdso_cycles_ok(cycles))) + return false; + + *ns = vdso_calc_ns(vc, cycles, vdso_ts->nsec); + *sec = vdso_ts->sec; + + return true; +} + +static __always_inline +const struct vdso_time_data *__arch_get_vdso_u_timens_data(const struct vdso_time_data *vd) +{ + return (void *)vd + PAGE_SIZE; +} + +static __always_inline +bool do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, + clockid_t clk, struct __kernel_timespec *ts) +{ + const struct vdso_time_data *vd = __arch_get_vdso_u_timens_data(vdns); + const struct timens_offset *offs = &vcns->offset[clk]; + const struct vdso_clock *vc = vd->clock_data; + u32 seq; + s64 sec; + u64 ns; + + if (clk != CLOCK_MONOTONIC_RAW) + vc = &vc[CS_HRES_COARSE]; + else + vc = &vc[CS_RAW]; + + do { + seq = vdso_read_begin(vc); + + if (!vdso_get_timestamp(vd, vc, clk, &sec, &ns)) + return false; + } while (unlikely(vdso_read_retry(vc, seq))); + + /* Add the namespace offset */ + sec += offs->sec; + ns += offs->nsec; + + vdso_set_timespec(ts, sec, ns); + + return true; +} + +static __always_inline +bool do_hres(const struct vdso_time_data *vd, const struct vdso_clock *vc, + clockid_t clk, struct __kernel_timespec *ts) +{ + u64 sec, ns; + u32 seq; + + /* Allows to compile the high resolution parts out */ + if (!__arch_vdso_hres_capable()) + return false; + + do { + /* + * Open coded function vdso_read_begin() to handle + * VDSO_CLOCKMODE_TIMENS. Time namespace enabled tasks have a + * special VVAR page installed which has vc->seq set to 1 and + * vc->clock_mode set to VDSO_CLOCKMODE_TIMENS. For non time + * namespace affected tasks this does not affect performance + * because if vc->seq is odd, i.e. a concurrent update is in + * progress the extra check for vc->clock_mode is just a few + * extra instructions while spin waiting for vc->seq to become + * even again. + */ + while (unlikely((seq = READ_ONCE(vc->seq)) & 1)) { + if (IS_ENABLED(CONFIG_TIME_NS) && + vc->clock_mode == VDSO_CLOCKMODE_TIMENS) + return do_hres_timens(vd, vc, clk, ts); + cpu_relax(); + } + smp_rmb(); + + if (!vdso_get_timestamp(vd, vc, clk, &sec, &ns)) + return false; + } while (unlikely(vdso_read_retry(vc, seq))); + + vdso_set_timespec(ts, sec, ns); + + return true; +} + +static __always_inline +bool do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, + clockid_t clk, struct __kernel_timespec *ts) +{ + const struct vdso_time_data *vd = __arch_get_vdso_u_timens_data(vdns); + const struct timens_offset *offs = &vcns->offset[clk]; + const struct vdso_clock *vc = vd->clock_data; + const struct vdso_timestamp *vdso_ts; + u64 nsec; + s64 sec; + s32 seq; + + vdso_ts = &vc->basetime[clk]; + + do { + seq = vdso_read_begin(vc); + sec = vdso_ts->sec; + nsec = vdso_ts->nsec; + } while (unlikely(vdso_read_retry(vc, seq))); + + /* Add the namespace offset */ + sec += offs->sec; + nsec += offs->nsec; + + vdso_set_timespec(ts, sec, nsec); + + return true; +} + +static __always_inline +bool do_coarse(const struct vdso_time_data *vd, const struct vdso_clock *vc, + clockid_t clk, struct __kernel_timespec *ts) +{ + const struct vdso_timestamp *vdso_ts = &vc->basetime[clk]; + u32 seq; + + do { + /* + * Open coded function vdso_read_begin() to handle + * VDSO_CLOCK_TIMENS. See comment in do_hres(). + */ + while ((seq = READ_ONCE(vc->seq)) & 1) { + if (IS_ENABLED(CONFIG_TIME_NS) && + vc->clock_mode == VDSO_CLOCKMODE_TIMENS) + return do_coarse_timens(vd, vc, clk, ts); + cpu_relax(); + } + smp_rmb(); + + ts->tv_sec = vdso_ts->sec; + ts->tv_nsec = vdso_ts->nsec; + } while (unlikely(vdso_read_retry(vc, seq))); + + return true; +} + +static __always_inline +bool do_aux(const struct vdso_time_data *vd, clockid_t clock, struct __kernel_timespec *ts) +{ + const struct vdso_clock *vc; + u32 seq, idx; + u64 sec, ns; + + if (!IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) + return false; + + idx = clock - CLOCK_AUX; + vc = &vd->aux_clock_data[idx]; + + do { + /* + * Open coded function vdso_read_begin() to handle + * VDSO_CLOCK_TIMENS. See comment in do_hres(). + */ + while ((seq = READ_ONCE(vc->seq)) & 1) { + if (IS_ENABLED(CONFIG_TIME_NS) && vc->clock_mode == VDSO_CLOCKMODE_TIMENS) { + vd = __arch_get_vdso_u_timens_data(vd); + vc = &vd->aux_clock_data[idx]; + /* Re-read from the real time data page */ + continue; + } + cpu_relax(); + } + smp_rmb(); + + /* Auxclock disabled? */ + if (vc->clock_mode == VDSO_CLOCKMODE_NONE) + return false; + + if (!vdso_get_timestamp(vd, vc, VDSO_BASE_AUX, &sec, &ns)) + return false; + } while (unlikely(vdso_read_retry(vc, seq))); + + vdso_set_timespec(ts, sec, ns); + + return true; +} + +static __always_inline bool +__cvdso_clock_gettime_common(const struct vdso_time_data *vd, clockid_t clock, + struct __kernel_timespec *ts) +{ + const struct vdso_clock *vc = vd->clock_data; + u32 msk; + + if (!vdso_clockid_valid(clock)) + return false; + + /* + * Convert the clockid to a bitmask and use it to check which + * clocks are handled in the VDSO directly. + */ + msk = 1U << clock; + if (likely(msk & VDSO_HRES)) + vc = &vc[CS_HRES_COARSE]; + else if (msk & VDSO_COARSE) + return do_coarse(vd, &vc[CS_HRES_COARSE], clock, ts); + else if (msk & VDSO_RAW) + vc = &vc[CS_RAW]; + else if (msk & VDSO_AUX) + return do_aux(vd, clock, ts); + else + return false; + + return do_hres(vd, vc, clock, ts); +} + +static __maybe_unused int +__cvdso_clock_gettime_data(const struct vdso_time_data *vd, clockid_t clock, + struct __kernel_timespec *ts) +{ + bool ok; + + ok = __cvdso_clock_gettime_common(vd, clock, ts); + + if (unlikely(!ok)) + return clock_gettime_fallback(clock, ts); + return 0; +} + +static __maybe_unused int +__cvdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts) +{ + return __cvdso_clock_gettime_data(__arch_get_vdso_u_time_data(), clock, ts); +} + +#ifdef BUILD_VDSO32 +static __maybe_unused int +__cvdso_clock_gettime32_data(const struct vdso_time_data *vd, clockid_t clock, + struct old_timespec32 *res) +{ + struct __kernel_timespec ts; + bool ok; + + ok = __cvdso_clock_gettime_common(vd, clock, &ts); + + if (unlikely(!ok)) + return clock_gettime32_fallback(clock, res); + + /* For ok == true */ + res->tv_sec = ts.tv_sec; + res->tv_nsec = ts.tv_nsec; + + return 0; +} + +static __maybe_unused int +__cvdso_clock_gettime32(clockid_t clock, struct old_timespec32 *res) +{ + return __cvdso_clock_gettime32_data(__arch_get_vdso_u_time_data(), clock, res); +} +#endif /* BUILD_VDSO32 */ + +static __maybe_unused int +__cvdso_gettimeofday_data(const struct vdso_time_data *vd, + struct __kernel_old_timeval *tv, struct timezone *tz) +{ + const struct vdso_clock *vc = vd->clock_data; + + if (likely(tv != NULL)) { + struct __kernel_timespec ts; + + if (!do_hres(vd, &vc[CS_HRES_COARSE], CLOCK_REALTIME, &ts)) + return gettimeofday_fallback(tv, tz); + + tv->tv_sec = ts.tv_sec; + tv->tv_usec = (u32)ts.tv_nsec / NSEC_PER_USEC; + } + + if (unlikely(tz != NULL)) { + if (IS_ENABLED(CONFIG_TIME_NS) && + vc->clock_mode == VDSO_CLOCKMODE_TIMENS) + vd = __arch_get_vdso_u_timens_data(vd); + + tz->tz_minuteswest = vd[CS_HRES_COARSE].tz_minuteswest; + tz->tz_dsttime = vd[CS_HRES_COARSE].tz_dsttime; + } + + return 0; +} + +static __maybe_unused int +__cvdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz) +{ + return __cvdso_gettimeofday_data(__arch_get_vdso_u_time_data(), tv, tz); +} + +#ifdef VDSO_HAS_TIME +static __maybe_unused __kernel_old_time_t +__cvdso_time_data(const struct vdso_time_data *vd, __kernel_old_time_t *time) +{ + const struct vdso_clock *vc = vd->clock_data; + __kernel_old_time_t t; + + if (IS_ENABLED(CONFIG_TIME_NS) && + vc->clock_mode == VDSO_CLOCKMODE_TIMENS) { + vd = __arch_get_vdso_u_timens_data(vd); + vc = vd->clock_data; + } + + t = READ_ONCE(vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME].sec); + + if (time) + *time = t; + + return t; +} + +static __maybe_unused __kernel_old_time_t __cvdso_time(__kernel_old_time_t *time) +{ + return __cvdso_time_data(__arch_get_vdso_u_time_data(), time); +} +#endif /* VDSO_HAS_TIME */ + +#ifdef VDSO_HAS_CLOCK_GETRES +static __maybe_unused +bool __cvdso_clock_getres_common(const struct vdso_time_data *vd, clockid_t clock, + struct __kernel_timespec *res) +{ + const struct vdso_clock *vc = vd->clock_data; + u32 msk; + u64 ns; + + if (!vdso_clockid_valid(clock)) + return false; + + if (IS_ENABLED(CONFIG_TIME_NS) && + vc->clock_mode == VDSO_CLOCKMODE_TIMENS) + vd = __arch_get_vdso_u_timens_data(vd); + + /* + * Convert the clockid to a bitmask and use it to check which + * clocks are handled in the VDSO directly. + */ + msk = 1U << clock; + if (msk & (VDSO_HRES | VDSO_RAW)) { + /* + * Preserves the behaviour of posix_get_hrtimer_res(). + */ + ns = READ_ONCE(vd->hrtimer_res); + } else if (msk & VDSO_COARSE) { + /* + * Preserves the behaviour of posix_get_coarse_res(). + */ + ns = LOW_RES_NSEC; + } else if (msk & VDSO_AUX) { + ns = aux_clock_resolution_ns(); + } else { + return false; + } + + if (likely(res)) { + res->tv_sec = 0; + res->tv_nsec = ns; + } + return true; +} + +static __maybe_unused +int __cvdso_clock_getres_data(const struct vdso_time_data *vd, clockid_t clock, + struct __kernel_timespec *res) +{ + bool ok; + + ok = __cvdso_clock_getres_common(vd, clock, res); + + if (unlikely(!ok)) + return clock_getres_fallback(clock, res); + return 0; +} + +static __maybe_unused +int __cvdso_clock_getres(clockid_t clock, struct __kernel_timespec *res) +{ + return __cvdso_clock_getres_data(__arch_get_vdso_u_time_data(), clock, res); +} + +#ifdef BUILD_VDSO32 +static __maybe_unused int +__cvdso_clock_getres_time32_data(const struct vdso_time_data *vd, clockid_t clock, + struct old_timespec32 *res) +{ + struct __kernel_timespec ts; + bool ok; + + ok = __cvdso_clock_getres_common(vd, clock, &ts); + + if (unlikely(!ok)) + return clock_getres32_fallback(clock, res); + + if (likely(res)) { + res->tv_sec = ts.tv_sec; + res->tv_nsec = ts.tv_nsec; + } + return 0; +} + +static __maybe_unused int +__cvdso_clock_getres_time32(clockid_t clock, struct old_timespec32 *res) +{ + return __cvdso_clock_getres_time32_data(__arch_get_vdso_u_time_data(), + clock, res); +} +#endif /* BUILD_VDSO32 */ +#endif /* VDSO_HAS_CLOCK_GETRES */ |
