From 2707745533d6d38fa7d3a2212f1fd599c3879491 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Tue, 7 Jan 2020 02:06:29 +0100 Subject: time/sched_clock: Disable interrupts in sched_clock_register() Instead of issuing a warning if sched_clock_register() is called from a context where IRQs are enabled, the code now ensures that IRQs are indeed disabled. Signed-off-by: Paul Cercueil Signed-off-by: Thomas Gleixner Acked-by: Daniel Lezcano Link: https://lore.kernel.org/r/20200107010630.954648-1-paul@crapouillou.net --- kernel/time/sched_clock.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index dbd69052eaa6..e4332e3e2d56 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -169,14 +169,15 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { u64 res, wrap, new_mask, new_epoch, cyc, ns; u32 new_mult, new_shift; - unsigned long r; + unsigned long r, flags; char r_unit; struct clock_read_data rd; if (cd.rate > rate) return; - WARN_ON(!irqs_disabled()); + /* Cannot register a sched_clock with interrupts on */ + local_irq_save(flags); /* Calculate the mult/shift to convert counter ticks to ns. */ clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); @@ -233,6 +234,8 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) enable_sched_clock_irqtime(); + local_irq_restore(flags); + pr_debug("Registered %pS as sched_clock source\n", read); } -- cgit From 3b5584afeef05319ade0fbf5f634a64fd3e5772b Mon Sep 17 00:00:00 2001 From: Vincenzo Frascino Date: Fri, 30 Aug 2019 14:58:55 +0100 Subject: arm64: compat: vdso: Expose BUILD_VDSO32 clock_gettime32 and clock_getres_time32 should be compiled only with the 32 bit vdso library. Expose BUILD_VDSO32 when arm64 compat is compiled, to provide an indication to the generic library to include these symbols. 
Signed-off-by: Vincenzo Frascino Signed-off-by: Thomas Gleixner Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20190830135902.20861-2-vincenzo.frascino@arm.com --- arch/arm64/include/asm/vdso/compat_gettimeofday.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/include/asm/vdso/compat_gettimeofday.h b/arch/arm64/include/asm/vdso/compat_gettimeofday.h index c50ee1b7d5cd..fe7afe0f1a3d 100644 --- a/arch/arm64/include/asm/vdso/compat_gettimeofday.h +++ b/arch/arm64/include/asm/vdso/compat_gettimeofday.h @@ -17,6 +17,7 @@ #define VDSO_HAS_CLOCK_GETRES 1 #define VDSO_HAS_32BIT_FALLBACK 1 +#define BUILD_VDSO32 1 static __always_inline int gettimeofday_fallback(struct __kernel_old_timeval *_tv, -- cgit From 715f23b6104aa297feea20d4f200ca81941e23de Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Jan 2020 09:41:09 +0100 Subject: ARM: vdso: Set BUILD_VDSO32 and provide 32bit fallbacks Setting BUILD_VDSO32 is required to expose the legacy 32bit interfaces in the generic VDSO code which are going to be hidden behind an #ifdef BUILD_VDSO32. The 32bit fallbacks are necessary to remove the existing VDSO_HAS_32BIT_FALLBACK hackery. 
Signed-off-by: Thomas Gleixner Tested-by: Vincenzo Frascino Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/87tv4zq9dc.fsf@nanos.tec.linutronix.de --- arch/arm/include/asm/vdso/gettimeofday.h | 36 ++++++++++++++++++++++++++++++++ arch/arm/vdso/Makefile | 2 +- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/vdso/gettimeofday.h b/arch/arm/include/asm/vdso/gettimeofday.h index 0ad2429c324f..fe6e1f65932d 100644 --- a/arch/arm/include/asm/vdso/gettimeofday.h +++ b/arch/arm/include/asm/vdso/gettimeofday.h @@ -52,6 +52,24 @@ static __always_inline long clock_gettime_fallback( return ret; } +static __always_inline long clock_gettime32_fallback( + clockid_t _clkid, + struct old_timespec32 *_ts) +{ + register struct old_timespec32 *ts asm("r1") = _ts; + register clockid_t clkid asm("r0") = _clkid; + register long ret asm ("r0"); + register long nr asm("r7") = __NR_clock_gettime; + + asm volatile( + " swi #0\n" + : "=r" (ret) + : "r" (clkid), "r" (ts), "r" (nr) + : "memory"); + + return ret; +} + static __always_inline int clock_getres_fallback( clockid_t _clkid, struct __kernel_timespec *_ts) @@ -70,6 +88,24 @@ static __always_inline int clock_getres_fallback( return ret; } +static __always_inline int clock_getres32_fallback( + clockid_t _clkid, + struct old_timespec32 *_ts) +{ + register struct old_timespec32 *ts asm("r1") = _ts; + register clockid_t clkid asm("r0") = _clkid; + register long ret asm ("r0"); + register long nr asm("r7") = __NR_clock_getres; + + asm volatile( + " swi #0\n" + : "=r" (ret) + : "r" (clkid), "r" (ts), "r" (nr) + : "memory"); + + return ret; +} + static __always_inline u64 __arch_get_hw_counter(int clock_mode) { #ifdef CONFIG_ARM_ARCH_TIMER diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile index 0fda344beb0b..1babb392e70a 100644 --- a/arch/arm/vdso/Makefile +++ b/arch/arm/vdso/Makefile @@ -14,7 +14,7 @@ targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.so.raw vdso.lds obj-vdso := 
$(addprefix $(obj)/, $(obj-vdso)) ccflags-y := -fPIC -fno-common -fno-builtin -fno-stack-protector -ccflags-y += -DDISABLE_BRANCH_PROFILING +ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO32 ldflags-$(CONFIG_CPU_ENDIAN_BE8) := --be8 ldflags-y := -Bsymbolic --no-undefined -soname=linux-vdso.so.1 \ -- cgit From bf279849ad59538a1518c667c0795ec1fe9dbd66 Mon Sep 17 00:00:00 2001 From: Vincenzo Frascino Date: Fri, 30 Aug 2019 14:58:56 +0100 Subject: lib/vdso: Build 32 bit specific functions in the right context clock_gettime32 and clock_getres_time32 should be compiled only with a 32 bit vdso library. Exclude these symbols when BUILD_VDSO32 is not defined. Signed-off-by: Vincenzo Frascino Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20190830135902.20861-3-vincenzo.frascino@arm.com --- lib/vdso/gettimeofday.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index 42bd8ab955fa..8e77071a4a77 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -117,6 +117,7 @@ __cvdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts) return 0; } +#ifdef BUILD_VDSO32 static __maybe_unused int __cvdso_clock_gettime32(clockid_t clock, struct old_timespec32 *res) { @@ -139,6 +140,7 @@ __cvdso_clock_gettime32(clockid_t clock, struct old_timespec32 *res) } return ret; } +#endif /* BUILD_VDSO32 */ static __maybe_unused int __cvdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz) @@ -231,6 +233,7 @@ int __cvdso_clock_getres(clockid_t clock, struct __kernel_timespec *res) return 0; } +#ifdef BUILD_VDSO32 static __maybe_unused int __cvdso_clock_getres_time32(clockid_t clock, struct old_timespec32 *res) { @@ -253,4 +256,5 @@ __cvdso_clock_getres_time32(clockid_t clock, struct old_timespec32 *res) } return ret; } +#endif /* BUILD_VDSO32 */ #endif /* VDSO_HAS_CLOCK_GETRES */ -- cgit From b767081c07a400ff1c6f95b87639a9405886e7a6 Mon Sep 17 00:00:00 2001 
From: Vincenzo Frascino Date: Fri, 30 Aug 2019 14:58:58 +0100 Subject: lib/vdso: Remove VDSO_HAS_32BIT_FALLBACK VDSO_HAS_32BIT_FALLBACK was introduced to address a regression which caused seccomp to deny access to the applications to clock_gettime64() and clock_getres64() because they are not enabled in the existing filters. The purpose of VDSO_HAS_32BIT_FALLBACK was to simplify the conditional implementation of __cvdso_clock_get*time32() variants. Now that all the architectures that support the generic vDSO library have been converted to support the 32 bit fallbacks the conditional can be removed. Signed-off-by: Vincenzo Frascino Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20190830135902.20861-5-vincenzo.frascino@arm.com References: c60a32ea4f45 ("lib/vdso/32: Provide legacy syscall fallbacks") --- lib/vdso/gettimeofday.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index 8e77071a4a77..cd3aacf1cf86 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -126,13 +126,8 @@ __cvdso_clock_gettime32(clockid_t clock, struct old_timespec32 *res) ret = __cvdso_clock_gettime_common(clock, &ts); -#ifdef VDSO_HAS_32BIT_FALLBACK if (unlikely(ret)) return clock_gettime32_fallback(clock, res); -#else - if (unlikely(ret)) - ret = clock_gettime_fallback(clock, &ts); -#endif if (likely(!ret)) { res->tv_sec = ts.tv_sec; @@ -242,13 +237,8 @@ __cvdso_clock_getres_time32(clockid_t clock, struct old_timespec32 *res) ret = __cvdso_clock_getres_common(clock, &ts); -#ifdef VDSO_HAS_32BIT_FALLBACK if (unlikely(ret)) return clock_getres32_fallback(clock, res); -#else - if (unlikely(ret)) - ret = clock_getres_fallback(clock, &ts); -#endif if (likely(!ret && res)) { res->tv_sec = ts.tv_sec; -- cgit From a279235ddbe975670afe2267162028ec0a312293 Mon Sep 17 00:00:00 2001 From: Vincenzo Frascino Date: Fri, 30 Aug 2019 14:58:59 +0100 Subject: lib/vdso: Remove checks on return value for 32 bit vDSO 
Since all the architectures that support the generic vDSO library have been converted to support the 32 bit fallbacks it is not required anymore to check the return value of __cvdso_clock_get*time32_common() before updating the old_timespec fields. Remove the related checks from the generic vdso library. References: c60a32ea4f45 ("lib/vdso/32: Provide legacy syscall fallbacks") Signed-off-by: Vincenzo Frascino Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20190830135902.20861-6-vincenzo.frascino@arm.com --- lib/vdso/gettimeofday.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index cd3aacf1cf86..b676a9845def 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -129,10 +129,10 @@ __cvdso_clock_gettime32(clockid_t clock, struct old_timespec32 *res) if (unlikely(ret)) return clock_gettime32_fallback(clock, res); - if (likely(!ret)) { - res->tv_sec = ts.tv_sec; - res->tv_nsec = ts.tv_nsec; - } + /* For ret == 0 */ + res->tv_sec = ts.tv_sec; + res->tv_nsec = ts.tv_nsec; + return ret; } #endif /* BUILD_VDSO32 */ @@ -240,7 +240,7 @@ __cvdso_clock_getres_time32(clockid_t clock, struct old_timespec32 *res) if (unlikely(ret)) return clock_getres32_fallback(clock, res); - if (likely(!ret && res)) { + if (likely(res)) { res->tv_sec = ts.tv_sec; res->tv_nsec = ts.tv_nsec; } -- cgit From 972188f3a2dac07a6f000a4418776f446259fc87 Mon Sep 17 00:00:00 2001 From: Vincenzo Frascino Date: Fri, 30 Aug 2019 14:59:00 +0100 Subject: arm64: compat: vdso: Remove unused VDSO_HAS_32BIT_FALLBACK VDSO_HAS_32BIT_FALLBACK has been removed from the core since the architectures that support the generic vDSO library have been converted to support the 32 bit fallbacks. Remove unused VDSO_HAS_32BIT_FALLBACK from arm64 compat vdso. 
Signed-off-by: Vincenzo Frascino Signed-off-by: Thomas Gleixner Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20190830135902.20861-7-vincenzo.frascino@arm.com --- arch/arm64/include/asm/vdso/compat_gettimeofday.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/include/asm/vdso/compat_gettimeofday.h b/arch/arm64/include/asm/vdso/compat_gettimeofday.h index fe7afe0f1a3d..537b1e695365 100644 --- a/arch/arm64/include/asm/vdso/compat_gettimeofday.h +++ b/arch/arm64/include/asm/vdso/compat_gettimeofday.h @@ -16,7 +16,6 @@ #define VDSO_HAS_CLOCK_GETRES 1 -#define VDSO_HAS_32BIT_FALLBACK 1 #define BUILD_VDSO32 1 static __always_inline -- cgit From de0209f53aba44d9b57d0739a076dfb2db767584 Mon Sep 17 00:00:00 2001 From: Vincenzo Frascino Date: Fri, 30 Aug 2019 14:59:01 +0100 Subject: mips: vdso: Remove unused VDSO_HAS_32BIT_FALLBACK VDSO_HAS_32BIT_FALLBACK has been removed from the core since the architectures that support the generic vDSO library have been converted to support the 32 bit fallbacks. Remove unused VDSO_HAS_32BIT_FALLBACK from mips vdso. 
Signed-off-by: Vincenzo Frascino Signed-off-by: Thomas Gleixner Acked-by: Paul Burton Link: https://lore.kernel.org/r/20190830135902.20861-8-vincenzo.frascino@arm.com --- arch/mips/include/asm/vdso/gettimeofday.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/mips/include/asm/vdso/gettimeofday.h b/arch/mips/include/asm/vdso/gettimeofday.h index 0ae9b4cbc153..a58687e26c5d 100644 --- a/arch/mips/include/asm/vdso/gettimeofday.h +++ b/arch/mips/include/asm/vdso/gettimeofday.h @@ -96,8 +96,6 @@ static __always_inline int clock_getres_fallback( #if _MIPS_SIM != _MIPS_SIM_ABI64 -#define VDSO_HAS_32BIT_FALLBACK 1 - static __always_inline long clock_gettime32_fallback( clockid_t _clkid, struct old_timespec32 *_ts) -- cgit From 0b5c12332db5b71c6db0c102b3e6acc7c7c6a54d Mon Sep 17 00:00:00 2001 From: Vincenzo Frascino Date: Fri, 30 Aug 2019 14:59:02 +0100 Subject: x86/vdso: Remove unused VDSO_HAS_32BIT_FALLBACK VDSO_HAS_32BIT_FALLBACK has been removed from the core since the architectures that support the generic vDSO library have been converted to support the 32 bit fallbacks. Remove unused VDSO_HAS_32BIT_FALLBACK from x86 vdso. 
Signed-off-by: Vincenzo Frascino Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20190830135902.20861-9-vincenzo.frascino@arm.com --- arch/x86/include/asm/vdso/gettimeofday.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h index e9ee139cf29e..52c3bcd672cf 100644 --- a/arch/x86/include/asm/vdso/gettimeofday.h +++ b/arch/x86/include/asm/vdso/gettimeofday.h @@ -96,8 +96,6 @@ long clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) #else -#define VDSO_HAS_32BIT_FALLBACK 1 - static __always_inline long clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) { -- cgit From 8463cf80529d0fd80b84cd5ab8b9b952b01c7eb9 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 23 Dec 2019 14:31:07 +0000 Subject: lib/vdso: Let do_coarse() return 0 to simplify the callsite do_coarse() is similar to do_hres() except that it never fails. Change its type to int instead of void and let it always return success (0) to simplify the call site. 
Signed-off-by: Christophe Leroy Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/21e8afa38c02ca8672c2690307383507fe63b454.1577111367.git.christophe.leroy@c-s.fr --- lib/vdso/gettimeofday.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index b676a9845def..5a5ec899a21a 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -68,7 +68,7 @@ static int do_hres(const struct vdso_data *vd, clockid_t clk, return 0; } -static void do_coarse(const struct vdso_data *vd, clockid_t clk, +static int do_coarse(const struct vdso_data *vd, clockid_t clk, struct __kernel_timespec *ts) { const struct vdso_timestamp *vdso_ts = &vd->basetime[clk]; @@ -79,6 +79,8 @@ static void do_coarse(const struct vdso_data *vd, clockid_t clk, ts->tv_sec = vdso_ts->sec; ts->tv_nsec = vdso_ts->nsec; } while (unlikely(vdso_read_retry(vd, seq))); + + return 0; } static __maybe_unused int @@ -96,14 +98,13 @@ __cvdso_clock_gettime_common(clockid_t clock, struct __kernel_timespec *ts) * clocks are handled in the VDSO directly. */ msk = 1U << clock; - if (likely(msk & VDSO_HRES)) { + if (likely(msk & VDSO_HRES)) return do_hres(&vd[CS_HRES_COARSE], clock, ts); - } else if (msk & VDSO_COARSE) { - do_coarse(&vd[CS_HRES_COARSE], clock, ts); - return 0; - } else if (msk & VDSO_RAW) { + else if (msk & VDSO_COARSE) + return do_coarse(&vd[CS_HRES_COARSE], clock, ts); + else if (msk & VDSO_RAW) return do_hres(&vd[CS_RAW], clock, ts); - } + return -1; } -- cgit From cdb7c5a9c897ab2e5c56df647dd84c84e150e925 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 23 Dec 2019 14:31:09 +0000 Subject: lib/vdso: Avoid duplication in __cvdso_clock_getres() VDSO_HRES and VDSO_RAW clocks are handled the same way. Avoid the code duplication. 
Signed-off-by: Christophe Leroy Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/fdf1a968a8f7edd61456f1689ac44082ebb19c15.1577111367.git.christophe.leroy@c-s.fr --- lib/vdso/gettimeofday.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index 5a5ec899a21a..fac9e86ef124 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -193,7 +193,7 @@ int __cvdso_clock_getres_common(clockid_t clock, struct __kernel_timespec *res) * clocks are handled in the VDSO directly. */ msk = 1U << clock; - if (msk & VDSO_HRES) { + if (msk & (VDSO_HRES | VDSO_RAW)) { /* * Preserves the behaviour of posix_get_hrtimer_res(). */ @@ -203,11 +203,6 @@ int __cvdso_clock_getres_common(clockid_t clock, struct __kernel_timespec *res) * Preserves the behaviour of posix_get_coarse_res(). */ ns = LOW_RES_NSEC; - } else if (msk & VDSO_RAW) { - /* - * Preserves the behaviour of posix_get_hrtimer_res(). - */ - ns = hrtimer_res; } else { return -1; } -- cgit From 0898a16a362d436464b34fa644d0d46efc81df92 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:26:50 +0000 Subject: lib/vdso: Add unlikely() hint into vdso_read_begin() Place the branch with no concurrent write before the contended case. 
Performance numbers for Intel(R) Core(TM) i5-6300U CPU @ 2.40GHz (more clock_gettime() cycles - the better): | before | after ----------------------------------- | 150252214 | 153242367 | 150301112 | 153324800 | 150392773 | 153125401 | 150373957 | 153399355 | 150303157 | 153489417 | 150365237 | 153494270 ----------------------------------- avg | 150331408 | 153345935 diff % | 2 | 0 ----------------------------------- stdev % | 0.3 | 0.1 Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Tested-by: Vincenzo Frascino Reviewed-by: Vincenzo Frascino Link: https://lore.kernel.org/r/20191112012724.250792-2-dima@arista.com --- include/vdso/helpers.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/vdso/helpers.h b/include/vdso/helpers.h index 01641dbb68ef..9a2af9fca45e 100644 --- a/include/vdso/helpers.h +++ b/include/vdso/helpers.h @@ -10,7 +10,7 @@ static __always_inline u32 vdso_read_begin(const struct vdso_data *vd) { u32 seq; - while ((seq = READ_ONCE(vd->seq)) & 1) + while (unlikely((seq = READ_ONCE(vd->seq)) & 1)) cpu_relax(); smp_rmb(); -- cgit From c966533f8c6c45f93c52599f8460e7695f0b7eaa Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:26:51 +0000 Subject: lib/vdso: Mark do_hres() and do_coarse() as __always_inline Performance numbers for Intel(R) Core(TM) i5-6300U CPU @ 2.40GHz (more clock_gettime() cycles - the better): clock | before | after | diff ---------------------------------------------------------- monotonic | 153222105 | 166775025 | 8.8% monotonic-coarse | 671557054 | 691513017 | 3.0% monotonic-raw | 147116067 | 161057395 | 9.5% boottime | 153446224 | 166962668 | 9.1% The improvement for arm64 for monotonic and boottime is around 3.5%. 
clock | before | after | diff ================================================== monotonic 17326692 17951770 3.6% monotonic-coarse 43624027 44215292 1.3% monotonic-raw 17541809 17554932 0.1% boottime 17334982 17954361 3.5% [ tglx: Avoid the goto ] Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-3-dima@arista.com --- lib/vdso/gettimeofday.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index fac9e86ef124..b453d2469b63 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -38,7 +38,7 @@ u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult) } #endif -static int do_hres(const struct vdso_data *vd, clockid_t clk, +static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk, struct __kernel_timespec *ts) { const struct vdso_timestamp *vdso_ts = &vd->basetime[clk]; @@ -68,8 +68,8 @@ static int do_hres(const struct vdso_data *vd, clockid_t clk, return 0; } -static int do_coarse(const struct vdso_data *vd, clockid_t clk, - struct __kernel_timespec *ts) +static __always_inline int do_coarse(const struct vdso_data *vd, clockid_t clk, + struct __kernel_timespec *ts) { const struct vdso_timestamp *vdso_ts = &vd->basetime[clk]; u32 seq; @@ -99,13 +99,15 @@ __cvdso_clock_gettime_common(clockid_t clock, struct __kernel_timespec *ts) */ msk = 1U << clock; if (likely(msk & VDSO_HRES)) - return do_hres(&vd[CS_HRES_COARSE], clock, ts); + vd = &vd[CS_HRES_COARSE]; else if (msk & VDSO_COARSE) return do_coarse(&vd[CS_HRES_COARSE], clock, ts); else if (msk & VDSO_RAW) - return do_hres(&vd[CS_RAW], clock, ts); + vd = &vd[CS_RAW]; + else + return -1; - return -1; + return do_hres(vd, clock, ts); } static __maybe_unused int -- cgit From 769071ac9f20b6a447410c7eaa55d1a5233ef40c Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:26:52 +0000 Subject: ns: 
Introduce Time Namespace Time Namespace isolates clock values. The kernel provides access to several clocks CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME, etc. CLOCK_REALTIME System-wide clock that measures real (i.e., wall-clock) time. CLOCK_MONOTONIC Clock that cannot be set and represents monotonic time since some unspecified starting point. CLOCK_BOOTTIME Identical to CLOCK_MONOTONIC, except it also includes any time that the system is suspended. For many users, the time namespace means the ability to change date and time in a container (CLOCK_REALTIME). Providing per namespace notions of CLOCK_REALTIME would be complex with a massive overhead, but has a dubious value. But in the context of checkpoint/restore functionality, monotonic and boottime clocks become interesting. Both clocks are monotonic with unspecified starting points. These clocks are widely used to measure time slices and set timers. After restoring or migrating processes, it has to be guaranteed that they never go backward. In an ideal case, the behavior of these clocks should be the same as for a case when a whole system is suspended. All this means that it is required to set CLOCK_MONOTONIC and CLOCK_BOOTTIME clocks, which can be achieved by adding per-namespace offsets for clocks. A time namespace is similar to a pid namespace in the way it is created: unshare(CLONE_NEWTIME) system call creates a new time namespace, but doesn't set it to the current process. Then all children of the process will be born in the new time namespace, or a process can use the setns() system call to join a namespace. This scheme allows setting clock offsets for a namespace, before any processes appear in it. All available clone flags have been used, so CLONE_NEWTIME uses the highest bit of CSIGNAL. It means that it can be used only with the unshare() and the clone3() system calls. [ tglx: Adjusted paragraph about clone3() to reality and massaged the changelog a bit. 
] Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://criu.org/Time_namespace Link: https://lists.openvz.org/pipermail/criu/2018-June/041504.html Link: https://lore.kernel.org/r/20191112012724.250792-4-dima@arista.com --- MAINTAINERS | 2 + fs/proc/namespaces.c | 4 + include/linux/nsproxy.h | 2 + include/linux/proc_ns.h | 3 + include/linux/time_namespace.h | 71 ++++++++++++++ include/linux/user_namespace.h | 1 + include/uapi/linux/sched.h | 6 ++ init/Kconfig | 7 ++ kernel/fork.c | 16 ++- kernel/nsproxy.c | 41 ++++++-- kernel/time/Makefile | 1 + kernel/time/namespace.c | 217 +++++++++++++++++++++++++++++++++++++++++ 12 files changed, 361 insertions(+), 10 deletions(-) create mode 100644 include/linux/time_namespace.h create mode 100644 kernel/time/namespace.c diff --git a/MAINTAINERS b/MAINTAINERS index 8982c6e013b3..f6d00023e56f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13214,6 +13214,8 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core S: Maintained F: fs/timerfd.c F: include/linux/timer* +F: include/linux/time_namespace.h +F: kernel/time_namespace.c F: kernel/time/*timer* POWER MANAGEMENT CORE diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index dd2b35f78b09..8b5c720fe5d7 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -33,6 +33,10 @@ static const struct proc_ns_operations *ns_entries[] = { #ifdef CONFIG_CGROUPS &cgroupns_operations, #endif +#ifdef CONFIG_TIME_NS + &timens_operations, + &timens_for_children_operations, +#endif }; static const char *proc_ns_get_link(struct dentry *dentry, diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index 2ae1b1a4d84d..074f395b9ad2 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -35,6 +35,8 @@ struct nsproxy { struct mnt_namespace *mnt_ns; struct pid_namespace *pid_ns_for_children; struct net *net_ns; + struct time_namespace *time_ns; + struct 
time_namespace *time_ns_for_children; struct cgroup_namespace *cgroup_ns; }; extern struct nsproxy init_nsproxy; diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index d31cb6215905..d312e6281e69 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -32,6 +32,8 @@ extern const struct proc_ns_operations pidns_for_children_operations; extern const struct proc_ns_operations userns_operations; extern const struct proc_ns_operations mntns_operations; extern const struct proc_ns_operations cgroupns_operations; +extern const struct proc_ns_operations timens_operations; +extern const struct proc_ns_operations timens_for_children_operations; /* * We always define these enumerators @@ -43,6 +45,7 @@ enum { PROC_USER_INIT_INO = 0xEFFFFFFDU, PROC_PID_INIT_INO = 0xEFFFFFFCU, PROC_CGROUP_INIT_INO = 0xEFFFFFFBU, + PROC_TIME_INIT_INO = 0xEFFFFFFAU, }; #ifdef CONFIG_PROC_FS diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h new file mode 100644 index 000000000000..8c74cc12ad24 --- /dev/null +++ b/include/linux/time_namespace.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_TIMENS_H +#define _LINUX_TIMENS_H + + +#include +#include +#include +#include +#include + +struct user_namespace; +extern struct user_namespace init_user_ns; + +struct time_namespace { + struct kref kref; + struct user_namespace *user_ns; + struct ucounts *ucounts; + struct ns_common ns; +} __randomize_layout; + +extern struct time_namespace init_time_ns; + +#ifdef CONFIG_TIME_NS +static inline struct time_namespace *get_time_ns(struct time_namespace *ns) +{ + kref_get(&ns->kref); + return ns; +} + +struct time_namespace *copy_time_ns(unsigned long flags, + struct user_namespace *user_ns, + struct time_namespace *old_ns); +void free_time_ns(struct kref *kref); +int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk); + +static inline void put_time_ns(struct time_namespace *ns) +{ + kref_put(&ns->kref, free_time_ns); +} + 
+#else +static inline struct time_namespace *get_time_ns(struct time_namespace *ns) +{ + return NULL; +} + +static inline void put_time_ns(struct time_namespace *ns) +{ +} + +static inline +struct time_namespace *copy_time_ns(unsigned long flags, + struct user_namespace *user_ns, + struct time_namespace *old_ns) +{ + if (flags & CLONE_NEWTIME) + return ERR_PTR(-EINVAL); + + return old_ns; +} + +static inline int timens_on_fork(struct nsproxy *nsproxy, + struct task_struct *tsk) +{ + return 0; +} + +#endif + +#endif /* _LINUX_TIMENS_H */ diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index fb9f4f799554..6ef1c7109fc4 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -45,6 +45,7 @@ enum ucount_type { UCOUNT_NET_NAMESPACES, UCOUNT_MNT_NAMESPACES, UCOUNT_CGROUP_NAMESPACES, + UCOUNT_TIME_NAMESPACES, #ifdef CONFIG_INOTIFY_USER UCOUNT_INOTIFY_INSTANCES, UCOUNT_INOTIFY_WATCHES, diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 4a0217832464..2e3bc22c6f20 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -36,6 +36,12 @@ /* Flags for the clone3() syscall. */ #define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */ +/* + * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 + * syscalls only: + */ +#define CLONE_NEWTIME 0x00000080 /* New time namespace */ + #ifndef __ASSEMBLY__ /** * struct clone_args - arguments for the clone3 syscall diff --git a/init/Kconfig b/init/Kconfig index a34064a031a5..b34314fc75f7 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1080,6 +1080,13 @@ config UTS_NS In this namespace tasks see different info provided with the uname() system call +config TIME_NS + bool "TIME namespace" + default y + help + In this namespace boottime and monotonic clocks can be set. + The time will keep going with the same pace. 
+ config IPC_NS bool "IPC namespace" depends on (SYSVIPC || POSIX_MQUEUE) diff --git a/kernel/fork.c b/kernel/fork.c index 2508a4f238a3..363595815144 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1832,6 +1832,7 @@ static __latent_entropy struct task_struct *copy_process( struct multiprocess_signals delayed; struct file *pidfile = NULL; u64 clone_flags = args->flags; + struct nsproxy *nsp = current->nsproxy; /* * Don't allow sharing the root directory with processes in a different @@ -1874,8 +1875,16 @@ static __latent_entropy struct task_struct *copy_process( */ if (clone_flags & CLONE_THREAD) { if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || - (task_active_pid_ns(current) != - current->nsproxy->pid_ns_for_children)) + (task_active_pid_ns(current) != nsp->pid_ns_for_children)) + return ERR_PTR(-EINVAL); + } + + /* + * If the new process will be in a different time namespace + * do not allow it to share VM or a thread group with the forking task. + */ + if (clone_flags & (CLONE_THREAD | CLONE_VM)) { + if (nsp->time_ns != nsp->time_ns_for_children) return ERR_PTR(-EINVAL); } @@ -2811,7 +2820,8 @@ static int check_unshare_flags(unsigned long unshare_flags) if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| - CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP)) + CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP| + CLONE_NEWTIME)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index c815f58e6bc0..ed9882108cd2 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -40,6 +41,10 @@ struct nsproxy init_nsproxy = { #ifdef CONFIG_CGROUPS .cgroup_ns = &init_cgroup_ns, #endif +#ifdef CONFIG_TIME_NS + .time_ns = &init_time_ns, + .time_ns_for_children = &init_time_ns, +#endif }; static inline struct nsproxy 
*create_nsproxy(void) @@ -106,8 +111,18 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_net; } + new_nsp->time_ns_for_children = copy_time_ns(flags, user_ns, + tsk->nsproxy->time_ns_for_children); + if (IS_ERR(new_nsp->time_ns_for_children)) { + err = PTR_ERR(new_nsp->time_ns_for_children); + goto out_time; + } + new_nsp->time_ns = get_time_ns(tsk->nsproxy->time_ns); + return new_nsp; +out_time: + put_net(new_nsp->net_ns); out_net: put_cgroup_ns(new_nsp->cgroup_ns); out_cgroup: @@ -136,15 +151,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) struct nsproxy *old_ns = tsk->nsproxy; struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); struct nsproxy *new_ns; + int ret; if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWPID | CLONE_NEWNET | - CLONE_NEWCGROUP)))) { - get_nsproxy(old_ns); - return 0; - } - - if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + CLONE_NEWCGROUP | CLONE_NEWTIME)))) { + if (likely(old_ns->time_ns_for_children == old_ns->time_ns)) { + get_nsproxy(old_ns); + return 0; + } + } else if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; /* @@ -162,6 +178,12 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) if (IS_ERR(new_ns)) return PTR_ERR(new_ns); + ret = timens_on_fork(new_ns, tsk); + if (ret) { + free_nsproxy(new_ns); + return ret; + } + tsk->nsproxy = new_ns; return 0; } @@ -176,6 +198,10 @@ void free_nsproxy(struct nsproxy *ns) put_ipc_ns(ns->ipc_ns); if (ns->pid_ns_for_children) put_pid_ns(ns->pid_ns_for_children); + if (ns->time_ns) + put_time_ns(ns->time_ns); + if (ns->time_ns_for_children) + put_time_ns(ns->time_ns_for_children); put_cgroup_ns(ns->cgroup_ns); put_net(ns->net_ns); kmem_cache_free(nsproxy_cachep, ns); @@ -192,7 +218,8 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP))) + CLONE_NEWNET | 
CLONE_NEWPID | CLONE_NEWCGROUP | + CLONE_NEWTIME))) return 0; user_ns = new_cred ? new_cred->user_ns : current_user_ns(); diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 1867044800bb..c8f00168afe8 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o +obj-$(CONFIG_TIME_NS) += namespace.o diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c new file mode 100644 index 000000000000..2662a69e0382 --- /dev/null +++ b/kernel/time/namespace.c @@ -0,0 +1,217 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Author: Andrei Vagin + * Author: Dmitry Safonov + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct ucounts *inc_time_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES); +} + +static void dec_time_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_TIME_NAMESPACES); +} + +/** + * clone_time_ns - Clone a time namespace + * @user_ns: User namespace which owns a new namespace. + * @old_ns: Namespace to clone + * + * Clone @old_ns and set the clone refcount to 1 + * + * Return: The new namespace or ERR_PTR. 
+ */ +static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, + struct time_namespace *old_ns) +{ + struct time_namespace *ns; + struct ucounts *ucounts; + int err; + + err = -ENOSPC; + ucounts = inc_time_namespaces(user_ns); + if (!ucounts) + goto fail; + + err = -ENOMEM; + ns = kmalloc(sizeof(*ns), GFP_KERNEL); + if (!ns) + goto fail_dec; + + kref_init(&ns->kref); + + err = ns_alloc_inum(&ns->ns); + if (err) + goto fail_free; + + ns->ucounts = ucounts; + ns->ns.ops = &timens_operations; + ns->user_ns = get_user_ns(user_ns); + return ns; + +fail_free: + kfree(ns); +fail_dec: + dec_time_namespaces(ucounts); +fail: + return ERR_PTR(err); +} + +/** + * copy_time_ns - Create timens_for_children from @old_ns + * @flags: Cloning flags + * @user_ns: User namespace which owns a new namespace. + * @old_ns: Namespace to clone + * + * If CLONE_NEWTIME specified in @flags, creates a new timens_for_children; + * adds a refcounter to @old_ns otherwise. + * + * Return: timens_for_children namespace or ERR_PTR. + */ +struct time_namespace *copy_time_ns(unsigned long flags, + struct user_namespace *user_ns, struct time_namespace *old_ns) +{ + if (!(flags & CLONE_NEWTIME)) + return get_time_ns(old_ns); + + return clone_time_ns(user_ns, old_ns); +} + +void free_time_ns(struct kref *kref) +{ + struct time_namespace *ns; + + ns = container_of(kref, struct time_namespace, kref); + dec_time_namespaces(ns->ucounts); + put_user_ns(ns->user_ns); + ns_free_inum(&ns->ns); + kfree(ns); +} + +static struct time_namespace *to_time_ns(struct ns_common *ns) +{ + return container_of(ns, struct time_namespace, ns); +} + +static struct ns_common *timens_get(struct task_struct *task) +{ + struct time_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->time_ns; + get_time_ns(ns); + } + task_unlock(task); + + return ns ? 
&ns->ns : NULL; +} + +static struct ns_common *timens_for_children_get(struct task_struct *task) +{ + struct time_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->time_ns_for_children; + get_time_ns(ns); + } + task_unlock(task); + + return ns ? &ns->ns : NULL; +} + +static void timens_put(struct ns_common *ns) +{ + put_time_ns(to_time_ns(ns)); +} + +static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) +{ + struct time_namespace *ns = to_time_ns(new); + + if (!current_is_single_threaded()) + return -EUSERS; + + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + return -EPERM; + + get_time_ns(ns); + put_time_ns(nsproxy->time_ns); + nsproxy->time_ns = ns; + + get_time_ns(ns); + put_time_ns(nsproxy->time_ns_for_children); + nsproxy->time_ns_for_children = ns; + return 0; +} + +int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) +{ + struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; + struct time_namespace *ns = to_time_ns(nsc); + + /* create_new_namespaces() already incremented the ref counter */ + if (nsproxy->time_ns == nsproxy->time_ns_for_children) + return 0; + + get_time_ns(ns); + put_time_ns(nsproxy->time_ns); + nsproxy->time_ns = ns; + + return 0; +} + +static struct user_namespace *timens_owner(struct ns_common *ns) +{ + return to_time_ns(ns)->user_ns; +} + +const struct proc_ns_operations timens_operations = { + .name = "time", + .type = CLONE_NEWTIME, + .get = timens_get, + .put = timens_put, + .install = timens_install, + .owner = timens_owner, +}; + +const struct proc_ns_operations timens_for_children_operations = { + .name = "time_for_children", + .type = CLONE_NEWTIME, + .get = timens_for_children_get, + .put = timens_put, + .install = timens_install, + .owner = timens_owner, +}; + +struct time_namespace init_time_ns = { + .kref = KREF_INIT(3), + .user_ns = &init_user_ns, + 
.ns.inum = PROC_TIME_INIT_INO, + .ns.ops = &timens_operations, +}; + +static int __init time_ns_init(void) +{ + return 0; +} +subsys_initcall(time_ns_init); -- cgit From af993f58d69ee9c1f421dfc87c3ed231c113989c Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:26:53 +0000 Subject: time: Add timens_offsets to be used for tasks in time namespace Introduce offsets for time namespace. They will contain an adjustment needed to convert clocks to/from host's. A new namespace is created with the same offsets as the time namespace of the current process. Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-5-dima@arista.com --- include/linux/time_namespace.h | 22 ++++++++++++++++++++++ kernel/time/namespace.c | 2 ++ 2 files changed, 24 insertions(+) diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 8c74cc12ad24..d7e3b4994e31 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -12,11 +12,17 @@ struct user_namespace; extern struct user_namespace init_user_ns; +struct timens_offsets { + struct timespec64 monotonic; + struct timespec64 boottime; +}; + struct time_namespace { struct kref kref; struct user_namespace *user_ns; struct ucounts *ucounts; struct ns_common ns; + struct timens_offsets offsets; } __randomize_layout; extern struct time_namespace init_time_ns; @@ -39,6 +45,20 @@ static inline void put_time_ns(struct time_namespace *ns) kref_put(&ns->kref, free_time_ns); } +static inline void timens_add_monotonic(struct timespec64 *ts) +{ + struct timens_offsets *ns_offsets = &current->nsproxy->time_ns->offsets; + + *ts = timespec64_add(*ts, ns_offsets->monotonic); +} + +static inline void timens_add_boottime(struct timespec64 *ts) +{ + struct timens_offsets *ns_offsets = &current->nsproxy->time_ns->offsets; + + *ts = timespec64_add(*ts, ns_offsets->boottime); +} + #else static
inline struct time_namespace *get_time_ns(struct time_namespace *ns) { @@ -66,6 +86,8 @@ static inline int timens_on_fork(struct nsproxy *nsproxy, return 0; } +static inline void timens_add_monotonic(struct timespec64 *ts) { } +static inline void timens_add_boottime(struct timespec64 *ts) { } #endif #endif /* _LINUX_TIMENS_H */ diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 2662a69e0382..c2a58e45fc4b 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -14,6 +14,7 @@ #include #include #include +#include static struct ucounts *inc_time_namespaces(struct user_namespace *ns) { @@ -60,6 +61,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, ns->ucounts = ucounts; ns->ns.ops = &timens_operations; ns->user_ns = get_user_ns(user_ns); + ns->offsets = old_ns->offsets; return ns; fail_free: -- cgit From 819a95fe3adfc7b558bfd96dd5ac589c4f543fd4 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:26:54 +0000 Subject: posix-clocks: Rename the clock_get() callback to clock_get_timespec() The upcoming support for time namespaces requires to have access to: - The time in a task's time namespace for sys_clock_gettime() - The time in the root name space for common_timer_get() That adds a valid reason to finally implement a separate callback which returns the time in ktime_t format, rather than in (struct timespec). Rename the clock_get() callback to clock_get_timespec() as a preparation for introducing clock_get_ktime(). 
Suggested-by: Thomas Gleixner Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-6-dima@arista.com --- kernel/time/alarmtimer.c | 4 ++-- kernel/time/posix-clock.c | 8 ++++---- kernel/time/posix-cpu-timers.c | 32 ++++++++++++++++---------------- kernel/time/posix-timers.c | 22 +++++++++++----------- kernel/time/posix-timers.h | 4 ++-- 5 files changed, 35 insertions(+), 35 deletions(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 451f9d05ccfe..8523df726fee 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -657,7 +657,7 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp } /** - * alarm_clock_get - posix clock_get interface + * alarm_clock_get - posix clock_get_timespec interface * @which_clock: clockid * @tp: timespec to fill. * @@ -837,7 +837,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, const struct k_clock alarm_clock = { .clock_getres = alarm_clock_getres, - .clock_get = alarm_clock_get, + .clock_get_timespec = alarm_clock_get, .timer_create = alarm_timer_create, .timer_set = common_timer_set, .timer_del = common_timer_del, diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 200fb2d3be99..77c0c2370b6d 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -310,8 +310,8 @@ out: } const struct k_clock clock_posix_dynamic = { - .clock_getres = pc_clock_getres, - .clock_set = pc_clock_settime, - .clock_get = pc_clock_gettime, - .clock_adj = pc_clock_adjtime, + .clock_getres = pc_clock_getres, + .clock_set = pc_clock_settime, + .clock_get_timespec = pc_clock_gettime, + .clock_adj = pc_clock_adjtime, }; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 42d512fcfda2..8ff6da77a01f 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ 
-1391,26 +1391,26 @@ static int thread_cpu_timer_create(struct k_itimer *timer) } const struct k_clock clock_posix_cpu = { - .clock_getres = posix_cpu_clock_getres, - .clock_set = posix_cpu_clock_set, - .clock_get = posix_cpu_clock_get, - .timer_create = posix_cpu_timer_create, - .nsleep = posix_cpu_nsleep, - .timer_set = posix_cpu_timer_set, - .timer_del = posix_cpu_timer_del, - .timer_get = posix_cpu_timer_get, - .timer_rearm = posix_cpu_timer_rearm, + .clock_getres = posix_cpu_clock_getres, + .clock_set = posix_cpu_clock_set, + .clock_get_timespec = posix_cpu_clock_get, + .timer_create = posix_cpu_timer_create, + .nsleep = posix_cpu_nsleep, + .timer_set = posix_cpu_timer_set, + .timer_del = posix_cpu_timer_del, + .timer_get = posix_cpu_timer_get, + .timer_rearm = posix_cpu_timer_rearm, }; const struct k_clock clock_process = { - .clock_getres = process_cpu_clock_getres, - .clock_get = process_cpu_clock_get, - .timer_create = process_cpu_timer_create, - .nsleep = process_cpu_nsleep, + .clock_getres = process_cpu_clock_getres, + .clock_get_timespec = process_cpu_clock_get, + .timer_create = process_cpu_timer_create, + .nsleep = process_cpu_nsleep, }; const struct k_clock clock_thread = { - .clock_getres = thread_cpu_clock_getres, - .clock_get = thread_cpu_clock_get, - .timer_create = thread_cpu_timer_create, + .clock_getres = thread_cpu_clock_getres, + .clock_get_timespec = thread_cpu_clock_get, + .timer_create = thread_cpu_timer_create, }; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 0ec5b7a1d769..44d4f9cb782d 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -667,7 +667,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) * The timespec64 based conversion is suboptimal, but it's not * worth to implement yet another callback. 
*/ - kc->clock_get(timr->it_clock, &ts64); + kc->clock_get_timespec(timr->it_clock, &ts64); now = timespec64_to_ktime(ts64); /* @@ -781,7 +781,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, * Posix magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they become CLOCK_MONOTONIC based under the * hood. See hrtimer_init(). Update timr->kclock, so the generic - * functions which use timr->kclock->clock_get() work. + * functions which use timr->kclock->clock_get_timespec() work. * * Note: it_clock stays unmodified, because the next timer_set() might * use ABSTIME, so it needs to switch back. @@ -1067,7 +1067,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, if (!kc) return -EINVAL; - error = kc->clock_get(which_clock, &kernel_tp); + error = kc->clock_get_timespec(which_clock, &kernel_tp); if (!error && put_timespec64(&kernel_tp, tp)) error = -EFAULT; @@ -1149,7 +1149,7 @@ SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock, if (!kc) return -EINVAL; - err = kc->clock_get(which_clock, &ts); + err = kc->clock_get_timespec(which_clock, &ts); if (!err && put_old_timespec32(&ts, tp)) err = -EFAULT; @@ -1261,7 +1261,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, static const struct k_clock clock_realtime = { .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_clock_realtime_get, + .clock_get_timespec = posix_clock_realtime_get, .clock_set = posix_clock_realtime_set, .clock_adj = posix_clock_realtime_adj, .nsleep = common_nsleep, @@ -1279,7 +1279,7 @@ static const struct k_clock clock_realtime = { static const struct k_clock clock_monotonic = { .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_ktime_get_ts, + .clock_get_timespec = posix_ktime_get_ts, .nsleep = common_nsleep, .timer_create = common_timer_create, .timer_set = common_timer_set, @@ -1295,22 +1295,22 @@ static const struct k_clock clock_monotonic = { static const struct k_clock 
clock_monotonic_raw = { .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_monotonic_raw, + .clock_get_timespec = posix_get_monotonic_raw, }; static const struct k_clock clock_realtime_coarse = { .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_realtime_coarse, + .clock_get_timespec = posix_get_realtime_coarse, }; static const struct k_clock clock_monotonic_coarse = { .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_monotonic_coarse, + .clock_get_timespec = posix_get_monotonic_coarse, }; static const struct k_clock clock_tai = { .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_tai, + .clock_get_timespec = posix_get_tai, .nsleep = common_nsleep, .timer_create = common_timer_create, .timer_set = common_timer_set, @@ -1326,7 +1326,7 @@ static const struct k_clock clock_tai = { static const struct k_clock clock_boottime = { .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_boottime, + .clock_get_timespec = posix_get_boottime, .nsleep = common_nsleep, .timer_create = common_timer_create, .timer_set = common_timer_set, diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 897c29e162b9..070611b2c253 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -6,8 +6,8 @@ struct k_clock { struct timespec64 *tp); int (*clock_set)(const clockid_t which_clock, const struct timespec64 *tp); - int (*clock_get)(const clockid_t which_clock, - struct timespec64 *tp); + int (*clock_get_timespec)(const clockid_t which_clock, + struct timespec64 *tp); int (*clock_adj)(const clockid_t which_clock, struct __kernel_timex *tx); int (*timer_create)(struct k_itimer *timer); int (*nsleep)(const clockid_t which_clock, int flags, -- cgit From eaf80194d0fe48be393587541c48a799a9a06a70 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:26:55 +0000 Subject: posix-clocks: Rename .clock_get_timespec() callbacks accordingly The upcoming support for time namespaces requires to 
have access to: - The time in a task's time namespace for sys_clock_gettime() - The time in the root name space for common_timer_get() That adds a valid reason to finally implement a separate callback which returns the time in ktime_t format in (struct k_clock). As a preparation ground for introducing clock_get_ktime(), the original callback clock_get() was renamed into clock_get_timespec(). Reflect the renaming into the callback implementations. Suggested-by: Thomas Gleixner Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-7-dima@arista.com --- kernel/time/alarmtimer.c | 6 +++--- kernel/time/posix-timers.c | 16 ++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 8523df726fee..62b06cfa710d 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -657,13 +657,13 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp } /** - * alarm_clock_get - posix clock_get_timespec interface + * alarm_clock_get_timespec - posix clock_get_timespec interface * @which_clock: clockid * @tp: timespec to fill. * * Provides the underlying alarm base time. 
*/ -static int alarm_clock_get(clockid_t which_clock, struct timespec64 *tp) +static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp) { struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; @@ -837,7 +837,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, const struct k_clock alarm_clock = { .clock_getres = alarm_clock_getres, - .clock_get_timespec = alarm_clock_get, + .clock_get_timespec = alarm_clock_get_timespec, .timer_create = alarm_timer_create, .timer_set = common_timer_set, .timer_del = common_timer_del, diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 44d4f9cb782d..68d4690cc225 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -165,7 +165,7 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) } /* Get clock_realtime */ -static int posix_clock_realtime_get(clockid_t which_clock, struct timespec64 *tp) +static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_real_ts64(tp); return 0; @@ -187,7 +187,7 @@ static int posix_clock_realtime_adj(const clockid_t which_clock, /* * Get monotonic time for posix timers */ -static int posix_ktime_get_ts(clockid_t which_clock, struct timespec64 *tp) +static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_ts64(tp); return 0; @@ -222,13 +222,13 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 * return 0; } -static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp) +static int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp) { ktime_get_boottime_ts64(tp); return 0; } -static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp) +static int posix_get_tai_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_clocktai_ts64(tp); return 0; @@ -1261,7 +1261,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, 
clockid_t, which_clock, int, flags, static const struct k_clock clock_realtime = { .clock_getres = posix_get_hrtimer_res, - .clock_get_timespec = posix_clock_realtime_get, + .clock_get_timespec = posix_get_realtime_timespec, .clock_set = posix_clock_realtime_set, .clock_adj = posix_clock_realtime_adj, .nsleep = common_nsleep, @@ -1279,7 +1279,7 @@ static const struct k_clock clock_realtime = { static const struct k_clock clock_monotonic = { .clock_getres = posix_get_hrtimer_res, - .clock_get_timespec = posix_ktime_get_ts, + .clock_get_timespec = posix_get_monotonic_timespec, .nsleep = common_nsleep, .timer_create = common_timer_create, .timer_set = common_timer_set, @@ -1310,7 +1310,7 @@ static const struct k_clock clock_monotonic_coarse = { static const struct k_clock clock_tai = { .clock_getres = posix_get_hrtimer_res, - .clock_get_timespec = posix_get_tai, + .clock_get_timespec = posix_get_tai_timespec, .nsleep = common_nsleep, .timer_create = common_timer_create, .timer_set = common_timer_set, @@ -1326,7 +1326,7 @@ static const struct k_clock clock_tai = { static const struct k_clock clock_boottime = { .clock_getres = posix_get_hrtimer_res, - .clock_get_timespec = posix_get_boottime, + .clock_get_timespec = posix_get_boottime_timespec, .nsleep = common_nsleep, .timer_create = common_timer_create, .timer_set = common_timer_set, -- cgit From 41b3b8dffc1f84e581addfbc09bec0289db3315e Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:26:56 +0000 Subject: alarmtimer: Rename gettime() callback to get_ktime() The upcoming support for time namespaces requires to have access to: - The time in a task's time namespace for sys_clock_gettime() - The time in the root name space for common_timer_get() struct alarm_base needs to follow the same naming convention, so rename .gettime() callback into get_ktime() as a preparation for introducing get_timespec().
Suggested-by: Thomas Gleixner Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-8-dima@arista.com --- kernel/time/alarmtimer.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 62b06cfa710d..22b6f9b133b2 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -36,13 +36,13 @@ * struct alarm_base - Alarm timer bases * @lock: Lock for syncrhonized access to the base * @timerqueue: Timerqueue head managing the list of events - * @gettime: Function to read the time correlating to the base + * @get_ktime: Function to read the time correlating to the base * @base_clockid: clockid for the base */ static struct alarm_base { spinlock_t lock; struct timerqueue_head timerqueue; - ktime_t (*gettime)(void); + ktime_t (*get_ktime)(void); clockid_t base_clockid; } alarm_bases[ALARM_NUMTYPE]; @@ -207,7 +207,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) spin_unlock_irqrestore(&base->lock, flags); if (alarm->function) - restart = alarm->function(alarm, base->gettime()); + restart = alarm->function(alarm, base->get_ktime()); spin_lock_irqsave(&base->lock, flags); if (restart != ALARMTIMER_NORESTART) { @@ -217,7 +217,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) } spin_unlock_irqrestore(&base->lock, flags); - trace_alarmtimer_fired(alarm, base->gettime()); + trace_alarmtimer_fired(alarm, base->get_ktime()); return ret; } @@ -225,7 +225,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) ktime_t alarm_expires_remaining(const struct alarm *alarm) { struct alarm_base *base = &alarm_bases[alarm->type]; - return ktime_sub(alarm->node.expires, base->gettime()); + return ktime_sub(alarm->node.expires, base->get_ktime()); } EXPORT_SYMBOL_GPL(alarm_expires_remaining); @@ 
-270,7 +270,7 @@ static int alarmtimer_suspend(struct device *dev) spin_unlock_irqrestore(&base->lock, flags); if (!next) continue; - delta = ktime_sub(next->expires, base->gettime()); + delta = ktime_sub(next->expires, base->get_ktime()); if (!min || (delta < min)) { expires = next->expires; min = delta; @@ -364,7 +364,7 @@ void alarm_start(struct alarm *alarm, ktime_t start) hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); spin_unlock_irqrestore(&base->lock, flags); - trace_alarmtimer_start(alarm, base->gettime()); + trace_alarmtimer_start(alarm, base->get_ktime()); } EXPORT_SYMBOL_GPL(alarm_start); @@ -377,7 +377,7 @@ void alarm_start_relative(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; - start = ktime_add_safe(start, base->gettime()); + start = ktime_add_safe(start, base->get_ktime()); alarm_start(alarm, start); } EXPORT_SYMBOL_GPL(alarm_start_relative); @@ -414,7 +414,7 @@ int alarm_try_to_cancel(struct alarm *alarm) alarmtimer_dequeue(base, alarm); spin_unlock_irqrestore(&base->lock, flags); - trace_alarmtimer_cancel(alarm, base->gettime()); + trace_alarmtimer_cancel(alarm, base->get_ktime()); return ret; } EXPORT_SYMBOL_GPL(alarm_try_to_cancel); @@ -474,7 +474,7 @@ u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) { struct alarm_base *base = &alarm_bases[alarm->type]; - return alarm_forward(alarm, base->gettime(), interval); + return alarm_forward(alarm, base->get_ktime(), interval); } EXPORT_SYMBOL_GPL(alarm_forward_now); @@ -500,7 +500,7 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) return; } - delta = ktime_sub(absexp, base->gettime()); + delta = ktime_sub(absexp, base->get_ktime()); spin_lock_irqsave(&freezer_delta_lock, flags); if (!freezer_delta || (delta < freezer_delta)) { @@ -632,7 +632,7 @@ static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires, struct alarm_base *base = &alarm_bases[alarm->type]; if (!absolute) - expires = 
ktime_add_safe(expires, base->gettime()); + expires = ktime_add_safe(expires, base->get_ktime()); if (sigev_none) alarm->node.expires = expires; else @@ -670,7 +670,7 @@ static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp if (!alarmtimer_get_rtcdev()) return -EINVAL; - *tp = ktime_to_timespec64(base->gettime()); + *tp = ktime_to_timespec64(base->get_ktime()); return 0; } @@ -747,7 +747,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, struct timespec64 rmt; ktime_t rem; - rem = ktime_sub(absexp, alarm_bases[type].gettime()); + rem = ktime_sub(absexp, alarm_bases[type].get_ktime()); if (rem <= 0) return 0; @@ -816,7 +816,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, exp = timespec64_to_ktime(*tsreq); /* Convert (if necessary) to absolute time */ if (flags != TIMER_ABSTIME) { - ktime_t now = alarm_bases[type].gettime(); + ktime_t now = alarm_bases[type].get_ktime(); exp = ktime_add_safe(now, exp); } @@ -882,9 +882,9 @@ static int __init alarmtimer_init(void) /* Initialize alarm bases */ alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; - alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real; + alarm_bases[ALARM_REALTIME].get_ktime = &ktime_get_real; alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME; - alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime; + alarm_bases[ALARM_BOOTTIME].get_ktime = &ktime_get_boottime; for (i = 0; i < ALARM_NUMTYPE; i++) { timerqueue_init_head(&alarm_bases[i].timerqueue); spin_lock_init(&alarm_bases[i].lock); -- cgit From 2f58bf909abf9670fa4e848b433dc12ba4c2a44e Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:26:57 +0000 Subject: alarmtimer: Provide get_timespec() callback The upcoming support for time namespaces requires to have access to: - The time in a task's time namespace for sys_clock_gettime() - The time in the root name space for common_timer_get() Wire up alarm bases with get_timespec(). 
Suggested-by: Thomas Gleixner Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-9-dima@arista.com --- kernel/time/alarmtimer.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 22b6f9b133b2..357be1fe6e1f 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -37,12 +37,14 @@ * @lock: Lock for syncrhonized access to the base * @timerqueue: Timerqueue head managing the list of events * @get_ktime: Function to read the time correlating to the base + * @get_timespec: Function to read the namespace time correlating to the base * @base_clockid: clockid for the base */ static struct alarm_base { spinlock_t lock; struct timerqueue_head timerqueue; ktime_t (*get_ktime)(void); + void (*get_timespec)(struct timespec64 *tp); clockid_t base_clockid; } alarm_bases[ALARM_NUMTYPE]; @@ -670,7 +672,8 @@ static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp if (!alarmtimer_get_rtcdev()) return -EINVAL; - *tp = ktime_to_timespec64(base->get_ktime()); + base->get_timespec(tp); + return 0; } @@ -883,8 +886,10 @@ static int __init alarmtimer_init(void) /* Initialize alarm bases */ alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; alarm_bases[ALARM_REALTIME].get_ktime = &ktime_get_real; + alarm_bases[ALARM_REALTIME].get_timespec = ktime_get_real_ts64; alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME; alarm_bases[ALARM_BOOTTIME].get_ktime = &ktime_get_boottime; + alarm_bases[ALARM_BOOTTIME].get_timespec = ktime_get_boottime_ts64; for (i = 0; i < ALARM_NUMTYPE; i++) { timerqueue_init_head(&alarm_bases[i].timerqueue); spin_lock_init(&alarm_bases[i].lock); -- cgit From 9c71a2e8a757bc6aee256bc97c6fb711144b0a0f Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:26:58 +0000 Subject: posix-clocks: Introduce
clock_get_ktime() callback The callsite in common_timer_get() already has a comment: /* * The timespec64 based conversion is suboptimal, but it's not * worth to implement yet another callback. */ kc->clock_get(timr->it_clock, &ts64); now = timespec64_to_ktime(ts64); The upcoming support for time namespaces requires to have access to: - The time in a task's time namespace for sys_clock_gettime() - The time in the root name space for common_timer_get() That adds a valid reason to finally implement a separate callback which returns the time in ktime_t format. Suggested-by: Thomas Gleixner Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-10-dima@arista.com --- kernel/time/alarmtimer.c | 19 ++++++++++++++++++- kernel/time/posix-timers.c | 26 +++++++++++++++++++++++++- kernel/time/posix-timers.h | 3 +++ 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 357be1fe6e1f..4d8c90546635 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -663,7 +663,7 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp * @which_clock: clockid * @tp: timespec to fill. * - * Provides the underlying alarm base time. + * Provides the underlying alarm base time in a task's time namespace. */ static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp) { @@ -677,6 +677,22 @@ static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp return 0; } +/** + * alarm_clock_get_ktime - posix clock_get_ktime interface + * @which_clock: clockid + * + * Provides the underlying alarm base time in the root namespace.
+ */ +static ktime_t alarm_clock_get_ktime(clockid_t which_clock) +{ + struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; + + if (!alarmtimer_get_rtcdev()) + return -EINVAL; + + return base->get_ktime(); +} + /** * alarm_timer_create - posix timer_create interface * @new_timer: k_itimer pointer to manage @@ -840,6 +856,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, const struct k_clock alarm_clock = { .clock_getres = alarm_clock_getres, + .clock_get_ktime = alarm_clock_get_ktime, .clock_get_timespec = alarm_clock_get_timespec, .timer_create = alarm_timer_create, .timer_set = common_timer_set, diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 68d4690cc225..a1f6b968c5d8 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -171,6 +171,11 @@ static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 return 0; } +static ktime_t posix_get_realtime_ktime(clockid_t which_clock) +{ + return ktime_get_real(); +} + /* Set clock_realtime */ static int posix_clock_realtime_set(const clockid_t which_clock, const struct timespec64 *tp) @@ -193,6 +198,11 @@ static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 return 0; } +static ktime_t posix_get_monotonic_ktime(clockid_t which_clock) +{ + return ktime_get(); +} + /* * Get monotonic-raw time for posix timers */ @@ -228,12 +238,22 @@ static int posix_get_boottime_timespec(const clockid_t which_clock, struct times return 0; } +static ktime_t posix_get_boottime_ktime(const clockid_t which_clock) +{ + return ktime_get_boottime(); +} + static int posix_get_tai_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_clocktai_ts64(tp); return 0; } +static ktime_t posix_get_tai_ktime(clockid_t which_clock) +{ + return ktime_get_clocktai(); +} + static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) { tp->tv_sec = 0; @@ -781,7 +801,7 @@ static void 
common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, * Posix magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they become CLOCK_MONOTONIC based under the * hood. See hrtimer_init(). Update timr->kclock, so the generic - * functions which use timr->kclock->clock_get_timespec() work. + * functions which use timr->kclock->clock_get_*() work. * * Note: it_clock stays unmodified, because the next timer_set() might * use ABSTIME, so it needs to switch back. @@ -1262,6 +1282,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, static const struct k_clock clock_realtime = { .clock_getres = posix_get_hrtimer_res, .clock_get_timespec = posix_get_realtime_timespec, + .clock_get_ktime = posix_get_realtime_ktime, .clock_set = posix_clock_realtime_set, .clock_adj = posix_clock_realtime_adj, .nsleep = common_nsleep, @@ -1280,6 +1301,7 @@ static const struct k_clock clock_realtime = { static const struct k_clock clock_monotonic = { .clock_getres = posix_get_hrtimer_res, .clock_get_timespec = posix_get_monotonic_timespec, + .clock_get_ktime = posix_get_monotonic_ktime, .nsleep = common_nsleep, .timer_create = common_timer_create, .timer_set = common_timer_set, @@ -1310,6 +1332,7 @@ static const struct k_clock clock_monotonic_coarse = { static const struct k_clock clock_tai = { .clock_getres = posix_get_hrtimer_res, + .clock_get_ktime = posix_get_tai_ktime, .clock_get_timespec = posix_get_tai_timespec, .nsleep = common_nsleep, .timer_create = common_timer_create, @@ -1326,6 +1349,7 @@ static const struct k_clock clock_tai = { static const struct k_clock clock_boottime = { .clock_getres = posix_get_hrtimer_res, + .clock_get_ktime = posix_get_boottime_ktime, .clock_get_timespec = posix_get_boottime_timespec, .nsleep = common_nsleep, .timer_create = common_timer_create, diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 070611b2c253..f32a2ebba9b8 100644 --- a/kernel/time/posix-timers.h +++ 
b/kernel/time/posix-timers.h @@ -6,8 +6,11 @@ struct k_clock { struct timespec64 *tp); int (*clock_set)(const clockid_t which_clock, const struct timespec64 *tp); + /* Returns the clock value in the current time namespace. */ int (*clock_get_timespec)(const clockid_t which_clock, struct timespec64 *tp); + /* Returns the clock value in the root time namespace. */ + ktime_t (*clock_get_ktime)(const clockid_t which_clock); int (*clock_adj)(const clockid_t which_clock, struct __kernel_timex *tx); int (*timer_create)(struct k_itimer *timer); int (*nsleep)(const clockid_t which_clock, int flags, -- cgit From 198fa445d5c4c1a1c6c1d39f962559f8d008e79d Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:26:59 +0000 Subject: posix-timers: Use clock_get_ktime() in common_timer_get() Now, when the clock_get_ktime() callback exists, the suboptimal timespec64-based conversion can be removed from common_timer_get(). Suggested-by: Thomas Gleixner Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-11-dima@arista.com --- kernel/time/posix-timers.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index a1f6b968c5d8..fe1de4f71ace 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -665,7 +665,6 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) { const struct k_clock *kc = timr->kclock; ktime_t now, remaining, iv; - struct timespec64 ts64; bool sig_none; sig_none = timr->it_sigev_notify == SIGEV_NONE; @@ -683,12 +682,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) return; } - /* - * The timespec64 based conversion is suboptimal, but it's not - * worth to implement yet another callback. 
- */ - kc->clock_get_timespec(timr->it_clock, &ts64); - now = timespec64_to_ktime(ts64); + now = kc->clock_get_ktime(timr->it_clock); /* * When a requeue is pending or this is a SIGEV_NONE timer move the -- cgit From 5a590f35add93c2bdf3ed83eee73111021679562 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:27:00 +0000 Subject: posix-clocks: Wire up clock_gettime() with timens offsets Adjust monotonic and boottime clocks with per-timens offsets. As a result, a process inside a time namespace will see timers and clocks corrected by the offsets that were set when the namespace was created. Note that applications usually go through vDSO to get time, which is not yet adjusted. Further changes will complete time namespace virtualisation with vDSO support. Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-12-dima@arista.com --- kernel/time/alarmtimer.c | 9 ++++++++- kernel/time/posix-stubs.c | 3 +++ kernel/time/posix-timers.c | 5 +++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 4d8c90546635..9a8e81bc4ec2 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "posix-timers.h" @@ -886,6 +887,12 @@ static struct platform_driver alarmtimer_driver = { } }; +static void get_boottime_timespec(struct timespec64 *tp) +{ + ktime_get_boottime_ts64(tp); + timens_add_boottime(tp); +} + /** * alarmtimer_init - Initialize alarm timer code * @@ -906,7 +913,7 @@ static int __init alarmtimer_init(void) alarm_bases[ALARM_REALTIME].get_timespec = ktime_get_real_ts64, alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME; alarm_bases[ALARM_BOOTTIME].get_ktime = &ktime_get_boottime; - alarm_bases[ALARM_BOOTTIME].get_timespec = ktime_get_boottime_ts64; + alarm_bases[ALARM_BOOTTIME].get_timespec = 
get_boottime_timespec; for (i = 0; i < ALARM_NUMTYPE; i++) { timerqueue_init_head(&alarm_bases[i].timerqueue); spin_lock_init(&alarm_bases[i].lock); diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 20c65a7d4e3a..bcbaa2045f5e 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER @@ -77,9 +78,11 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) break; case CLOCK_MONOTONIC: ktime_get_ts64(tp); + timens_add_monotonic(tp); break; case CLOCK_BOOTTIME: ktime_get_boottime_ts64(tp); + timens_add_boottime(tp); break; default: return -EINVAL; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index fe1de4f71ace..d26b915b227a 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "timekeeping.h" #include "posix-timers.h" @@ -195,6 +196,7 @@ static int posix_clock_realtime_adj(const clockid_t which_clock, static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_ts64(tp); + timens_add_monotonic(tp); return 0; } @@ -209,6 +211,7 @@ static ktime_t posix_get_monotonic_ktime(clockid_t which_clock) static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp) { ktime_get_raw_ts64(tp); + timens_add_monotonic(tp); return 0; } @@ -223,6 +226,7 @@ static int posix_get_monotonic_coarse(clockid_t which_clock, struct timespec64 *tp) { ktime_get_coarse_ts64(tp); + timens_add_monotonic(tp); return 0; } @@ -235,6 +239,7 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 * static int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp) { ktime_get_boottime_ts64(tp); + timens_add_boottime(tp); return 0; } -- cgit From 89dd8eecfe961fab4924dcd14f80cf2ab2820044 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 
2019 01:27:01 +0000 Subject: time: Add do_timens_ktime_to_host() helper The helper subtracts namespace's clock offset from the given time and ensures that the result is within [0, KTIME_MAX]. Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-13-dima@arista.com --- include/linux/time_namespace.h | 17 +++++++++++++++++ kernel/time/namespace.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index d7e3b4994e31..34ee110b5c35 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -59,6 +59,19 @@ static inline void timens_add_boottime(struct timespec64 *ts) *ts = timespec64_add(*ts, ns_offsets->boottime); } +ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, + struct timens_offsets *offsets); + +static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) +{ + struct time_namespace *ns = current->nsproxy->time_ns; + + if (likely(ns == &init_time_ns)) + return tim; + + return do_timens_ktime_to_host(clockid, tim, &ns->offsets); +} + #else static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { @@ -88,6 +101,10 @@ static inline int timens_on_fork(struct nsproxy *nsproxy, static inline void timens_add_monotonic(struct timespec64 *ts) { } static inline void timens_add_boottime(struct timespec64 *ts) { } +static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) +{ + return tim; +} #endif #endif /* _LINUX_TIMENS_H */ diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index c2a58e45fc4b..1a0fbaa5d2d4 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -16,6 +16,42 @@ #include #include +ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, + struct timens_offsets *ns_offsets) +{ + ktime_t offset; + + switch (clockid) { + 
case CLOCK_MONOTONIC: + offset = timespec64_to_ktime(ns_offsets->monotonic); + break; + case CLOCK_BOOTTIME: + case CLOCK_BOOTTIME_ALARM: + offset = timespec64_to_ktime(ns_offsets->boottime); + break; + default: + return tim; + } + + /* + * Check that @tim value is in [offset, KTIME_MAX + offset] + * and subtract offset. + */ + if (tim < offset) { + /* + * User can specify @tim *absolute* value - if it's lesser than + * the time namespace's offset - it's already expired. + */ + tim = 0; + } else { + tim = ktime_sub(tim, offset); + if (unlikely(tim > KTIME_MAX)) + tim = KTIME_MAX; + } + + return tim; +} + static struct ucounts *inc_time_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES); -- cgit From 6cd889d43c40b13f81a44c41896781ce70244769 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:27:02 +0000 Subject: timerfd: Make timerfd_settime() time namespace aware timerfd_settime() accepts an absolute value of the expiration time if TFD_TIMER_ABSTIME is specified. This value is in the task's time namespace and has to be converted to the host's time namespace. 
Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-14-dima@arista.com --- fs/timerfd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/timerfd.c b/fs/timerfd.c index ac7f59a58f94..c5509d2448e3 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -26,6 +26,7 @@ #include #include #include +#include struct timerfd_ctx { union { @@ -196,6 +197,8 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags, } if (texp != 0) { + if (flags & TFD_TIMER_ABSTIME) + texp = timens_ktime_to_host(clockid, texp); if (isalarm(ctx)) { if (flags & TFD_TIMER_ABSTIME) alarm_start(&ctx->t.alarm, texp); -- cgit From 7da8b3a44bb426a43670b3a97ed61085018a9d43 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:27:03 +0000 Subject: posix-timers: Make timer_settime() time namespace aware Wire timer_settime() syscall into time namespace virtualization. sys_timer_settime() calls the ktime->timer_set() callback. Right now, common_timer_set() is the only implementation for the callback. The user-supplied expiry value is converted from timespec64 to ktime and then timens_ktime_to_host() can be used to convert namespace's time to the host time. Inside a time namespace kernel's time differs by a fixed offset from a user-supplied time, but only absolute values (TIMER_ABSTIME) must be converted. 
Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-15-dima@arista.com --- kernel/time/posix-timers.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index d26b915b227a..473082b0b57f 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -885,6 +885,8 @@ int common_timer_set(struct k_itimer *timr, int flags, timr->it_interval = timespec64_to_ktime(new_setting->it_interval); expires = timespec64_to_ktime(new_setting->it_value); + if (flags & TIMER_ABSTIME) + expires = timens_ktime_to_host(timr->it_clock, expires); sigev_none = timr->it_sigev_notify == SIGEV_NONE; kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); -- cgit From 0b9b9a3b162e85e620e3598f1badc45b8a177492 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:27:04 +0000 Subject: alarmtimer: Make nanosleep() time namespace aware clock_nanosleep() accepts absolute values of expiration time when the TIMER_ABSTIME flag is set. This absolute value is inside the task's time namespace and has to be converted to the host's time. 
Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-16-dima@arista.com --- kernel/time/alarmtimer.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 9a8e81bc4ec2..b51b36e533c4 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -839,6 +839,8 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, ktime_t now = alarm_bases[type].get_ktime(); exp = ktime_add_safe(now, exp); + } else { + exp = timens_ktime_to_host(which_clock, exp); } ret = alarmtimer_do_nsleep(&alarm, exp, type); -- cgit From ea2d1f7fce0f18b67f915c00c6a7a6860116bc92 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:27:05 +0000 Subject: hrtimers: Prepare hrtimer_nanosleep() for time namespaces clock_nanosleep() accepts absolute values of expiration time when TIMER_ABSTIME flag is set. This absolute value is inside the task's time namespace, and has to be converted to the host's time. There is timens_ktime_to_host() helper for converting time, but it accepts ktime argument. As a preparation, make hrtimer_nanosleep() accept a clock value in ktime instead of timespec64. 
Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-17-dima@arista.com --- include/linux/hrtimer.h | 3 +-- kernel/time/hrtimer.c | 12 +++++++----- kernel/time/posix-stubs.c | 4 ++-- kernel/time/posix-timers.c | 4 +++- tools/perf/examples/bpf/5sec.c | 6 ++++-- 5 files changed, 17 insertions(+), 12 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 1f98b52118f0..15c8ac313678 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -508,8 +508,7 @@ static inline u64 hrtimer_forward_now(struct hrtimer *timer, /* Precise sleep: */ extern int nanosleep_copyout(struct restart_block *, struct timespec64 *); -extern long hrtimer_nanosleep(const struct timespec64 *rqtp, - const enum hrtimer_mode mode, +extern long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid); extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta, diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 8de90ea31280..d8b62f93fc8d 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1910,8 +1910,8 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) return ret; } -long hrtimer_nanosleep(const struct timespec64 *rqtp, - const enum hrtimer_mode mode, const clockid_t clockid) +long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, + const clockid_t clockid) { struct restart_block *restart; struct hrtimer_sleeper t; @@ -1923,7 +1923,7 @@ long hrtimer_nanosleep(const struct timespec64 *rqtp, slack = 0; hrtimer_init_sleeper_on_stack(&t, clockid, mode); - hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack); + hrtimer_set_expires_range_ns(&t.timer, rqtp, slack); ret = do_nanosleep(&t, mode); if (ret != -ERESTART_RESTARTBLOCK) goto out; @@ -1958,7 +1958,8 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, 
rqtp, current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, + CLOCK_MONOTONIC); } #endif @@ -1978,7 +1979,8 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, + CLOCK_MONOTONIC); } #endif diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index bcbaa2045f5e..5745a138f254 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -147,7 +147,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ? + return hrtimer_nanosleep(timespec64_to_ktime(t), flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } @@ -236,7 +236,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ? + return hrtimer_nanosleep(timespec64_to_ktime(t), flags & TIMER_ABSTIME ? 
HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 473082b0b57f..75fee6e39e5a 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1221,7 +1221,9 @@ SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock, static int common_nsleep(const clockid_t which_clock, int flags, const struct timespec64 *rqtp) { - return hrtimer_nanosleep(rqtp, flags & TIMER_ABSTIME ? + ktime_t texp = timespec64_to_ktime(*rqtp); + + return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } diff --git a/tools/perf/examples/bpf/5sec.c b/tools/perf/examples/bpf/5sec.c index b9c203219691..e6b6181c6dc6 100644 --- a/tools/perf/examples/bpf/5sec.c +++ b/tools/perf/examples/bpf/5sec.c @@ -41,9 +41,11 @@ #include -int probe(hrtimer_nanosleep, rqtp->tv_sec)(void *ctx, int err, long sec) +#define NSEC_PER_SEC 1000000000L + +int probe(hrtimer_nanosleep, rqtp)(void *ctx, int err, long long sec) { - return sec == 5; + return sec / NSEC_PER_SEC == 5ULL; } license(GPL); -- cgit From 1f9b37bfbb607a09d838c248843e63a2cafe1080 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:27:06 +0000 Subject: posix-timers: Make clock_nanosleep() time namespace aware clock_nanosleep() accepts absolute values of expiration time, if the TIMER_ABSTIME flag is set. This value is in the tasks time namespace, which has to be converted to the host time namespace. 
Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-18-dima@arista.com --- kernel/time/posix-stubs.c | 12 ++++++++++-- kernel/time/posix-timers.c | 17 +++++++++++++++-- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 5745a138f254..fcb3b21d8bdc 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -129,6 +129,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, struct __kernel_timespec __user *, rmtp) { struct timespec64 t; + ktime_t texp; switch (which_clock) { case CLOCK_REALTIME: @@ -147,7 +148,10 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return hrtimer_nanosleep(timespec64_to_ktime(t), flags & TIMER_ABSTIME ? + texp = timespec64_to_ktime(t); + if (flags & TIMER_ABSTIME) + texp = timens_ktime_to_host(which_clock, texp); + return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } @@ -218,6 +222,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, struct old_timespec32 __user *, rmtp) { struct timespec64 t; + ktime_t texp; switch (which_clock) { case CLOCK_REALTIME: @@ -236,7 +241,10 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return hrtimer_nanosleep(timespec64_to_ktime(t), flags & TIMER_ABSTIME ? + texp = timespec64_to_ktime(t); + if (flags & TIMER_ABSTIME) + texp = timens_ktime_to_host(which_clock, texp); + return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? 
HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 75fee6e39e5a..ff0eb30de346 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1228,6 +1228,19 @@ static int common_nsleep(const clockid_t which_clock, int flags, which_clock); } +static int common_nsleep_timens(const clockid_t which_clock, int flags, + const struct timespec64 *rqtp) +{ + ktime_t texp = timespec64_to_ktime(*rqtp); + + if (flags & TIMER_ABSTIME) + texp = timens_ktime_to_host(which_clock, texp); + + return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? + HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); +} + SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, const struct __kernel_timespec __user *, rqtp, struct __kernel_timespec __user *, rmtp) @@ -1305,7 +1318,7 @@ static const struct k_clock clock_monotonic = { .clock_getres = posix_get_hrtimer_res, .clock_get_timespec = posix_get_monotonic_timespec, .clock_get_ktime = posix_get_monotonic_ktime, - .nsleep = common_nsleep, + .nsleep = common_nsleep_timens, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, @@ -1354,7 +1367,7 @@ static const struct k_clock clock_boottime = { .clock_getres = posix_get_hrtimer_res, .clock_get_ktime = posix_get_boottime_ktime, .clock_get_timespec = posix_get_boottime_timespec, - .nsleep = common_nsleep, + .nsleep = common_nsleep_timens, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, -- cgit From 0efc8bb0bb5fdfd529a23073ee15478b5d5e3839 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 12 Nov 2019 01:27:07 +0000 Subject: fs/proc: Respect boottime inside time namespace for /proc/uptime Make sure that /proc/uptime is adjusted to the tasks time namespace. 
Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-19-dima@arista.com --- fs/proc/uptime.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index a4c2791ab70b..5a1b228964fb 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -5,6 +5,7 @@ #include #include #include +#include #include static int uptime_proc_show(struct seq_file *m, void *v) @@ -20,6 +21,8 @@ static int uptime_proc_show(struct seq_file *m, void *v) nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; ktime_get_boottime_ts64(&uptime); + timens_add_boottime(&uptime); + idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); idle.tv_nsec = rem; seq_printf(m, "%lu.%02lu %lu.%02lu\n", -- cgit From 6f74acfde20af1eb2178d0bd846bfd8f50b3be32 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 12 Nov 2019 01:27:08 +0000 Subject: x86/vdso: Restrict splitting VVAR VMA Forbid splitting VVAR VMA resulting in a stricter ABI and reducing the amount of corner-cases to consider while working further on VDSO time namespace support. As the offset from timens to VVAR page is computed at compile time, the pages in VVAR should stay together and not be partially mremap()'ed. 
Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-20-dima@arista.com --- arch/x86/entry/vdso/vma.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index f5937742b290..76cbe54e0c39 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -84,6 +84,18 @@ static int vdso_mremap(const struct vm_special_mapping *sm, return 0; } +static int vvar_mremap(const struct vm_special_mapping *sm, + struct vm_area_struct *new_vma) +{ + const struct vdso_image *image = new_vma->vm_mm->context.vdso_image; + unsigned long new_size = new_vma->vm_end - new_vma->vm_start; + + if (new_size != -image->sym_vvar_start) + return -EINVAL; + + return 0; +} + static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -136,6 +148,7 @@ static const struct vm_special_mapping vdso_mapping = { static const struct vm_special_mapping vvar_mapping = { .name = "[vvar]", .fault = vvar_fault, + .mremap = vvar_mremap, }; /* -- cgit From 660fd04f9317172ae90f414c68b18a26ae88a829 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Nov 2019 01:27:09 +0000 Subject: lib/vdso: Prepare for time namespace support To support time namespaces in the vdso with a minimal impact on regular non time namespace affected tasks, the namespace handling needs to be hidden in a slow path. The most obvious place is vdso_seq_begin(). If a task belongs to a time namespace then the VVAR page which contains the system wide vdso data is replaced with a namespace specific page which has the same layout as the VVAR page. That page has vdso_data->seq set to 1 to enforce the slow path and vdso_data->clock_mode set to VCLOCK_TIMENS to enforce the time namespace handling path. The extra check in the case that vdso_data->seq is odd, e.g. 
a concurrent update of the vdso data is in progress, is not really affecting regular tasks which are not part of a time namespace as the task is spin waiting for the update to finish and vdso_data->seq to become even again. If a time namespace task hits that code path, it invokes the corresponding time getter function which retrieves the real VVAR page, reads host time and then adds the offset for the requested clock which is stored in the special VVAR page. If VDSO time namespace support is disabled the whole magic is compiled out. Initial testing shows that the disabled case is almost identical to the host case which does not take the slow timens path. With the special timens page installed the performance hit is constant time and in the range of 5-7%. For the vdso functions which are not using the sequence count an unconditional check for vdso_data->clock_mode is added which switches to the real vdso when the clock_mode is VCLOCK_TIMENS. [avagin: Make do_hres_timens() work with raw clocks too: choose vdso_data pointer by CS_RAW offset.] Suggested-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-21-dima@arista.com --- include/linux/time.h | 6 ++ include/vdso/datapage.h | 19 ++++++- init/Kconfig | 1 + lib/vdso/Kconfig | 6 ++ lib/vdso/gettimeofday.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 169 insertions(+), 5 deletions(-) diff --git a/include/linux/time.h b/include/linux/time.h index 8e10b9dbd8c2..8ef5e5cc9f57 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -110,4 +110,10 @@ static inline bool itimerspec64_valid(const struct itimerspec64 *its) * Equivalent to !(time_before32(@t, @l) || time_after32(@t, @h)). 
*/ #define time_between32(t, l, h) ((u32)(h) - (u32)(l) >= (u32)(t) - (u32)(l)) + +struct timens_offset { + s64 sec; + u64 nsec; +}; + #endif diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h index 2e302c0f41f7..c5f347cc5e55 100644 --- a/include/vdso/datapage.h +++ b/include/vdso/datapage.h @@ -21,6 +21,8 @@ #define CS_RAW 1 #define CS_BASES (CS_RAW + 1) +#define VCLOCK_TIMENS UINT_MAX + /** * struct vdso_timestamp - basetime per clock_id * @sec: seconds @@ -48,6 +50,7 @@ struct vdso_timestamp { * @mult: clocksource multiplier * @shift: clocksource shift * @basetime[clock_id]: basetime per clock_id + * @offset[clock_id]: time namespace offset per clock_id * @tz_minuteswest: minutes west of Greenwich * @tz_dsttime: type of DST correction * @hrtimer_res: hrtimer resolution @@ -55,6 +58,17 @@ struct vdso_timestamp { * * vdso_data will be accessed by 64 bit and compat code at the same time * so we should be careful before modifying this structure. + * + * @basetime is used to store the base time for the system wide time getter + * VVAR page. + * + * @offset is used by the special time namespace VVAR pages which are + * installed instead of the real VVAR page. These namespace pages must set + * @seq to 1 and @clock_mode to VCLOCK_TIMENS to force the code into the + * time namespace slow path. The namespace aware functions retrieve the + * real system wide VVAR page, read host time and add the per clock offset. + * For clocks which are not affected by time namespace adjustment the + * offset must be zero. 
*/ struct vdso_data { u32 seq; @@ -65,7 +79,10 @@ struct vdso_data { u32 mult; u32 shift; - struct vdso_timestamp basetime[VDSO_BASES]; + union { + struct vdso_timestamp basetime[VDSO_BASES]; + struct timens_offset offset[VDSO_BASES]; + }; s32 tz_minuteswest; s32 tz_dsttime; diff --git a/init/Kconfig b/init/Kconfig index b34314fc75f7..9b7f144a6d35 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1082,6 +1082,7 @@ config UTS_NS config TIME_NS bool "TIME namespace" + depends on GENERIC_VDSO_TIME_NS default y help In this namespace boottime and monotonic clocks can be set. diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig index 9fe698ff62ec..d883ac299508 100644 --- a/lib/vdso/Kconfig +++ b/lib/vdso/Kconfig @@ -24,4 +24,10 @@ config GENERIC_COMPAT_VDSO help This config option enables the compat VDSO layer. +config GENERIC_VDSO_TIME_NS + bool + help + Selected by architectures which support time namespaces in the + VDSO + endif diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index b453d2469b63..f342ac1fce77 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -38,15 +38,89 @@ u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult) } #endif +#ifdef CONFIG_TIME_NS +static int do_hres_timens(const struct vdso_data *vdns, clockid_t clk, + struct __kernel_timespec *ts) +{ + const struct vdso_data *vd = __arch_get_timens_vdso_data(); + const struct timens_offset *offs = &vdns->offset[clk]; + const struct vdso_timestamp *vdso_ts; + u64 cycles, last, ns; + u32 seq; + s64 sec; + + if (clk != CLOCK_MONOTONIC_RAW) + vd = &vd[CS_HRES_COARSE]; + else + vd = &vd[CS_RAW]; + vdso_ts = &vd->basetime[clk]; + + do { + seq = vdso_read_begin(vd); + cycles = __arch_get_hw_counter(vd->clock_mode); + ns = vdso_ts->nsec; + last = vd->cycle_last; + if (unlikely((s64)cycles < 0)) + return -1; + + ns += vdso_calc_delta(cycles, last, vd->mask, vd->mult); + ns >>= vd->shift; + sec = vdso_ts->sec; + } while (unlikely(vdso_read_retry(vd, seq))); + + /* Add the 
namespace offset */ + sec += offs->sec; + ns += offs->nsec; + + /* + * Do this outside the loop: a race inside the loop could result + * in __iter_div_u64_rem() being extremely slow. + */ + ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; + + return 0; +} +#else +static __always_inline const struct vdso_data *__arch_get_timens_vdso_data(void) +{ + return NULL; +} + +static int do_hres_timens(const struct vdso_data *vdns, clockid_t clk, + struct __kernel_timespec *ts) +{ + return -EINVAL; +} +#endif + static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk, - struct __kernel_timespec *ts) + struct __kernel_timespec *ts) { const struct vdso_timestamp *vdso_ts = &vd->basetime[clk]; u64 cycles, last, sec, ns; u32 seq; do { - seq = vdso_read_begin(vd); + /* + * Open coded to handle VCLOCK_TIMENS. Time namespace + * enabled tasks have a special VVAR page installed which + * has vd->seq set to 1 and vd->clock_mode set to + * VCLOCK_TIMENS. For non time namespace affected tasks + * this does not affect performance because if vd->seq is + * odd, i.e. a concurrent update is in progress the extra + * check for vd->clock_mode is just a few extra + * instructions while spin waiting for vd->seq to become + * even again. 
+ */ + while (unlikely((seq = READ_ONCE(vd->seq)) & 1)) { + if (IS_ENABLED(CONFIG_TIME_NS) && + vd->clock_mode == VCLOCK_TIMENS) + return do_hres_timens(vd, clk, ts); + cpu_relax(); + } + smp_rmb(); + cycles = __arch_get_hw_counter(vd->clock_mode); ns = vdso_ts->nsec; last = vd->cycle_last; @@ -68,6 +142,43 @@ static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk, return 0; } +#ifdef CONFIG_TIME_NS +static int do_coarse_timens(const struct vdso_data *vdns, clockid_t clk, + struct __kernel_timespec *ts) +{ + const struct vdso_data *vd = __arch_get_timens_vdso_data(); + const struct vdso_timestamp *vdso_ts = &vd->basetime[clk]; + const struct timens_offset *offs = &vdns->offset[clk]; + u64 nsec; + s64 sec; + s32 seq; + + do { + seq = vdso_read_begin(vd); + sec = vdso_ts->sec; + nsec = vdso_ts->nsec; + } while (unlikely(vdso_read_retry(vd, seq))); + + /* Add the namespace offset */ + sec += offs->sec; + nsec += offs->nsec; + + /* + * Do this outside the loop: a race inside the loop could result + * in __iter_div_u64_rem() being extremely slow. + */ + ts->tv_sec = sec + __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec); + ts->tv_nsec = nsec; + return 0; +} +#else +static int do_coarse_timens(const struct vdso_data *vdns, clockid_t clk, + struct __kernel_timespec *ts) +{ + return -1; +} +#endif + static __always_inline int do_coarse(const struct vdso_data *vd, clockid_t clk, struct __kernel_timespec *ts) { @@ -75,7 +186,18 @@ static __always_inline int do_coarse(const struct vdso_data *vd, clockid_t clk, u32 seq; do { - seq = vdso_read_begin(vd); + /* + * Open coded to handle VCLOCK_TIMENS. See comment in + * do_hres(). 
+ */ + while ((seq = READ_ONCE(vd->seq)) & 1) { + if (IS_ENABLED(CONFIG_TIME_NS) && + vd->clock_mode == VCLOCK_TIMENS) + return do_coarse_timens(vd, clk, ts); + cpu_relax(); + } + smp_rmb(); + ts->tv_sec = vdso_ts->sec; ts->tv_nsec = vdso_ts->nsec; } while (unlikely(vdso_read_retry(vd, seq))); @@ -156,6 +278,10 @@ __cvdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz) } if (unlikely(tz != NULL)) { + if (IS_ENABLED(CONFIG_TIME_NS) && + vd->clock_mode == VCLOCK_TIMENS) + vd = __arch_get_timens_vdso_data(); + tz->tz_minuteswest = vd[CS_HRES_COARSE].tz_minuteswest; tz->tz_dsttime = vd[CS_HRES_COARSE].tz_dsttime; } @@ -167,7 +293,12 @@ __cvdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz) static __maybe_unused __kernel_old_time_t __cvdso_time(__kernel_old_time_t *time) { const struct vdso_data *vd = __arch_get_vdso_data(); - __kernel_old_time_t t = READ_ONCE(vd[CS_HRES_COARSE].basetime[CLOCK_REALTIME].sec); + __kernel_old_time_t t; + + if (IS_ENABLED(CONFIG_TIME_NS) && vd->clock_mode == VCLOCK_TIMENS) + vd = __arch_get_timens_vdso_data(); + + t = READ_ONCE(vd[CS_HRES_COARSE].basetime[CLOCK_REALTIME].sec); if (time) *time = t; @@ -189,6 +320,9 @@ int __cvdso_clock_getres_common(clockid_t clock, struct __kernel_timespec *res) if (unlikely((u32) clock >= MAX_CLOCKS)) return -1; + if (IS_ENABLED(CONFIG_TIME_NS) && vd->clock_mode == VCLOCK_TIMENS) + vd = __arch_get_timens_vdso_data(); + hrtimer_res = READ_ONCE(vd[CS_HRES_COARSE].hrtimer_res); /* * Convert the clockid to a bitmask and use it to check which -- cgit From 64b302ab66c5965702693e79690823ca120288b9 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 12 Nov 2019 01:27:10 +0000 Subject: x86/vdso: Provide vdso_data offset on vvar_page VDSO support for time namespaces needs to set up a page with the same layout as VVAR. That timens page will be placed on position of VVAR page inside namespace. 
That page has vdso_data->seq set to 1 to enforce the slow path and vdso_data->clock_mode set to VCLOCK_TIMENS to enforce the time namespace handling path. To prepare the time namespace page the kernel needs to know the vdso_data offset. Provide arch_get_vdso_data() helper for locating vdso_data on VVAR page. Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-22-dima@arista.com --- arch/x86/entry/vdso/vdso-layout.lds.S | 2 -- arch/x86/entry/vdso/vma.c | 11 +++++++++++ arch/x86/include/asm/vvar.h | 8 ++++---- arch/x86/kernel/vmlinux.lds.S | 4 +--- include/linux/time_namespace.h | 1 + 5 files changed, 17 insertions(+), 9 deletions(-) diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index 93c6dc7812d0..2330daad67c3 100644 --- a/arch/x86/entry/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -21,9 +21,7 @@ SECTIONS /* Place all vvars at the offsets in asm/vvar.h. 
*/ #define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset; -#define __VVAR_KERNEL_LDS #include -#undef __VVAR_KERNEL_LDS #undef EMIT_VVAR pvclock_page = vvar_start + PAGE_SIZE; diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 76cbe54e0c39..04e3498c6c41 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -24,6 +24,17 @@ #include #include +#undef _ASM_X86_VVAR_H +#define EMIT_VVAR(name, offset) \ + const size_t name ## _offset = offset; +#include + +struct vdso_data *arch_get_vdso_data(void *vvar_page) +{ + return (struct vdso_data *)(vvar_page + _vdso_data_offset); +} +#undef EMIT_VVAR + #if defined(CONFIG_X86_64) unsigned int __read_mostly vdso64_enabled = 1; #endif diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index 32f5d9a0b90e..ff2de3025388 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -19,10 +19,10 @@ #ifndef _ASM_X86_VVAR_H #define _ASM_X86_VVAR_H -#if defined(__VVAR_KERNEL_LDS) - -/* The kernel linker script defines its own magic to put vvars in the - * right place. +#ifdef EMIT_VVAR +/* + * EMIT_VVAR() is used by the kernel linker script to put vvars in the + * right place. Also, it's used by kernel code to import offsets values. */ #define DECLARE_VVAR(offset, type, name) \ EMIT_VVAR(name, offset) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 3a1a819da137..e3296aa028fe 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -193,12 +193,10 @@ SECTIONS __vvar_beginning_hack = .; /* Place all vvars at the offsets in asm/vvar.h. */ -#define EMIT_VVAR(name, offset) \ +#define EMIT_VVAR(name, offset) \ . 
= __vvar_beginning_hack + offset; \ *(.vvar_ ## name) -#define __VVAR_KERNEL_LDS #include -#undef __VVAR_KERNEL_LDS #undef EMIT_VVAR /* diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 34ee110b5c35..063a343d1d78 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -39,6 +39,7 @@ struct time_namespace *copy_time_ns(unsigned long flags, struct time_namespace *old_ns); void free_time_ns(struct kref *kref); int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk); +struct vdso_data *arch_get_vdso_data(void *vvar_page); static inline void put_time_ns(struct time_namespace *ns) { -- cgit From 550a77a74c87ecfdadc2214fef4b25ff125f65ab Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 12 Nov 2019 01:27:11 +0000 Subject: x86/vdso: Add time namespace page To support time namespaces in the VDSO with a minimal impact on regular non time namespace affected tasks, the namespace handling needs to be hidden in a slow path. The most obvious place is vdso_seq_begin(). If a task belongs to a time namespace then the VVAR page which contains the system wide VDSO data is replaced with a namespace specific page which has the same layout as the VVAR page. That page has vdso_data->seq set to 1 to enforce the slow path and vdso_data->clock_mode set to VCLOCK_TIMENS to enforce the time namespace handling path. The extra check in the case that vdso_data->seq is odd, e.g. a concurrent update of the VDSO data is in progress, is not really affecting regular tasks which are not part of a time namespace as the task is spin waiting for the update to finish and vdso_data->seq to become even again. If a time namespace task hits that code path, it invokes the corresponding time getter function which retrieves the real VVAR page, reads host time and then adds the offset for the requested clock which is stored in the special VVAR page. Allocate the time namespace page among VVAR pages and place vdso_data on it. 
Provide __arch_get_timens_vdso_data() helper for VDSO code to get the code-relative position of VVARs on that special page. Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-23-dima@arista.com --- arch/x86/Kconfig | 1 + arch/x86/entry/vdso/vdso-layout.lds.S | 11 +++++++++-- arch/x86/entry/vdso/vdso2c.c | 3 +++ arch/x86/include/asm/vdso.h | 1 + arch/x86/include/asm/vdso/gettimeofday.h | 8 ++++++++ arch/x86/include/asm/vvar.h | 5 ++++- 6 files changed, 26 insertions(+), 3 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5e8949953660..a2488c372fa1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -124,6 +124,7 @@ config X86 select GENERIC_STRNLEN_USER select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY + select GENERIC_VDSO_TIME_NS select GUP_GET_PTE_LOW_HIGH if X86_PAE select HARDLOCKUP_CHECK_TIMESTAMP if X86_64 select HAVE_ACPI_APEI if ACPI diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index 2330daad67c3..ea7e0155c604 100644 --- a/arch/x86/entry/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -16,8 +16,8 @@ SECTIONS * segment. */ - vvar_start = . - 3 * PAGE_SIZE; - vvar_page = vvar_start; + vvar_start = . - 4 * PAGE_SIZE; + vvar_page = vvar_start; /* Place all vvars at the offsets in asm/vvar.h. */ #define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset; @@ -26,6 +26,13 @@ SECTIONS pvclock_page = vvar_start + PAGE_SIZE; hvclock_page = vvar_start + 2 * PAGE_SIZE; + timens_page = vvar_start + 3 * PAGE_SIZE; + +#undef _ASM_X86_VVAR_H + /* Place all vvars in timens too at the offsets in asm/vvar.h. */ +#define EMIT_VVAR(name, offset) timens_ ## name = timens_page + offset; +#include +#undef EMIT_VVAR . 
= SIZEOF_HEADERS; diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 3a4d8d4d39f8..3842873b3ae3 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -75,12 +75,14 @@ enum { sym_vvar_page, sym_pvclock_page, sym_hvclock_page, + sym_timens_page, }; const int special_pages[] = { sym_vvar_page, sym_pvclock_page, sym_hvclock_page, + sym_timens_page, }; struct vdso_sym { @@ -93,6 +95,7 @@ struct vdso_sym required_syms[] = { [sym_vvar_page] = {"vvar_page", true}, [sym_pvclock_page] = {"pvclock_page", true}, [sym_hvclock_page] = {"hvclock_page", true}, + [sym_timens_page] = {"timens_page", true}, {"VDSO32_NOTE_MASK", true}, {"__kernel_vsyscall", true}, {"__kernel_sigreturn", true}, diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 230474e2ddb5..bbcdc7b8f963 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -21,6 +21,7 @@ struct vdso_image { long sym_vvar_page; long sym_pvclock_page; long sym_hvclock_page; + long sym_timens_page; long sym_VDSO32_NOTE_MASK; long sym___kernel_sigreturn; long sym___kernel_rt_sigreturn; diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h index 52c3bcd672cf..6ee1f7dba34b 100644 --- a/arch/x86/include/asm/vdso/gettimeofday.h +++ b/arch/x86/include/asm/vdso/gettimeofday.h @@ -21,6 +21,7 @@ #include #define __vdso_data (VVAR(_vdso_data)) +#define __timens_vdso_data (TIMENS(_vdso_data)) #define VDSO_HAS_TIME 1 @@ -56,6 +57,13 @@ extern struct ms_hyperv_tsc_page hvclock_page __attribute__((visibility("hidden"))); #endif +#ifdef CONFIG_TIME_NS +static __always_inline const struct vdso_data *__arch_get_timens_vdso_data(void) +{ + return __timens_vdso_data; +} +#endif + #ifndef BUILD_VDSO32 static __always_inline diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index ff2de3025388..183e98e49ab9 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -33,9 
+33,12 @@ extern char __vvar_page; #define DECLARE_VVAR(offset, type, name) \ extern type vvar_ ## name[CS_BASES] \ - __attribute__((visibility("hidden"))); + __attribute__((visibility("hidden"))); \ + extern type timens_ ## name[CS_BASES] \ + __attribute__((visibility("hidden"))); \ #define VVAR(name) (vvar_ ## name) +#define TIMENS(name) (timens_ ## name) #define DEFINE_VVAR(type, name) \ type name[CS_BASES] \ -- cgit From afaa7b5ac7c87479fb5a626f87d2157af30d6401 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 12 Nov 2019 01:27:12 +0000 Subject: time: Allocate per-timens vvar page VDSO support for Time namespace needs to set up a page with the same layout as VVAR. That timens page will be placed on position of VVAR page inside namespace. That page contains time namespace clock offsets and it has vdso_data->seq set to 1 to enforce the slow path and vdso_data->clock_mode set to VCLOCK_TIMENS to enforce the time namespace handling path. Allocate the timens page during namespace creation. Setup the offsets when the first task enters the ns and freeze them to guarantee the pace of monotonic/boottime clocks and to avoid breakage of applications. The design decision is to have a global offset_lock which is used during namespace offsets setup and to freeze offsets when the first task joins the new time namespace. That is better in terms of memory usage compared to having a per namespace mutex that's used only during the setup period. 
Suggested-by: Andy Lutomirski Based-on-work-by: Thomas Gleixner Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-24-dima@arista.com --- include/linux/time_namespace.h | 3 ++ kernel/time/namespace.c | 104 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 106 insertions(+), 1 deletion(-) diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 063a343d1d78..6b7767f7df4a 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -23,6 +23,9 @@ struct time_namespace { struct ucounts *ucounts; struct ns_common ns; struct timens_offsets offsets; + struct page *vvar_page; + /* If set prevents changing offsets after any task joined namespace. */ + bool frozen_offsets; } __randomize_layout; extern struct time_namespace init_time_ns; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 1a0fbaa5d2d4..d705c15d0273 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -16,6 +16,8 @@ #include #include +#include + ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, struct timens_offsets *ns_offsets) { @@ -90,16 +92,23 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, kref_init(&ns->kref); + ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!ns->vvar_page) + goto fail_free; + err = ns_alloc_inum(&ns->ns); if (err) - goto fail_free; + goto fail_free_page; ns->ucounts = ucounts; ns->ns.ops = &timens_operations; ns->user_ns = get_user_ns(user_ns); ns->offsets = old_ns->offsets; + ns->frozen_offsets = false; return ns; +fail_free_page: + __free_page(ns->vvar_page); fail_free: kfree(ns); fail_dec: @@ -128,6 +137,93 @@ struct time_namespace *copy_time_ns(unsigned long flags, return clone_time_ns(user_ns, old_ns); } +static struct timens_offset offset_from_ts(struct timespec64 off) +{ + struct timens_offset ret; + + ret.sec = 
off.tv_sec; + ret.nsec = off.tv_nsec; + + return ret; +} + +/* + * A time namespace VVAR page has the same layout as the VVAR page which + * contains the system wide VDSO data. + * + * For a normal task the VVAR pages are installed in the normal ordering: + * VVAR + * PVCLOCK + * HVCLOCK + * TIMENS <- Not really required + * + * Now for a timens task the pages are installed in the following order: + * TIMENS + * PVCLOCK + * HVCLOCK + * VVAR + * + * The check for vdso_data->clock_mode is in the unlikely path of + * the seq begin magic. So for the non-timens case most of the time + * 'seq' is even, so the branch is not taken. + * + * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check + * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the + * update to finish and for 'seq' to become even anyway. + * + * Timens page has vdso_data->clock_mode set to VCLOCK_TIMENS which enforces + * the time namespace handling path. + */ +static void timens_setup_vdso_data(struct vdso_data *vdata, + struct time_namespace *ns) +{ + struct timens_offset *offset = vdata->offset; + struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); + struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); + + vdata->seq = 1; + vdata->clock_mode = VCLOCK_TIMENS; + offset[CLOCK_MONOTONIC] = monotonic; + offset[CLOCK_MONOTONIC_RAW] = monotonic; + offset[CLOCK_MONOTONIC_COARSE] = monotonic; + offset[CLOCK_BOOTTIME] = boottime; + offset[CLOCK_BOOTTIME_ALARM] = boottime; +} + +/* + * Protects possibly multiple offsets writers racing each other + * and tasks entering the namespace. + */ +static DEFINE_MUTEX(offset_lock); + +static void timens_set_vvar_page(struct task_struct *task, + struct time_namespace *ns) +{ + struct vdso_data *vdata; + unsigned int i; + + if (ns == &init_time_ns) + return; + + /* Fast-path, taken by every task in namespace except the first. 
*/ + if (likely(ns->frozen_offsets)) + return; + + mutex_lock(&offset_lock); + /* Nothing to-do: vvar_page has been already initialized. */ + if (ns->frozen_offsets) + goto out; + + ns->frozen_offsets = true; + vdata = arch_get_vdso_data(page_address(ns->vvar_page)); + + for (i = 0; i < CS_BASES; i++) + timens_setup_vdso_data(&vdata[i], ns); + +out: + mutex_unlock(&offset_lock); +} + void free_time_ns(struct kref *kref) { struct time_namespace *ns; @@ -136,6 +232,7 @@ void free_time_ns(struct kref *kref) dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); + __free_page(ns->vvar_page); kfree(ns); } @@ -192,6 +289,8 @@ static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; + timens_set_vvar_page(current, ns); + get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; @@ -211,6 +310,8 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) if (nsproxy->time_ns == nsproxy->time_ns_for_children) return 0; + timens_set_vvar_page(tsk, ns); + get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; @@ -246,6 +347,7 @@ struct time_namespace init_time_ns = { .user_ns = &init_user_ns, .ns.inum = PROC_TIME_INIT_INO, .ns.ops = &timens_operations, + .frozen_offsets = true, }; static int __init time_ns_init(void) -- cgit From af34ebeb866fafc0a9a09dda51c52ccec007ace0 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 12 Nov 2019 01:27:13 +0000 Subject: x86/vdso: Handle faults on timens page If a task belongs to a time namespace then the VVAR page which contains the system wide VDSO data is replaced with a namespace specific page which has the same layout as the VVAR page. 
Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-25-dima@arista.com --- arch/x86/entry/vdso/vma.c | 54 +++++++++++++++++++++++++++++++++++++++++++++-- mm/mmap.c | 2 ++ 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 04e3498c6c41..e5f336112cb1 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -14,11 +14,14 @@ #include #include #include +#include + #include #include #include #include #include +#include #include #include #include @@ -107,10 +110,36 @@ static int vvar_mremap(const struct vm_special_mapping *sm, return 0; } +#ifdef CONFIG_TIME_NS +static struct page *find_timens_vvar_page(struct vm_area_struct *vma) +{ + if (likely(vma->vm_mm == current->mm)) + return current->nsproxy->time_ns->vvar_page; + + /* + * VM_PFNMAP | VM_IO protect .fault() handler from being called + * through interfaces like /proc/$pid/mem or + * process_vm_{readv,writev}() as long as there's no .access() + * in special_mapping_vmops(). 
+ * For more details see check_vma_flags() and __access_remote_vm() + */ + + WARN(1, "vvar_page accessed remotely"); + + return NULL; +} +#else +static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma) +{ + return NULL; +} +#endif + static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { const struct vdso_image *image = vma->vm_mm->context.vdso_image; + unsigned long pfn; long sym_offset; if (!image) @@ -130,8 +159,21 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, return VM_FAULT_SIGBUS; if (sym_offset == image->sym_vvar_page) { - return vmf_insert_pfn(vma, vmf->address, - __pa_symbol(&__vvar_page) >> PAGE_SHIFT); + struct page *timens_page = find_timens_vvar_page(vma); + + pfn = __pa_symbol(&__vvar_page) >> PAGE_SHIFT; + + /* + * If a task belongs to a time namespace then a namespace + * specific VVAR is mapped with the sym_vvar_page offset and + * the real VVAR page is mapped with the sym_timens_page + * offset. + * See also the comment near timens_setup_vdso_data(). 
+ */ + if (timens_page) + pfn = page_to_pfn(timens_page); + + return vmf_insert_pfn(vma, vmf->address, pfn); } else if (sym_offset == image->sym_pvclock_page) { struct pvclock_vsyscall_time_info *pvti = pvclock_get_pvti_cpu0_va(); @@ -146,6 +188,14 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, if (tsc_pg && vclock_was_used(VCLOCK_HVCLOCK)) return vmf_insert_pfn(vma, vmf->address, virt_to_phys(tsc_pg) >> PAGE_SHIFT); + } else if (sym_offset == image->sym_timens_page) { + struct page *timens_page = find_timens_vvar_page(vma); + + if (!timens_page) + return VM_FAULT_SIGBUS; + + pfn = __pa_symbol(&__vvar_page) >> PAGE_SHIFT; + return vmf_insert_pfn(vma, vmf->address, pfn); } return VM_FAULT_SIGBUS; diff --git a/mm/mmap.c b/mm/mmap.c index 9c648524e4dc..60c17d3c8762 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3342,6 +3342,8 @@ static const struct vm_operations_struct special_mapping_vmops = { .fault = special_mapping_fault, .mremap = special_mapping_mremap, .name = special_mapping_name, + /* vDSO code relies that VVAR can't be accessed remotely */ + .access = NULL, }; static const struct vm_operations_struct legacy_special_mapping_vmops = { -- cgit From e6b28ec65b6d433624a2c290073bc356c4fce914 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 12 Nov 2019 01:27:14 +0000 Subject: x86/vdso: On timens page fault prefault also VVAR page As timens page has offsets to data on VVAR page VVAR is going to be accessed shortly. Set it up with timens in one page fault as optimization. 
Suggested-by: Thomas Gleixner Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-26-dima@arista.com --- arch/x86/entry/vdso/vma.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index e5f336112cb1..d2fd8a57af7d 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -170,8 +170,23 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, * offset. * See also the comment near timens_setup_vdso_data(). */ - if (timens_page) + if (timens_page) { + unsigned long addr; + vm_fault_t err; + + /* + * Optimization: inside time namespace pre-fault + * VVAR page too. As on timens page there are only + * offsets for clocks on VVAR, it'll be faulted + * shortly by VDSO code. + */ + addr = vmf->address + (image->sym_timens_page - sym_offset); + err = vmf_insert_pfn(vma, addr, pfn); + if (unlikely(err & VM_FAULT_ERROR)) + return err; + pfn = page_to_pfn(timens_page); + } return vmf_insert_pfn(vma, vmf->address, pfn); } else if (sym_offset == image->sym_pvclock_page) { -- cgit From 70ddf65184ec1e8989322f35193e4fde7377f0cc Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 12 Nov 2019 01:27:15 +0000 Subject: x86/vdso: Zap vvar pages when switching to a time namespace The VVAR page layout depends on whether a task belongs to the root or non-root time namespace. Whenever a task changes its namespace, the VVAR page tables are cleared and then they will be re-faulted with a corresponding layout. 
Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-27-dima@arista.com --- arch/x86/entry/vdso/vma.c | 27 +++++++++++++++++++++++++++ include/linux/time_namespace.h | 9 +++++++++ kernel/time/namespace.c | 10 ++++++++++ 3 files changed, 46 insertions(+) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index d2fd8a57af7d..c1b8496b5606 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -51,6 +51,7 @@ void __init init_vdso_image(const struct vdso_image *image) image->alt_len)); } +static const struct vm_special_mapping vvar_mapping; struct linux_binprm; static vm_fault_t vdso_fault(const struct vm_special_mapping *sm, @@ -128,6 +129,32 @@ static struct page *find_timens_vvar_page(struct vm_area_struct *vma) return NULL; } + +/* + * The vvar page layout depends on whether a task belongs to the root or + * non-root time namespace. Whenever a task changes its namespace, the VVAR + * page tables are cleared and then they will be re-faulted with a + * corresponding layout. + * See also the comment near timens_setup_vdso_data() for details. 
+ */ +int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) +{ + struct mm_struct *mm = task->mm; + struct vm_area_struct *vma; + + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + unsigned long size = vma->vm_end - vma->vm_start; + + if (vma_is_special_mapping(vma, &vvar_mapping)) + zap_page_range(vma, vma->vm_start, size); + } + + up_write(&mm->mmap_sem); + return 0; +} #else static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma) { diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 6b7767f7df4a..04a2ba8b8a06 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -31,6 +31,9 @@ struct time_namespace { extern struct time_namespace init_time_ns; #ifdef CONFIG_TIME_NS +extern int vdso_join_timens(struct task_struct *task, + struct time_namespace *ns); + static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { kref_get(&ns->kref); @@ -77,6 +80,12 @@ static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) } #else +static inline int vdso_join_timens(struct task_struct *task, + struct time_namespace *ns) +{ + return 0; +} + static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { return NULL; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index d705c15d0273..0732964803b9 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -281,6 +281,7 @@ static void timens_put(struct ns_common *ns) static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) { struct time_namespace *ns = to_time_ns(new); + int err; if (!current_is_single_threaded()) return -EUSERS; @@ -291,6 +292,10 @@ static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) timens_set_vvar_page(current, ns); + err = vdso_join_timens(current, ns); + if (err) + return err; + get_time_ns(ns); put_time_ns(nsproxy->time_ns); 
nsproxy->time_ns = ns; @@ -305,6 +310,7 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) { struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; struct time_namespace *ns = to_time_ns(nsc); + int err; /* create_new_namespaces() already incremented the ref counter */ if (nsproxy->time_ns == nsproxy->time_ns_for_children) @@ -312,6 +318,10 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) timens_set_vvar_page(tsk, ns); + err = vdso_join_timens(tsk, ns); + if (err) + return err; + get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; -- cgit From 04a8682a71becdb639ec9c0d82b315a2baef7a5d Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:27:16 +0000 Subject: fs/proc: Introduce /proc/pid/timens_offsets API to set time namespace offsets for children processes, i.e.: echo "$clockid $offset_sec $offset_nsec" > /proc/self/timens_offsets Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-28-dima@arista.com --- fs/proc/base.c | 94 ++++++++++++++++++++++++++++++++++++++ include/linux/time_namespace.h | 10 ++++ kernel/time/namespace.c | 101 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 205 insertions(+) diff --git a/fs/proc/base.c b/fs/proc/base.c index ebea9501afb8..5adc6390ac3a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -94,6 +94,7 @@ #include #include #include +#include #include #include "internal.h" #include "fd.h" @@ -1533,6 +1534,96 @@ static const struct file_operations proc_pid_sched_autogroup_operations = { #endif /* CONFIG_SCHED_AUTOGROUP */ +#ifdef CONFIG_TIME_NS +static int timens_offsets_show(struct seq_file *m, void *v) +{ + struct task_struct *p; + + p = get_proc_task(file_inode(m->file)); + if (!p) + return -ESRCH; + proc_timens_show_offsets(p, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t timens_offsets_write(struct 
file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct inode *inode = file_inode(file); + struct proc_timens_offset offsets[2]; + char *kbuf = NULL, *pos, *next_line; + struct task_struct *p; + int ret, noffsets; + + /* Only allow < page size writes at the beginning of the file */ + if ((*ppos != 0) || (count >= PAGE_SIZE)) + return -EINVAL; + + /* Slurp in the user data */ + kbuf = memdup_user_nul(buf, count); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + /* Parse the user data */ + ret = -EINVAL; + noffsets = 0; + for (pos = kbuf; pos; pos = next_line) { + struct proc_timens_offset *off = &offsets[noffsets]; + int err; + + /* Find the end of line and ensure we don't look past it */ + next_line = strchr(pos, '\n'); + if (next_line) { + *next_line = '\0'; + next_line++; + if (*next_line == '\0') + next_line = NULL; + } + + err = sscanf(pos, "%u %lld %lu", &off->clockid, + &off->val.tv_sec, &off->val.tv_nsec); + if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC) + goto out; + noffsets++; + if (noffsets == ARRAY_SIZE(offsets)) { + if (next_line) + count = next_line - kbuf; + break; + } + } + + ret = -ESRCH; + p = get_proc_task(inode); + if (!p) + goto out; + ret = proc_timens_set_offset(file, p, offsets, noffsets); + put_task_struct(p); + if (ret) + goto out; + + ret = count; +out: + kfree(kbuf); + return ret; +} + +static int timens_offsets_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, timens_offsets_show, inode); +} + +static const struct file_operations proc_timens_offsets_operations = { + .open = timens_offsets_open, + .read = seq_read, + .write = timens_offsets_write, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_TIME_NS */ + static ssize_t comm_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { @@ -3015,6 +3106,9 @@ static const struct pid_entry tgid_base_stuff[] = { #endif #ifdef CONFIG_SCHED_AUTOGROUP REG("autogroup", S_IRUGO|S_IWUSR, 
proc_pid_sched_autogroup_operations), +#endif +#ifdef CONFIG_TIME_NS + REG("timens_offsets", S_IRUGO|S_IWUSR, proc_timens_offsets_operations), #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 04a2ba8b8a06..824d54e057eb 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -52,6 +52,16 @@ static inline void put_time_ns(struct time_namespace *ns) kref_put(&ns->kref, free_time_ns); } +void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m); + +struct proc_timens_offset { + int clockid; + struct timespec64 val; +}; + +int proc_timens_set_offset(struct file *file, struct task_struct *p, + struct proc_timens_offset *offsets, int n); + static inline void timens_add_monotonic(struct timespec64 *ts) { struct timens_offsets *ns_offsets = &current->nsproxy->time_ns->offsets; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 0732964803b9..12858507d75a 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -334,6 +335,106 @@ static struct user_namespace *timens_owner(struct ns_common *ns) return to_time_ns(ns)->user_ns; } +static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts) +{ + seq_printf(m, "%d %lld %ld\n", clockid, ts->tv_sec, ts->tv_nsec); +} + +void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m) +{ + struct ns_common *ns; + struct time_namespace *time_ns; + + ns = timens_for_children_get(p); + if (!ns) + return; + time_ns = to_time_ns(ns); + + show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic); + show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime); + put_time_ns(time_ns); +} + +int proc_timens_set_offset(struct file *file, struct task_struct *p, + struct proc_timens_offset *offsets, int noffsets) +{ + struct ns_common *ns;
+ struct time_namespace *time_ns; + struct timespec64 tp; + int i, err; + + ns = timens_for_children_get(p); + if (!ns) + return -ESRCH; + time_ns = to_time_ns(ns); + + if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) { + put_time_ns(time_ns); + return -EPERM; + } + + for (i = 0; i < noffsets; i++) { + struct proc_timens_offset *off = &offsets[i]; + + switch (off->clockid) { + case CLOCK_MONOTONIC: + ktime_get_ts64(&tp); + break; + case CLOCK_BOOTTIME: + ktime_get_boottime_ts64(&tp); + break; + default: + err = -EINVAL; + goto out; + } + + err = -ERANGE; + + if (off->val.tv_sec > KTIME_SEC_MAX || + off->val.tv_sec < -KTIME_SEC_MAX) + goto out; + + tp = timespec64_add(tp, off->val); + /* + * KTIME_SEC_MAX is divided by 2 to be sure that KTIME_MAX is + * still unreachable. + */ + if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2) + goto out; + } + + mutex_lock(&offset_lock); + if (time_ns->frozen_offsets) { + err = -EACCES; + goto out_unlock; + } + + err = 0; + /* Don't report errors after this line */ + for (i = 0; i < noffsets; i++) { + struct proc_timens_offset *off = &offsets[i]; + struct timespec64 *offset = NULL; + + switch (off->clockid) { + case CLOCK_MONOTONIC: + offset = &time_ns->offsets.monotonic; + break; + case CLOCK_BOOTTIME: + offset = &time_ns->offsets.boottime; + break; + } + + *offset = off->val; + } + +out_unlock: + mutex_unlock(&offset_lock); +out: + put_time_ns(time_ns); + + return err; +} + const struct proc_ns_operations timens_operations = { .name = "time", .type = CLONE_NEWTIME, -- cgit From 61c57676035df29a0a61991f4389e884ba0b68d7 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 12 Nov 2019 01:27:17 +0000 Subject: selftests/timens: Add Time Namespace test for supported clocks A test to check that all supported clocks work on host and inside a new time namespace. Use both ways to get time: through VDSO and by entering the kernel with implicit syscall. 
Introduce a new timens directory in selftests framework for the next timens tests. Output on success: 1..10 ok 1 Passed for CLOCK_BOOTTIME (syscall) ok 2 Passed for CLOCK_BOOTTIME (vdso) ok 3 Passed for CLOCK_BOOTTIME_ALARM (syscall) ok 4 Passed for CLOCK_BOOTTIME_ALARM (vdso) ok 5 Passed for CLOCK_MONOTONIC (syscall) ok 6 Passed for CLOCK_MONOTONIC (vdso) ok 7 Passed for CLOCK_MONOTONIC_COARSE (syscall) ok 8 Passed for CLOCK_MONOTONIC_COARSE (vdso) ok 9 Passed for CLOCK_MONOTONIC_RAW (syscall) ok 10 Passed for CLOCK_MONOTONIC_RAW (vdso) # Pass 10 Fail 0 Xfail 0 Xpass 0 Skip 0 Error 0 Output with lack of permissions: 1..10 not ok 1 # SKIP need to run as root Output without support of time namespaces: 1..10 not ok 1 # SKIP Time namespaces are not supported Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-29-dima@arista.com --- tools/testing/selftests/Makefile | 1 + tools/testing/selftests/timens/.gitignore | 1 + tools/testing/selftests/timens/Makefile | 6 + tools/testing/selftests/timens/config | 1 + tools/testing/selftests/timens/log.h | 26 ++++ tools/testing/selftests/timens/timens.c | 190 ++++++++++++++++++++++++++++++ tools/testing/selftests/timens/timens.h | 100 ++++++++++++++++ 7 files changed, 325 insertions(+) create mode 100644 tools/testing/selftests/timens/.gitignore create mode 100644 tools/testing/selftests/timens/Makefile create mode 100644 tools/testing/selftests/timens/config create mode 100644 tools/testing/selftests/timens/log.h create mode 100644 tools/testing/selftests/timens/timens.c create mode 100644 tools/testing/selftests/timens/timens.h diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index b001c602414b..c4939a2a5f5d 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -50,6 +50,7 @@ TARGETS += splice TARGETS += static_keys TARGETS += sync TARGETS += 
sysctl +TARGETS += timens ifneq (1, $(quicktest)) TARGETS += timers endif diff --git a/tools/testing/selftests/timens/.gitignore b/tools/testing/selftests/timens/.gitignore new file mode 100644 index 000000000000..27a693229ce1 --- /dev/null +++ b/tools/testing/selftests/timens/.gitignore @@ -0,0 +1 @@ +timens diff --git a/tools/testing/selftests/timens/Makefile b/tools/testing/selftests/timens/Makefile new file mode 100644 index 000000000000..49a9dcc26c3a --- /dev/null +++ b/tools/testing/selftests/timens/Makefile @@ -0,0 +1,6 @@ +TEST_GEN_PROGS := timens + +CFLAGS := -Wall -Werror +LDFLAGS := -lrt + +include ../lib.mk diff --git a/tools/testing/selftests/timens/config b/tools/testing/selftests/timens/config new file mode 100644 index 000000000000..4480620f6f49 --- /dev/null +++ b/tools/testing/selftests/timens/config @@ -0,0 +1 @@ +CONFIG_TIME_NS=y diff --git a/tools/testing/selftests/timens/log.h b/tools/testing/selftests/timens/log.h new file mode 100644 index 000000000000..db64df2a8483 --- /dev/null +++ b/tools/testing/selftests/timens/log.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __SELFTEST_TIMENS_LOG_H__ +#define __SELFTEST_TIMENS_LOG_H__ + +#define pr_msg(fmt, lvl, ...) \ + ksft_print_msg("[%s] (%s:%d)\t" fmt "\n", \ + lvl, __FILE__, __LINE__, ##__VA_ARGS__) + +#define pr_p(func, fmt, ...) func(fmt ": %m", ##__VA_ARGS__) + +#define pr_err(fmt, ...) \ + ({ \ + ksft_test_result_error(fmt "\n", ##__VA_ARGS__); \ + -1; \ + }) + +#define pr_fail(fmt, ...) \ + ({ \ + ksft_test_result_fail(fmt, ##__VA_ARGS__); \ + -1; \ + }) + +#define pr_perror(fmt, ...) 
pr_p(pr_err, fmt, ##__VA_ARGS__) + +#endif diff --git a/tools/testing/selftests/timens/timens.c b/tools/testing/selftests/timens/timens.c new file mode 100644 index 000000000000..559d26e21ba0 --- /dev/null +++ b/tools/testing/selftests/timens/timens.c @@ -0,0 +1,190 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "log.h" +#include "timens.h" + +/* + * Test shouldn't be run for a day, so add 10 days to child + * time and check parent's time to be in the same day. + */ +#define DAY_IN_SEC (60*60*24) +#define TEN_DAYS_IN_SEC (10*DAY_IN_SEC) + +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) + +struct test_clock { + clockid_t id; + char *name; + /* + * off_id is -1 if a clock has own offset, or it contains an index + * which contains a right offset of this clock. + */ + int off_id; + time_t offset; +}; + +#define ct(clock, off_id) { clock, #clock, off_id } +static struct test_clock clocks[] = { + ct(CLOCK_BOOTTIME, -1), + ct(CLOCK_BOOTTIME_ALARM, 1), + ct(CLOCK_MONOTONIC, -1), + ct(CLOCK_MONOTONIC_COARSE, 1), + ct(CLOCK_MONOTONIC_RAW, 1), +}; +#undef ct + +static int child_ns, parent_ns = -1; + +static int switch_ns(int fd) +{ + if (setns(fd, CLONE_NEWTIME)) { + pr_perror("setns()"); + return -1; + } + + return 0; +} + +static int init_namespaces(void) +{ + char path[] = "/proc/self/ns/time_for_children"; + struct stat st1, st2; + + if (parent_ns == -1) { + parent_ns = open(path, O_RDONLY); + if (parent_ns <= 0) + return pr_perror("Unable to open %s", path); + } + + if (fstat(parent_ns, &st1)) + return pr_perror("Unable to stat the parent timens"); + + if (unshare_timens()) + return -1; + + child_ns = open(path, O_RDONLY); + if (child_ns <= 0) + return pr_perror("Unable to open %s", path); + + if (fstat(child_ns, &st2)) + return pr_perror("Unable to stat the timens"); + + if (st1.st_ino == st2.st_ino) + return 
pr_perror("The same child_ns after CLONE_NEWTIME"); + + return 0; +} + +static int test_gettime(clockid_t clock_index, bool raw_syscall, time_t offset) +{ + struct timespec child_ts_new, parent_ts_old, cur_ts; + char *entry = raw_syscall ? "syscall" : "vdso"; + double precision = 0.0; + + if (check_skip(clocks[clock_index].id)) + return 0; + + switch (clocks[clock_index].id) { + case CLOCK_MONOTONIC_COARSE: + case CLOCK_MONOTONIC_RAW: + precision = -2.0; + break; + } + + if (switch_ns(parent_ns)) + return pr_err("switch_ns(%d)", child_ns); + + if (_gettime(clocks[clock_index].id, &parent_ts_old, raw_syscall)) + return -1; + + child_ts_new.tv_nsec = parent_ts_old.tv_nsec; + child_ts_new.tv_sec = parent_ts_old.tv_sec + offset; + + if (switch_ns(child_ns)) + return pr_err("switch_ns(%d)", child_ns); + + if (_gettime(clocks[clock_index].id, &cur_ts, raw_syscall)) + return -1; + + if (difftime(cur_ts.tv_sec, child_ts_new.tv_sec) < precision) { + ksft_test_result_fail( + "Child's %s (%s) time has not changed: %lu -> %lu [%lu]\n", + clocks[clock_index].name, entry, parent_ts_old.tv_sec, + child_ts_new.tv_sec, cur_ts.tv_sec); + return -1; + } + + if (switch_ns(parent_ns)) + return pr_err("switch_ns(%d)", parent_ns); + + if (_gettime(clocks[clock_index].id, &cur_ts, raw_syscall)) + return -1; + + if (difftime(cur_ts.tv_sec, parent_ts_old.tv_sec) > DAY_IN_SEC) { + ksft_test_result_fail( + "Parent's %s (%s) time has changed: %lu -> %lu [%lu]\n", + clocks[clock_index].name, entry, parent_ts_old.tv_sec, + child_ts_new.tv_sec, cur_ts.tv_sec); + /* Let's play nice and put it closer to original */ + clock_settime(clocks[clock_index].id, &cur_ts); + return -1; + } + + ksft_test_result_pass("Passed for %s (%s)\n", + clocks[clock_index].name, entry); + return 0; +} + +int main(int argc, char *argv[]) +{ + unsigned int i; + time_t offset; + int ret = 0; + + nscheck(); + + check_config_posix_timers(); + + ksft_set_plan(ARRAY_SIZE(clocks) * 2); + + if (init_namespaces()) + return 1; + + 
/* Offsets have to be set before tasks enter the namespace. */ + for (i = 0; i < ARRAY_SIZE(clocks); i++) { + if (clocks[i].off_id != -1) + continue; + offset = TEN_DAYS_IN_SEC + i * 1000; + clocks[i].offset = offset; + if (_settime(clocks[i].id, offset)) + return 1; + } + + for (i = 0; i < ARRAY_SIZE(clocks); i++) { + if (clocks[i].off_id != -1) + offset = clocks[clocks[i].off_id].offset; + else + offset = clocks[i].offset; + ret |= test_gettime(i, true, offset); + ret |= test_gettime(i, false, offset); + } + + if (ret) + ksft_exit_fail(); + + ksft_exit_pass(); + return !!ret; +} diff --git a/tools/testing/selftests/timens/timens.h b/tools/testing/selftests/timens/timens.h new file mode 100644 index 000000000000..e09e7e39bc52 --- /dev/null +++ b/tools/testing/selftests/timens/timens.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __TIMENS_H__ +#define __TIMENS_H__ + +#include +#include +#include +#include + +#include "../kselftest.h" + +#ifndef CLONE_NEWTIME +# define CLONE_NEWTIME 0x00000080 +#endif + +static int config_posix_timers = true; + +static inline void check_config_posix_timers(void) +{ + if (timer_create(-1, 0, 0) == -1 && errno == ENOSYS) + config_posix_timers = false; +} + +static inline bool check_skip(int clockid) +{ + if (config_posix_timers) + return false; + + switch (clockid) { + /* Only these clocks are supported without CONFIG_POSIX_TIMERS. 
*/ + case CLOCK_BOOTTIME: + case CLOCK_MONOTONIC: + case CLOCK_REALTIME: + return false; + default: + ksft_test_result_skip("Posix Clocks & timers are not supported\n"); + return true; + } + + return false; +} + +static inline int unshare_timens(void) +{ + if (unshare(CLONE_NEWTIME)) { + if (errno == EPERM) + ksft_exit_skip("need to run as root\n"); + return pr_perror("Can't unshare() timens"); + } + return 0; +} + +static inline int _settime(clockid_t clk_id, time_t offset) +{ + int fd, len; + char buf[4096]; + + if (clk_id == CLOCK_MONOTONIC_COARSE || clk_id == CLOCK_MONOTONIC_RAW) + clk_id = CLOCK_MONOTONIC; + + len = snprintf(buf, sizeof(buf), "%d %ld 0", clk_id, offset); + + fd = open("/proc/self/timens_offsets", O_WRONLY); + if (fd < 0) + return pr_perror("/proc/self/timens_offsets"); + + if (write(fd, buf, len) != len) + return pr_perror("/proc/self/timens_offsets"); + + close(fd); + + return 0; +} + +static inline int _gettime(clockid_t clk_id, struct timespec *res, bool raw_syscall) +{ + int err; + + if (!raw_syscall) { + if (clock_gettime(clk_id, res)) { + pr_perror("clock_gettime(%d)", (int)clk_id); + return -1; + } + return 0; + } + + err = syscall(SYS_clock_gettime, clk_id, res); + if (err) + pr_perror("syscall(SYS_clock_gettime(%d))", (int)clk_id); + + return err; +} + +static inline void nscheck(void) +{ + if (access("/proc/self/ns/time", F_OK) < 0) + ksft_exit_skip("Time namespaces are not supported\n"); +} + +#endif -- cgit From 11873de3ce4d2fe289d51932c03b3668cf519186 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:27:18 +0000 Subject: selftests/timens: Add a test for timerfd Check that timerfd_create() takes into account clock offsets. Output on success: 1..3 ok 1 clockid=7 ok 2 clockid=1 ok 3 clockid=9 # Pass 3 Fail 0 Xfail 0 Xpass 0 Skip 0 Error 0 Output on failure: 1..3 not ok 1 clockid: 7 elapsed: 0 not ok 2 clockid: 1 elapsed: 0 not ok 3 clockid: 9 elapsed: 0 Bail out! 
Output with lack of permissions: 1..3 not ok 1 # SKIP need to run as root Output without support of time namespaces: 1..3 not ok 1 # SKIP Time namespaces are not supported Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-30-dima@arista.com --- tools/testing/selftests/timens/.gitignore | 1 + tools/testing/selftests/timens/Makefile | 2 +- tools/testing/selftests/timens/timerfd.c | 128 ++++++++++++++++++++++++++++++ 3 files changed, 130 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/timens/timerfd.c diff --git a/tools/testing/selftests/timens/.gitignore b/tools/testing/selftests/timens/.gitignore index 27a693229ce1..b609f6ee9fb9 100644 --- a/tools/testing/selftests/timens/.gitignore +++ b/tools/testing/selftests/timens/.gitignore @@ -1 +1,2 @@ timens +timerfd diff --git a/tools/testing/selftests/timens/Makefile b/tools/testing/selftests/timens/Makefile index 49a9dcc26c3a..293aed6be95e 100644 --- a/tools/testing/selftests/timens/Makefile +++ b/tools/testing/selftests/timens/Makefile @@ -1,4 +1,4 @@ -TEST_GEN_PROGS := timens +TEST_GEN_PROGS := timens timerfd CFLAGS := -Wall -Werror LDFLAGS := -lrt diff --git a/tools/testing/selftests/timens/timerfd.c b/tools/testing/selftests/timens/timerfd.c new file mode 100644 index 000000000000..eff1ec5ff215 --- /dev/null +++ b/tools/testing/selftests/timens/timerfd.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "log.h" +#include "timens.h" + +static int tclock_gettime(clock_t clockid, struct timespec *now) +{ + if (clockid == CLOCK_BOOTTIME_ALARM) + clockid = CLOCK_BOOTTIME; + return clock_gettime(clockid, now); +} + +int run_test(int clockid, struct timespec now) +{ + struct itimerspec new_value; + long long elapsed; + int fd, i; + + if 
(tclock_gettime(clockid, &now)) + return pr_perror("clock_gettime(%d)", clockid); + + for (i = 0; i < 2; i++) { + int flags = 0; + + new_value.it_value.tv_sec = 3600; + new_value.it_value.tv_nsec = 0; + new_value.it_interval.tv_sec = 1; + new_value.it_interval.tv_nsec = 0; + + if (i == 1) { + new_value.it_value.tv_sec += now.tv_sec; + new_value.it_value.tv_nsec += now.tv_nsec; + } + + fd = timerfd_create(clockid, 0); + if (fd == -1) + return pr_perror("timerfd_create(%d)", clockid); + + if (i == 1) + flags |= TFD_TIMER_ABSTIME; + + if (timerfd_settime(fd, flags, &new_value, NULL)) + return pr_perror("timerfd_settime(%d)", clockid); + + if (timerfd_gettime(fd, &new_value)) + return pr_perror("timerfd_gettime(%d)", clockid); + + elapsed = new_value.it_value.tv_sec; + if (abs(elapsed - 3600) > 60) { + ksft_test_result_fail("clockid: %d elapsed: %lld\n", + clockid, elapsed); + return 1; + } + + close(fd); + } + + ksft_test_result_pass("clockid=%d\n", clockid); + + return 0; +} + +int main(int argc, char *argv[]) +{ + int ret, status, len, fd; + char buf[4096]; + pid_t pid; + struct timespec btime_now, mtime_now; + + nscheck(); + + ksft_set_plan(3); + + clock_gettime(CLOCK_MONOTONIC, &mtime_now); + clock_gettime(CLOCK_BOOTTIME, &btime_now); + + if (unshare_timens()) + return 1; + + len = snprintf(buf, sizeof(buf), "%d %d 0\n%d %d 0", + CLOCK_MONOTONIC, 70 * 24 * 3600, + CLOCK_BOOTTIME, 9 * 24 * 3600); + fd = open("/proc/self/timens_offsets", O_WRONLY); + if (fd < 0) + return pr_perror("/proc/self/timens_offsets"); + + if (write(fd, buf, len) != len) + return pr_perror("/proc/self/timens_offsets"); + + close(fd); + mtime_now.tv_sec += 70 * 24 * 3600; + btime_now.tv_sec += 9 * 24 * 3600; + + pid = fork(); + if (pid < 0) + return pr_perror("Unable to fork"); + if (pid == 0) { + ret = 0; + ret |= run_test(CLOCK_BOOTTIME, btime_now); + ret |= run_test(CLOCK_MONOTONIC, mtime_now); + ret |= run_test(CLOCK_BOOTTIME_ALARM, btime_now); + + if (ret) + ksft_exit_fail(); + 
ksft_exit_pass(); + return ret; + } + + if (waitpid(pid, &status, 0) != pid) + return pr_perror("Unable to wait the child process"); + + if (WIFEXITED(status)) + return WEXITSTATUS(status); + + return 1; +} -- cgit From 46e003433f8946283c3bfec1be854ca87b5ba402 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:27:19 +0000 Subject: selftests/timens: Add a test for clock_nanosleep() Check that clock_nanosleep() takes into account clock offsets. Output on success: 1..4 ok 1 clockid: 1 abs:0 ok 2 clockid: 1 abs:1 ok 3 clockid: 9 abs:0 ok 4 clockid: 9 abs:1 Output with lack of permissions: 1..4 not ok 1 # SKIP need to run as root Output without support of time namespaces: 1..4 not ok 1 # SKIP Time namespaces are not supported Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-31-dima@arista.com --- tools/testing/selftests/timens/.gitignore | 1 + tools/testing/selftests/timens/Makefile | 4 +- tools/testing/selftests/timens/clock_nanosleep.c | 149 +++++++++++++++++++++++ 3 files changed, 152 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/timens/clock_nanosleep.c diff --git a/tools/testing/selftests/timens/.gitignore b/tools/testing/selftests/timens/.gitignore index b609f6ee9fb9..9b6c8ddac2c8 100644 --- a/tools/testing/selftests/timens/.gitignore +++ b/tools/testing/selftests/timens/.gitignore @@ -1,2 +1,3 @@ +clock_nanosleep timens timerfd diff --git a/tools/testing/selftests/timens/Makefile b/tools/testing/selftests/timens/Makefile index 293aed6be95e..40f630d46ca8 100644 --- a/tools/testing/selftests/timens/Makefile +++ b/tools/testing/selftests/timens/Makefile @@ -1,6 +1,6 @@ -TEST_GEN_PROGS := timens timerfd +TEST_GEN_PROGS := timens timerfd clock_nanosleep -CFLAGS := -Wall -Werror +CFLAGS := -Wall -Werror -pthread LDFLAGS := -lrt include ../lib.mk diff --git 
a/tools/testing/selftests/timens/clock_nanosleep.c b/tools/testing/selftests/timens/clock_nanosleep.c new file mode 100644 index 000000000000..8e7b7c72ef65 --- /dev/null +++ b/tools/testing/selftests/timens/clock_nanosleep.c @@ -0,0 +1,149 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "log.h" +#include "timens.h" + +void test_sig(int sig) +{ + if (sig == SIGUSR2) + pthread_exit(NULL); +} + +struct thread_args { + struct timespec *now, *rem; + pthread_mutex_t *lock; + int clockid; + int abs; +}; + +void *call_nanosleep(void *_args) +{ + struct thread_args *args = _args; + + clock_nanosleep(args->clockid, args->abs ? TIMER_ABSTIME : 0, args->now, args->rem); + pthread_mutex_unlock(args->lock); + return NULL; +} + +int run_test(int clockid, int abs) +{ + struct timespec now = {}, rem; + struct thread_args args = { .now = &now, .rem = &rem, .clockid = clockid}; + struct timespec start; + pthread_mutex_t lock; + pthread_t thread; + int j, ok, ret; + + signal(SIGUSR1, test_sig); + signal(SIGUSR2, test_sig); + + pthread_mutex_init(&lock, NULL); + pthread_mutex_lock(&lock); + + if (clock_gettime(clockid, &start) == -1) { + if (errno == EINVAL && check_skip(clockid)) + return 0; + return pr_perror("clock_gettime"); + } + + + if (abs) { + now.tv_sec = start.tv_sec; + now.tv_nsec = start.tv_nsec; + } + + now.tv_sec += 3600; + args.abs = abs; + args.lock = &lock; + ret = pthread_create(&thread, NULL, call_nanosleep, &args); + if (ret != 0) { + pr_err("Unable to create a thread: %s", strerror(ret)); + return 1; + } + + /* Wait when the thread will call clock_nanosleep(). */ + ok = 0; + for (j = 0; j < 8; j++) { + /* The maximum timeout is about 5 seconds. */ + usleep(10000 << j); + + /* Try to interrupt clock_nanosleep(). 
*/ + pthread_kill(thread, SIGUSR1); + + usleep(10000 << j); + /* Check whether clock_nanosleep() has been interrupted or not. */ + if (pthread_mutex_trylock(&lock) == 0) { + /**/ + ok = 1; + break; + } + } + if (!ok) + pthread_kill(thread, SIGUSR2); + pthread_join(thread, NULL); + pthread_mutex_destroy(&lock); + + if (!ok) { + ksft_test_result_pass("clockid: %d abs:%d timeout\n", clockid, abs); + return 1; + } + + if (rem.tv_sec < 3300 || rem.tv_sec > 3900) { + pr_fail("clockid: %d abs: %d remain: %ld\n", + clockid, abs, rem.tv_sec); + return 1; + } + ksft_test_result_pass("clockid: %d abs:%d\n", clockid, abs); + + return 0; +} + +int main(int argc, char *argv[]) +{ + int ret, nsfd; + + nscheck(); + + ksft_set_plan(4); + + check_config_posix_timers(); + + if (unshare_timens()) + return 1; + + if (_settime(CLOCK_MONOTONIC, 7 * 24 * 3600)) + return 1; + if (_settime(CLOCK_BOOTTIME, 9 * 24 * 3600)) + return 1; + + nsfd = open("/proc/self/ns/time_for_children", O_RDONLY); + if (nsfd < 0) + return pr_perror("Unable to open timens_for_children"); + + if (setns(nsfd, CLONE_NEWTIME)) + return pr_perror("Unable to set timens"); + + ret = 0; + ret |= run_test(CLOCK_MONOTONIC, 0); + ret |= run_test(CLOCK_MONOTONIC, 1); + ret |= run_test(CLOCK_BOOTTIME_ALARM, 0); + ret |= run_test(CLOCK_BOOTTIME_ALARM, 1); + + if (ret) + ksft_exit_fail(); + ksft_exit_pass(); + return ret; +} -- cgit From 9d1f5a8c9dadad29f72e40a409239d7b71cf3037 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 12 Nov 2019 01:27:20 +0000 Subject: selftests/timens: Add procfs selftest Check that /proc/uptime is correct inside a new time namespace. 
Output on success: 1..1 ok 1 Passed for /proc/uptime # Pass 1 Fail 0 Xfail 0 Xpass 0 Skip 0 Error 0 Output with lack of permissions: 1..1 not ok 1 # SKIP need to run as root Output without support of time namespaces: 1..1 not ok 1 # SKIP Time namespaces are not supported Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-32-dima@arista.com --- tools/testing/selftests/timens/.gitignore | 1 + tools/testing/selftests/timens/Makefile | 2 +- tools/testing/selftests/timens/procfs.c | 144 ++++++++++++++++++++++++++++++ 3 files changed, 146 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/timens/procfs.c diff --git a/tools/testing/selftests/timens/.gitignore b/tools/testing/selftests/timens/.gitignore index 9b6c8ddac2c8..94ffdd9cead7 100644 --- a/tools/testing/selftests/timens/.gitignore +++ b/tools/testing/selftests/timens/.gitignore @@ -1,3 +1,4 @@ clock_nanosleep +procfs timens timerfd diff --git a/tools/testing/selftests/timens/Makefile b/tools/testing/selftests/timens/Makefile index 40f630d46ca8..8a33df7111c4 100644 --- a/tools/testing/selftests/timens/Makefile +++ b/tools/testing/selftests/timens/Makefile @@ -1,4 +1,4 @@ -TEST_GEN_PROGS := timens timerfd clock_nanosleep +TEST_GEN_PROGS := timens timerfd clock_nanosleep procfs CFLAGS := -Wall -Werror -pthread LDFLAGS := -lrt diff --git a/tools/testing/selftests/timens/procfs.c b/tools/testing/selftests/timens/procfs.c new file mode 100644 index 000000000000..43d93f4006b9 --- /dev/null +++ b/tools/testing/selftests/timens/procfs.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "log.h" +#include "timens.h" + +/* + * Test shouldn't be run for a day, so add 10 days to child + * time and check parent's time to be 
in the same day. + */ +#define MAX_TEST_TIME_SEC (60*5) +#define DAY_IN_SEC (60*60*24) +#define TEN_DAYS_IN_SEC (10*DAY_IN_SEC) + +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) + +static int child_ns, parent_ns; + +static int switch_ns(int fd) +{ + if (setns(fd, CLONE_NEWTIME)) + return pr_perror("setns()"); + + return 0; +} + +static int init_namespaces(void) +{ + char path[] = "/proc/self/ns/time_for_children"; + struct stat st1, st2; + + parent_ns = open(path, O_RDONLY); + if (parent_ns <= 0) + return pr_perror("Unable to open %s", path); + + if (fstat(parent_ns, &st1)) + return pr_perror("Unable to stat the parent timens"); + + if (unshare_timens()) + return -1; + + child_ns = open(path, O_RDONLY); + if (child_ns <= 0) + return pr_perror("Unable to open %s", path); + + if (fstat(child_ns, &st2)) + return pr_perror("Unable to stat the timens"); + + if (st1.st_ino == st2.st_ino) + return pr_err("The same child_ns after CLONE_NEWTIME"); + + if (_settime(CLOCK_BOOTTIME, TEN_DAYS_IN_SEC)) + return -1; + + return 0; +} + +static int read_proc_uptime(struct timespec *uptime) +{ + unsigned long up_sec, up_nsec; + FILE *proc; + + proc = fopen("/proc/uptime", "r"); + if (proc == NULL) { + pr_perror("Unable to open /proc/uptime"); + return -1; + } + + if (fscanf(proc, "%lu.%02lu", &up_sec, &up_nsec) != 2) { + if (errno) { + pr_perror("fscanf"); + return -errno; + } + pr_err("failed to parse /proc/uptime"); + return -1; + } + fclose(proc); + + uptime->tv_sec = up_sec; + uptime->tv_nsec = up_nsec; + return 0; +} + +static int check_uptime(void) +{ + struct timespec uptime_new, uptime_old; + time_t uptime_expected; + double prec = MAX_TEST_TIME_SEC; + + if (switch_ns(parent_ns)) + return pr_err("switch_ns(%d)", parent_ns); + + if (read_proc_uptime(&uptime_old)) + return 1; + + if (switch_ns(child_ns)) + return pr_err("switch_ns(%d)", child_ns); + + if (read_proc_uptime(&uptime_new)) + return 1; + + uptime_expected = uptime_old.tv_sec + TEN_DAYS_IN_SEC; + if 
(fabs(difftime(uptime_new.tv_sec, uptime_expected)) > prec) { + pr_fail("uptime in /proc/uptime: old %ld, new %ld [%ld]", + uptime_old.tv_sec, uptime_new.tv_sec, + uptime_old.tv_sec + TEN_DAYS_IN_SEC); + return 1; + } + + ksft_test_result_pass("Passed for /proc/uptime\n"); + return 0; +} + +int main(int argc, char *argv[]) +{ + int ret = 0; + + nscheck(); + + ksft_set_plan(1); + + if (init_namespaces()) + return 1; + + ret |= check_uptime(); + + if (ret) + ksft_exit_fail(); + ksft_exit_pass(); + return ret; +} -- cgit From d5b0117ddd4949e9ed882b6ef91316719826e8a8 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:27:21 +0000 Subject: selftests/timens: Add timer offsets test Check that timer_create() takes into account clock offsets. Output on success: 1..3 ok 1 clockid=7 ok 2 clockid=1 ok 3 clockid=9 # Pass 3 Fail 0 Xfail 0 Xpass 0 Skip 0 Error 0 Output with lack of permissions: 1..3 not ok 1 # SKIP need to run as root Output without support of time namespaces: 1..3 not ok 1 # SKIP Time namespaces are not supported Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-33-dima@arista.com --- tools/testing/selftests/timens/.gitignore | 1 + tools/testing/selftests/timens/Makefile | 2 +- tools/testing/selftests/timens/timer.c | 122 ++++++++++++++++++++++++++++++ 3 files changed, 124 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/timens/timer.c diff --git a/tools/testing/selftests/timens/.gitignore b/tools/testing/selftests/timens/.gitignore index 94ffdd9cead7..3b7eda8f35ce 100644 --- a/tools/testing/selftests/timens/.gitignore +++ b/tools/testing/selftests/timens/.gitignore @@ -1,4 +1,5 @@ clock_nanosleep procfs timens +timer timerfd diff --git a/tools/testing/selftests/timens/Makefile b/tools/testing/selftests/timens/Makefile index 8a33df7111c4..08164548a49d 100644 --- 
a/tools/testing/selftests/timens/Makefile +++ b/tools/testing/selftests/timens/Makefile @@ -1,4 +1,4 @@ -TEST_GEN_PROGS := timens timerfd clock_nanosleep procfs +TEST_GEN_PROGS := timens timerfd timer clock_nanosleep procfs CFLAGS := -Wall -Werror -pthread LDFLAGS := -lrt diff --git a/tools/testing/selftests/timens/timer.c b/tools/testing/selftests/timens/timer.c new file mode 100644 index 000000000000..0cca7aafc4bd --- /dev/null +++ b/tools/testing/selftests/timens/timer.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "log.h" +#include "timens.h" + +int run_test(int clockid, struct timespec now) +{ + struct itimerspec new_value; + long long elapsed; + timer_t fd; + int i; + + for (i = 0; i < 2; i++) { + struct sigevent sevp = {.sigev_notify = SIGEV_NONE}; + int flags = 0; + + new_value.it_value.tv_sec = 3600; + new_value.it_value.tv_nsec = 0; + new_value.it_interval.tv_sec = 1; + new_value.it_interval.tv_nsec = 0; + + if (i == 1) { + new_value.it_value.tv_sec += now.tv_sec; + new_value.it_value.tv_nsec += now.tv_nsec; + } + + if (timer_create(clockid, &sevp, &fd) == -1) { + if (errno == ENOSYS) { + ksft_test_result_skip("Posix Clocks & timers are supported\n"); + return 0; + } + return pr_perror("timerfd_create"); + } + + if (i == 1) + flags |= TIMER_ABSTIME; + if (timer_settime(fd, flags, &new_value, NULL) == -1) + return pr_perror("timerfd_settime"); + + if (timer_gettime(fd, &new_value) == -1) + return pr_perror("timerfd_gettime"); + + elapsed = new_value.it_value.tv_sec; + if (abs(elapsed - 3600) > 60) { + ksft_test_result_fail("clockid: %d elapsed: %lld\n", + clockid, elapsed); + return 1; + } + } + + ksft_test_result_pass("clockid=%d\n", clockid); + + return 0; +} + +int main(int argc, char *argv[]) +{ + int ret, status, len, fd; + char buf[4096]; + pid_t pid; + struct timespec btime_now, mtime_now; + + 
nscheck(); + + ksft_set_plan(3); + + clock_gettime(CLOCK_MONOTONIC, &mtime_now); + clock_gettime(CLOCK_BOOTTIME, &btime_now); + + if (unshare_timens()) + return 1; + + len = snprintf(buf, sizeof(buf), "%d %d 0\n%d %d 0", + CLOCK_MONOTONIC, 70 * 24 * 3600, + CLOCK_BOOTTIME, 9 * 24 * 3600); + fd = open("/proc/self/timens_offsets", O_WRONLY); + if (fd < 0) + return pr_perror("/proc/self/timens_offsets"); + + if (write(fd, buf, len) != len) + return pr_perror("/proc/self/timens_offsets"); + + close(fd); + mtime_now.tv_sec += 70 * 24 * 3600; + btime_now.tv_sec += 9 * 24 * 3600; + + pid = fork(); + if (pid < 0) + return pr_perror("Unable to fork"); + if (pid == 0) { + ret = 0; + ret |= run_test(CLOCK_BOOTTIME, btime_now); + ret |= run_test(CLOCK_MONOTONIC, mtime_now); + ret |= run_test(CLOCK_BOOTTIME_ALARM, btime_now); + + if (ret) + ksft_exit_fail(); + ksft_exit_pass(); + return ret; + } + + if (waitpid(pid, &status, 0) != pid) + return pr_perror("Unable to wait the child process"); + + if (WIFEXITED(status)) + return WEXITSTATUS(status); + + return 1; +} -- cgit From 1854b97e4fa6a476d5cdc3dc30c42e1528699f87 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:27:22 +0000 Subject: selftests/timens: Add a simple perf test for clock_gettime() Output on success: 1..4 ok 1 host: clock: monotonic cycles: 148323947 ok 2 host: clock: boottime cycles: 148577503 ok 3 ns: clock: monotonic cycles: 137659217 ok 4 ns: clock: boottime cycles: 137959154 # Pass 4 Fail 0 Xfail 0 Xpass 0 Skip 0 Error 0 Output with lack of permissions: 1..4 ok 1 host: clock: monotonic cycles: 145671139 ok 2 host: clock: boottime cycles: 146958357 not ok 3 # SKIP need to run as root Output without support of time namespaces: 1..4 ok 1 host: clock: monotonic cycles: 145671139 ok 2 host: clock: boottime cycles: 146958357 not ok 3 # SKIP Time namespaces are not supported Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner 
Link: https://lore.kernel.org/r/20191112012724.250792-34-dima@arista.com --- tools/testing/selftests/timens/.gitignore | 2 + tools/testing/selftests/timens/Makefile | 3 +- tools/testing/selftests/timens/gettime_perf.c | 95 +++++++++++++++++++++++++++ 3 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/timens/gettime_perf.c diff --git a/tools/testing/selftests/timens/.gitignore b/tools/testing/selftests/timens/.gitignore index 3b7eda8f35ce..16292e4d08a5 100644 --- a/tools/testing/selftests/timens/.gitignore +++ b/tools/testing/selftests/timens/.gitignore @@ -1,4 +1,6 @@ clock_nanosleep +gettime_perf +gettime_perf_cold procfs timens timer diff --git a/tools/testing/selftests/timens/Makefile b/tools/testing/selftests/timens/Makefile index 08164548a49d..6aefcaccb8f4 100644 --- a/tools/testing/selftests/timens/Makefile +++ b/tools/testing/selftests/timens/Makefile @@ -1,6 +1,7 @@ TEST_GEN_PROGS := timens timerfd timer clock_nanosleep procfs +TEST_GEN_PROGS_EXTENDED := gettime_perf CFLAGS := -Wall -Werror -pthread -LDFLAGS := -lrt +LDFLAGS := -lrt -ldl include ../lib.mk diff --git a/tools/testing/selftests/timens/gettime_perf.c b/tools/testing/selftests/timens/gettime_perf.c new file mode 100644 index 000000000000..7bf841a3967b --- /dev/null +++ b/tools/testing/selftests/timens/gettime_perf.c @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "log.h" +#include "timens.h" + +typedef int (*vgettime_t)(clockid_t, struct timespec *); + +vgettime_t vdso_clock_gettime; + +static void fill_function_pointers(void) +{ + void *vdso = dlopen("linux-vdso.so.1", + RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); + if (!vdso) + vdso = dlopen("linux-gate.so.1", + RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); + if (!vdso) { + pr_err("[WARN]\tfailed to find vDSO\n"); + return; + } + + vdso_clock_gettime = (vgettime_t)dlsym(vdso, 
"__vdso_clock_gettime"); + if (!vdso_clock_gettime) + pr_err("Warning: failed to find clock_gettime in vDSO\n"); + +} + +static void test(clock_t clockid, char *clockstr, bool in_ns) +{ + struct timespec tp, start; + long i = 0; + const int timeout = 3; + + vdso_clock_gettime(clockid, &start); + tp = start; + for (tp = start; start.tv_sec + timeout > tp.tv_sec || + (start.tv_sec + timeout == tp.tv_sec && + start.tv_nsec > tp.tv_nsec); i++) { + vdso_clock_gettime(clockid, &tp); + } + + ksft_test_result_pass("%s:\tclock: %10s\tcycles:\t%10ld\n", + in_ns ? "ns" : "host", clockstr, i); +} + +int main(int argc, char *argv[]) +{ + time_t offset = 10; + int nsfd; + + ksft_set_plan(8); + + fill_function_pointers(); + + test(CLOCK_MONOTONIC, "monotonic", false); + test(CLOCK_MONOTONIC_COARSE, "monotonic-coarse", false); + test(CLOCK_MONOTONIC_RAW, "monotonic-raw", false); + test(CLOCK_BOOTTIME, "boottime", false); + + nscheck(); + + if (unshare_timens()) + return 1; + + nsfd = open("/proc/self/ns/time_for_children", O_RDONLY); + if (nsfd < 0) + return pr_perror("Can't open a time namespace"); + + if (_settime(CLOCK_MONOTONIC, offset)) + return 1; + if (_settime(CLOCK_BOOTTIME, offset)) + return 1; + + if (setns(nsfd, CLONE_NEWTIME)) + return pr_perror("setns"); + + test(CLOCK_MONOTONIC, "monotonic", true); + test(CLOCK_MONOTONIC_COARSE, "monotonic-coarse", true); + test(CLOCK_MONOTONIC_RAW, "monotonic-raw", true); + test(CLOCK_BOOTTIME, "boottime", true); + + ksft_exit_pass(); + return 0; +} -- cgit From a750c7474a5333a76e7278d353c460d26012deb6 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:27:23 +0000 Subject: selftests/timens: Check for right timens offsets after fork and exec Output on success: 1..1 ok 1 exec # Pass 1 Fail 0 Xfail 0 Xpass 0 Skip 0 Error 0 Output on failure: 1..1 not ok 1 36016 16 Bail out! 
Output with lack of permissions: 1..1 not ok 1 # SKIP need to run as root Output without support of time namespaces: 1..1 not ok 1 # SKIP Time namespaces are not supported Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-35-dima@arista.com --- tools/testing/selftests/timens/.gitignore | 1 + tools/testing/selftests/timens/Makefile | 2 +- tools/testing/selftests/timens/exec.c | 94 +++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/timens/exec.c diff --git a/tools/testing/selftests/timens/.gitignore b/tools/testing/selftests/timens/.gitignore index 16292e4d08a5..789f21e81028 100644 --- a/tools/testing/selftests/timens/.gitignore +++ b/tools/testing/selftests/timens/.gitignore @@ -1,4 +1,5 @@ clock_nanosleep +exec gettime_perf gettime_perf_cold procfs diff --git a/tools/testing/selftests/timens/Makefile b/tools/testing/selftests/timens/Makefile index 6aefcaccb8f4..e9fb30bd8aeb 100644 --- a/tools/testing/selftests/timens/Makefile +++ b/tools/testing/selftests/timens/Makefile @@ -1,4 +1,4 @@ -TEST_GEN_PROGS := timens timerfd timer clock_nanosleep procfs +TEST_GEN_PROGS := timens timerfd timer clock_nanosleep procfs exec TEST_GEN_PROGS_EXTENDED := gettime_perf CFLAGS := -Wall -Werror -pthread diff --git a/tools/testing/selftests/timens/exec.c b/tools/testing/selftests/timens/exec.c new file mode 100644 index 000000000000..87b47b557a7a --- /dev/null +++ b/tools/testing/selftests/timens/exec.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "log.h" +#include "timens.h" + +#define OFFSET (36000) + +int main(int argc, char *argv[]) +{ + struct timespec now, tst; + int status, i; + pid_t pid; + + if (argc > 1) { + 
if (sscanf(argv[1], "%ld", &now.tv_sec) != 1) + return pr_perror("sscanf"); + + for (i = 0; i < 2; i++) { + _gettime(CLOCK_MONOTONIC, &tst, i); + if (abs(tst.tv_sec - now.tv_sec) > 5) + return pr_fail("%ld %ld\n", now.tv_sec, tst.tv_sec); + } + return 0; + } + + nscheck(); + + ksft_set_plan(1); + + clock_gettime(CLOCK_MONOTONIC, &now); + + if (unshare_timens()) + return 1; + + if (_settime(CLOCK_MONOTONIC, OFFSET)) + return 1; + + for (i = 0; i < 2; i++) { + _gettime(CLOCK_MONOTONIC, &tst, i); + if (abs(tst.tv_sec - now.tv_sec) > 5) + return pr_fail("%ld %ld\n", + now.tv_sec, tst.tv_sec); + } + + if (argc > 1) + return 0; + + pid = fork(); + if (pid < 0) + return pr_perror("fork"); + + if (pid == 0) { + char now_str[64]; + char *cargv[] = {"exec", now_str, NULL}; + char *cenv[] = {NULL}; + + /* Check that a child process is in the new timens. */ + for (i = 0; i < 2; i++) { + _gettime(CLOCK_MONOTONIC, &tst, i); + if (abs(tst.tv_sec - now.tv_sec - OFFSET) > 5) + return pr_fail("%ld %ld\n", + now.tv_sec + OFFSET, tst.tv_sec); + } + + /* Check for proper vvar offsets after execve. */ + snprintf(now_str, sizeof(now_str), "%ld", now.tv_sec + OFFSET); + execve("/proc/self/exe", cargv, cenv); + return pr_perror("execve"); + } + + if (waitpid(pid, &status, 0) != pid) + return pr_perror("waitpid"); + + if (status) + ksft_exit_fail(); + + ksft_test_result_pass("exec\n"); + ksft_exit_pass(); + return 0; +} -- cgit From 6b6d188aae79a630957aefd88ff5c42af6553ee3 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 9 Jan 2020 07:59:07 -0800 Subject: alarmtimer: Unregister wakeup source when module get fails The alarmtimer_rtc_add_device() function creates a wakeup source and then tries to grab a module reference. If that fails the function returns early with an error code, but fails to remove the wakeup source. 
Cleanup this exit path so there is no dangling wakeup source, which is named 'alarmtime' left allocated which will conflict with another RTC device that may be registered later. Fixes: 51218298a25e ("alarmtimer: Ensure RTC module is not unloaded") Signed-off-by: Stephen Boyd Signed-off-by: Thomas Gleixner Reviewed-by: Douglas Anderson Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200109155910.907-2-swboyd@chromium.org --- kernel/time/alarmtimer.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index b51b36e533c4..9dc7a0913190 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -91,6 +91,7 @@ static int alarmtimer_rtc_add_device(struct device *dev, unsigned long flags; struct rtc_device *rtc = to_rtc_device(dev); struct wakeup_source *__ws; + int ret = 0; if (rtcdev) return -EBUSY; @@ -105,8 +106,8 @@ static int alarmtimer_rtc_add_device(struct device *dev, spin_lock_irqsave(&rtcdev_lock, flags); if (!rtcdev) { if (!try_module_get(rtc->owner)) { - spin_unlock_irqrestore(&rtcdev_lock, flags); - return -1; + ret = -1; + goto unlock; } rtcdev = rtc; @@ -115,11 +116,12 @@ static int alarmtimer_rtc_add_device(struct device *dev, ws = __ws; __ws = NULL; } +unlock: spin_unlock_irqrestore(&rtcdev_lock, flags); wakeup_source_unregister(__ws); - return 0; + return ret; } static inline void alarmtimer_rtc_timer_init(void) -- cgit From 5167c506d62dd9ffab73eba23c79b0a8845c9fe1 Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Fri, 10 Jan 2020 16:39:02 +0800 Subject: tick/common: Touch watchdog in tick_unfreeze() on all CPUs Suspend to IDLE invokes tick_unfreeze() on resume. tick_unfreeze() on the first resuming CPU resumes timekeeping, which also has the side effect of resetting the softlockup watchdog on this CPU. 
But on the secondary CPUs the watchdog is not reset in the resume / unfreeze() path, which can result in false softlockup warnings on those CPUs depending on the time spent in suspend. Prevent this by clearing the softlockup watchdog in the unfreeze path also on the secondary resuming CPUs. [ tglx: Massaged changelog ] Signed-off-by: Chunyan Zhang Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200110083902.27276-1-chunyan.zhang@unisoc.com --- kernel/time/tick-common.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 59225b484e4e..7e5d3524e924 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -558,6 +559,7 @@ void tick_unfreeze(void) trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), false); } else { + touch_softlockup_watchdog(); tick_resume_local(); } -- cgit From 1349401ff1aa425e7381ed26feb63e0d6b557fc6 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Tue, 19 Nov 2019 23:12:26 -0800 Subject: clocksource/drivers/hyper-v: Suspend/resume Hyper-V clocksource for hibernation This is needed for hibernation, e.g. when we resume the old kernel, we need to disable the "current" kernel's TSC page and then resume the old kernel's.
Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/1574233946-48377-1-git-send-email-decui@microsoft.com --- drivers/clocksource/hyperv_timer.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c index 287d8d58c21a..1aec08e82b7a 100644 --- a/drivers/clocksource/hyperv_timer.c +++ b/drivers/clocksource/hyperv_timer.c @@ -330,12 +330,37 @@ static u64 read_hv_sched_clock_tsc(void) return read_hv_clock_tsc(NULL) - hv_sched_clock_offset; } +static void suspend_hv_clock_tsc(struct clocksource *arg) +{ + u64 tsc_msr; + + /* Disable the TSC page */ + hv_get_reference_tsc(tsc_msr); + tsc_msr &= ~BIT_ULL(0); + hv_set_reference_tsc(tsc_msr); +} + + +static void resume_hv_clock_tsc(struct clocksource *arg) +{ + phys_addr_t phys_addr = virt_to_phys(&tsc_pg); + u64 tsc_msr; + + /* Re-enable the TSC page */ + hv_get_reference_tsc(tsc_msr); + tsc_msr &= GENMASK_ULL(11, 0); + tsc_msr |= BIT_ULL(0) | (u64)phys_addr; + hv_set_reference_tsc(tsc_msr); +} + static struct clocksource hyperv_cs_tsc = { .name = "hyperv_clocksource_tsc_page", .rating = 400, .read = read_hv_clock_tsc, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .suspend= suspend_hv_clock_tsc, + .resume = resume_hv_clock_tsc, }; static u64 notrace read_hv_clock_msr(struct clocksource *arg) -- cgit From 9ca9fe69eedb483c0811a4db7cb94942edfb1404 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 20 Nov 2019 21:42:36 +0800 Subject: clocksource: Fix Kconfig indentation Adjust indentation from spaces to tab (+optional two spaces) as in coding style with command like: $ sed -e 's/^ /\t/' -i */Kconfig Signed-off-by: Krzysztof Kozlowski Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20191120134236.15959-1-krzk@kernel.org --- drivers/clocksource/Kconfig | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 
deletions(-) diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 5fdd76cb1768..c981ff64bc13 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -470,7 +470,7 @@ config OXNAS_RPS_TIMER This enables support for the Oxford Semiconductor OXNAS RPS timers. config SYS_SUPPORTS_SH_CMT - bool + bool config MTK_TIMER bool "Mediatek timer driver" if COMPILE_TEST @@ -490,13 +490,13 @@ config SPRD_TIMER Enables support for the Spreadtrum timer driver. config SYS_SUPPORTS_SH_MTU2 - bool + bool config SYS_SUPPORTS_SH_TMU - bool + bool config SYS_SUPPORTS_EM_STI - bool + bool config CLKSRC_JCORE_PIT bool "J-Core PIT timer driver" if COMPILE_TEST @@ -591,21 +591,21 @@ config CLKSRC_PXA platforms. config H8300_TMR8 - bool "Clockevent timer for the H8300 platform" if COMPILE_TEST - depends on HAS_IOMEM + bool "Clockevent timer for the H8300 platform" if COMPILE_TEST + depends on HAS_IOMEM help This enables the 8 bits timer for the H8300 platform. config H8300_TMR16 - bool "Clockevent timer for the H83069 platform" if COMPILE_TEST - depends on HAS_IOMEM + bool "Clockevent timer for the H83069 platform" if COMPILE_TEST + depends on HAS_IOMEM help This enables the 16 bits timer for the H8300 platform with the H83069 cpu. config H8300_TPU - bool "Clocksource for the H8300 platform" if COMPILE_TEST - depends on HAS_IOMEM + bool "Clocksource for the H8300 platform" if COMPILE_TEST + depends on HAS_IOMEM help This enables the clocksource for the H8300 platform with the H8S2678 cpu. -- cgit From db95b8e3642a61b1576e600c3d313aca0c329db3 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Thu, 3 Oct 2019 13:03:49 +0100 Subject: dt-bindings: timer: renesas, cmt: Document r8a774b1 CMT support Document SoC specific bindings for RZ/G2N (r8a774b1) SoC. 
Signed-off-by: Biju Das Reviewed-by: Geert Uytterhoeven Acked-by: Rob Herring Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/1570104229-59144-1-git-send-email-biju.das@bp.renesas.com --- Documentation/devicetree/bindings/timer/renesas,cmt.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/timer/renesas,cmt.txt b/Documentation/devicetree/bindings/timer/renesas,cmt.txt index a444cfc5852a..a747fabab7d3 100644 --- a/Documentation/devicetree/bindings/timer/renesas,cmt.txt +++ b/Documentation/devicetree/bindings/timer/renesas,cmt.txt @@ -29,6 +29,8 @@ Required Properties: - "renesas,r8a77470-cmt1" for the 48-bit CMT1 device included in r8a77470. - "renesas,r8a774a1-cmt0" for the 32-bit CMT0 device included in r8a774a1. - "renesas,r8a774a1-cmt1" for the 48-bit CMT devices included in r8a774a1. + - "renesas,r8a774b1-cmt0" for the 32-bit CMT0 device included in r8a774b1. + - "renesas,r8a774b1-cmt1" for the 48-bit CMT devices included in r8a774b1. - "renesas,r8a774c0-cmt0" for the 32-bit CMT0 device included in r8a774c0. - "renesas,r8a774c0-cmt1" for the 48-bit CMT devices included in r8a774c0. - "renesas,r8a7790-cmt0" for the 32-bit CMT0 device included in r8a7790. -- cgit From 062934634dc3ae38baeb9961dcc80c44a00ffcf2 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 27 Nov 2019 21:10:22 -0800 Subject: clocksource: Fix Kconfig miscues Fix lots of typo, spelling, punctuation, and grammar miscues in drivers/clocksource/Kconfig. 
Signed-off-by: Randy Dunlap Cc: Daniel Lezcano Cc: Thomas Gleixner Cc: John Stultz Cc: Stephen Boyd Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/4deb42a9-82f2-72f9-891f-972a9a399f4f@infradead.org --- drivers/clocksource/Kconfig | 46 ++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index c981ff64bc13..94192fb0533e 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -88,7 +88,7 @@ config ROCKCHIP_TIMER select TIMER_OF select CLKSRC_MMIO help - Enables the support for the rockchip timer driver. + Enables the support for the Rockchip timer driver. config ARMADA_370_XP_TIMER bool "Armada 370 and XP timer driver" if COMPILE_TEST @@ -162,13 +162,13 @@ config NPCM7XX_TIMER select CLKSRC_MMIO help Enable 24-bit TIMER0 and TIMER1 counters in the NPCM7xx architecture, - While TIMER0 serves as clockevent and TIMER1 serves as clocksource. + where TIMER0 serves as clockevent and TIMER1 serves as clocksource. config CADENCE_TTC_TIMER bool "Cadence TTC timer driver" if COMPILE_TEST depends on COMMON_CLK help - Enables support for the cadence ttc driver. + Enables support for the Cadence TTC driver. config ASM9260_TIMER bool "ASM9260 timer driver" if COMPILE_TEST @@ -190,10 +190,10 @@ config CLKSRC_DBX500_PRCMU bool "Clocksource PRCMU Timer" if COMPILE_TEST depends on HAS_IOMEM help - Use the always on PRCMU Timer as clocksource + Use the always on PRCMU Timer as clocksource. config CLPS711X_TIMER - bool "Cirrus logic timer driver" if COMPILE_TEST + bool "Cirrus Logic timer driver" if COMPILE_TEST select CLKSRC_MMIO help Enables support for the Cirrus Logic PS711 timer. @@ -205,11 +205,11 @@ config ATLAS7_TIMER Enables support for the Atlas7 timer. 
config MXS_TIMER - bool "Mxs timer driver" if COMPILE_TEST + bool "MXS timer driver" if COMPILE_TEST select CLKSRC_MMIO select STMP_DEVICE help - Enables support for the Mxs timer. + Enables support for the MXS timer. config PRIMA2_TIMER bool "Prima2 timer driver" if COMPILE_TEST @@ -238,10 +238,10 @@ config KEYSTONE_TIMER Enables support for the Keystone timer. config INTEGRATOR_AP_TIMER - bool "Integrator-ap timer driver" if COMPILE_TEST + bool "Integrator-AP timer driver" if COMPILE_TEST select CLKSRC_MMIO help - Enables support for the Integrator-ap timer. + Enables support for the Integrator-AP timer. config CLKSRC_EFM32 bool "Clocksource for Energy Micro's EFM32 SoCs" if !ARCH_EFM32 @@ -283,8 +283,8 @@ config CLKSRC_NPS select TIMER_OF if OF help NPS400 clocksource support. - Got 64 bit counter with update rate up to 1000MHz. - This counter is accessed via couple of 32 bit memory mapped registers. + It has a 64-bit counter with update rate up to 1000MHz. + This counter is accessed via couple of 32-bit memory-mapped registers. config CLKSRC_STM32 bool "Clocksource for STM32 SoCs" if !ARCH_STM32 @@ -305,14 +305,14 @@ config ARC_TIMERS help These are legacy 32-bit TIMER0 and TIMER1 counters found on all ARC cores (ARC700 as well as ARC HS38). - TIMER0 serves as clockevent while TIMER1 provides clocksource + TIMER0 serves as clockevent while TIMER1 provides clocksource. config ARC_TIMERS_64BIT bool "Support for 64-bit counters in ARC HS38 cores" if COMPILE_TEST depends on ARC_TIMERS select TIMER_OF help - This enables 2 different 64-bit timers: RTC (for UP) and GFRC (for SMP) + This enables 2 different 64-bit timers: RTC (for UP) and GFRC (for SMP). RTC is implemented inside the core, while GFRC sits outside the core in ARConnect IP block. Driver automatically picks one of them for clocksource as appropriate. 
@@ -390,7 +390,7 @@ config ARM_GLOBAL_TIMER select TIMER_OF if OF depends on ARM help - This options enables support for the ARM global timer unit + This option enables support for the ARM global timer unit. config ARM_TIMER_SP804 bool "Support for Dual Timer SP804 module" if COMPILE_TEST @@ -403,14 +403,14 @@ config CLKSRC_ARM_GLOBAL_TIMER_SCHED_CLOCK depends on ARM_GLOBAL_TIMER default y help - Use ARM global timer clock source as sched_clock + Use ARM global timer clock source as sched_clock. config ARMV7M_SYSTICK bool "Support for the ARMv7M system time" if COMPILE_TEST select TIMER_OF if OF select CLKSRC_MMIO help - This options enables support for the ARMv7M system timer unit + This option enables support for the ARMv7M system timer unit. config ATMEL_PIT bool "Atmel PIT support" if COMPILE_TEST @@ -460,7 +460,7 @@ config VF_PIT_TIMER bool select CLKSRC_MMIO help - Support for Period Interrupt Timer on Freescale Vybrid Family SoCs. + Support for Periodic Interrupt Timer on Freescale Vybrid Family SoCs. config OXNAS_RPS_TIMER bool "Oxford Semiconductor OXNAS RPS Timers driver" if COMPILE_TEST @@ -523,7 +523,7 @@ config SH_TIMER_MTU2 help This enables build of a clockevent driver for the Multi-Function Timer Pulse Unit 2 (MTU2) hardware available on SoCs from Renesas. - This hardware comes with 16 bit-timer registers. + This hardware comes with 16-bit timer registers. config RENESAS_OSTM bool "Renesas OSTM timer driver" if COMPILE_TEST @@ -580,7 +580,7 @@ config CLKSRC_TANGO_XTAL select TIMER_OF select CLKSRC_MMIO help - This enables the clocksource for Tango SoC + This enables the clocksource for Tango SoC. config CLKSRC_PXA bool "Clocksource for PXA or SA-11x0 platform" if COMPILE_TEST @@ -601,14 +601,14 @@ config H8300_TMR16 depends on HAS_IOMEM help This enables the 16 bits timer for the H8300 platform with the - H83069 cpu. + H83069 CPU. 
config H8300_TPU bool "Clocksource for the H8300 platform" if COMPILE_TEST depends on HAS_IOMEM help This enables the clocksource for the H8300 platform with the - H8S2678 cpu. + H8S2678 CPU. config CLKSRC_IMX_GPT bool "Clocksource using i.MX GPT" if COMPILE_TEST @@ -666,8 +666,8 @@ config CSKY_MP_TIMER help Say yes here to enable C-SKY SMP timer driver used for C-SKY SMP system. - csky,mptimer is not only used in SMP system, it also could be used - single core system. It's not a mmio reg and it use mtcr/mfcr instruction. + csky,mptimer is not only used in SMP system, it also could be used in + single core system. It's not a mmio reg and it uses mtcr/mfcr instruction. config GX6605S_TIMER bool "Gx6605s SOC system timer driver" if COMPILE_TEST -- cgit From ddc61bbc45017726a2b450350d476b4dc5ae25ce Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Tue, 26 Nov 2019 10:17:20 +0800 Subject: clocksource/drivers/hyper-v: Reserve PAGE_SIZE space for tsc page Currently, the reserved size for a tsc page is 4K, which is enough for communicating with hypervisor. However, in the case where we want to export the tsc page to userspace (e.g. for vDSO to read the clocksource), the tsc page should be at least PAGE_SIZE, otherwise, when PAGE_SIZE is larger than 4K, extra kernel data will be mapped into userspace, which means leaking kernel information. Therefore reserve PAGE_SIZE space for tsc_pg as a preparation for the vDSO support of ARM64 in the future. Also, while at it, replace all reference to tsc_pg with hv_get_tsc_page() since it should be the only interface to access tsc page. 
Signed-off-by: Boqun Feng (Microsoft) Cc: linux-hyperv@vger.kernel.org Reviewed-by: Michael Kelley Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20191126021723.4710-1-boqun.feng@gmail.com --- drivers/clocksource/hyperv_timer.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c index 1aec08e82b7a..12d75b50a317 100644 --- a/drivers/clocksource/hyperv_timer.c +++ b/drivers/clocksource/hyperv_timer.c @@ -307,17 +307,20 @@ EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup); struct clocksource *hyperv_cs; EXPORT_SYMBOL_GPL(hyperv_cs); -static struct ms_hyperv_tsc_page tsc_pg __aligned(PAGE_SIZE); +static union { + struct ms_hyperv_tsc_page page; + u8 reserved[PAGE_SIZE]; +} tsc_pg __aligned(PAGE_SIZE); struct ms_hyperv_tsc_page *hv_get_tsc_page(void) { - return &tsc_pg; + return &tsc_pg.page; } EXPORT_SYMBOL_GPL(hv_get_tsc_page); static u64 notrace read_hv_clock_tsc(struct clocksource *arg) { - u64 current_tick = hv_read_tsc_page(&tsc_pg); + u64 current_tick = hv_read_tsc_page(hv_get_tsc_page()); if (current_tick == U64_MAX) hv_get_time_ref_count(current_tick); @@ -397,7 +400,7 @@ static bool __init hv_init_tsc_clocksource(void) return false; hyperv_cs = &hyperv_cs_tsc; - phys_addr = virt_to_phys(&tsc_pg); + phys_addr = virt_to_phys(hv_get_tsc_page()); /* * The Hyper-V TLFS specifies to preserve the value of reserved -- cgit From 625022a5f160619ae180d54097ddd65bb3795913 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Fri, 13 Dec 2019 13:19:21 +0200 Subject: clocksource/drivers/timer-microchip-pit64b: Add Microchip PIT64B support Add driver for Microchip PIT64B timer. Timer could be used in continuous mode or oneshot mode. The hardware has 2x32 bit registers for period emulating a 64 bit timer. The LSB_PR and MSB_PR registers are used to set the period value (compare value). TLSB and TMSB keeps the current value of the counter. 
After a compare the TLSB and TMSB registers reset. The driver uses PIT64B timer for clocksource or clockevent. First requested timer would be registered as clockevent, second one would be registered as clocksource. Individual PIT64B hardware resources were used for clocksource and clockevent to be able to support high resolution timers with this hardware implementation. Signed-off-by: Claudiu Beznea Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/1576235962-30123-3-git-send-email-claudiu.beznea@microchip.com --- .../devicetree/bindings/arm/atmel-sysregs.txt | 6 + drivers/clocksource/Kconfig | 10 + drivers/clocksource/Makefile | 1 + drivers/clocksource/timer-microchip-pit64b.c | 449 +++++++++++++++++++++ 4 files changed, 466 insertions(+) create mode 100644 drivers/clocksource/timer-microchip-pit64b.c diff --git a/Documentation/devicetree/bindings/arm/atmel-sysregs.txt b/Documentation/devicetree/bindings/arm/atmel-sysregs.txt index 9fbde401a090..e003a553b986 100644 --- a/Documentation/devicetree/bindings/arm/atmel-sysregs.txt +++ b/Documentation/devicetree/bindings/arm/atmel-sysregs.txt @@ -10,6 +10,12 @@ PIT Timer required properties: - interrupts: Should contain interrupt for the PIT which is the IRQ line shared across all System Controller members. +PIT64B Timer required properties: +- compatible: Should be "microchip,sam9x60-pit64b" +- reg: Should contain registers location and length +- interrupts: Should contain interrupt for PIT64B timer +- clocks: Should contain the available clock sources for PIT64B timer.
+ System Timer (ST) required properties: - compatible: Should be "atmel,at91rm9200-st", "syscon", "simple-mfd" - reg: Should contain registers location and length diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 94192fb0533e..cc909e465823 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -697,4 +697,14 @@ config INGENIC_TIMER help Support for the timer/counter unit of the Ingenic JZ SoCs. +config MICROCHIP_PIT64B + bool "Microchip PIT64B support" + depends on OF || COMPILE_TEST + select CLKSRC_MMIO + help + This option enables Microchip PIT64B timer for Atmel + based system. It supports the oneshot, the periodic + modes and high resolution. It is used as a clocksource + and a clockevent. + endmenu diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index 4dfe4225ece7..713686faa549 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -88,3 +88,4 @@ obj-$(CONFIG_RISCV_TIMER) += timer-riscv.o obj-$(CONFIG_CSKY_MP_TIMER) += timer-mp-csky.o obj-$(CONFIG_GX6605S_TIMER) += timer-gx6605s.o obj-$(CONFIG_HYPERV_TIMER) += hyperv_timer.o +obj-$(CONFIG_MICROCHIP_PIT64B) += timer-microchip-pit64b.o diff --git a/drivers/clocksource/timer-microchip-pit64b.c b/drivers/clocksource/timer-microchip-pit64b.c new file mode 100644 index 000000000000..27a389a7e078 --- /dev/null +++ b/drivers/clocksource/timer-microchip-pit64b.c @@ -0,0 +1,449 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * 64-bit Periodic Interval Timer driver + * + * Copyright (C) 2019 Microchip Technology Inc. 
and its subsidiaries + * + * Author: Claudiu Beznea + */ + +#include +#include +#include +#include +#include +#include +#include + +#define MCHP_PIT64B_CR 0x00 /* Control Register */ +#define MCHP_PIT64B_CR_START BIT(0) +#define MCHP_PIT64B_CR_SWRST BIT(8) + +#define MCHP_PIT64B_MR 0x04 /* Mode Register */ +#define MCHP_PIT64B_MR_CONT BIT(0) +#define MCHP_PIT64B_MR_ONE_SHOT (0) +#define MCHP_PIT64B_MR_SGCLK BIT(3) +#define MCHP_PIT64B_MR_PRES GENMASK(11, 8) + +#define MCHP_PIT64B_LSB_PR 0x08 /* LSB Period Register */ + +#define MCHP_PIT64B_MSB_PR 0x0C /* MSB Period Register */ + +#define MCHP_PIT64B_IER 0x10 /* Interrupt Enable Register */ +#define MCHP_PIT64B_IER_PERIOD BIT(0) + +#define MCHP_PIT64B_ISR 0x1C /* Interrupt Status Register */ + +#define MCHP_PIT64B_TLSBR 0x20 /* Timer LSB Register */ + +#define MCHP_PIT64B_TMSBR 0x24 /* Timer MSB Register */ + +#define MCHP_PIT64B_PRES_MAX 0x10 +#define MCHP_PIT64B_LSBMASK GENMASK_ULL(31, 0) +#define MCHP_PIT64B_PRES_TO_MODE(p) (MCHP_PIT64B_MR_PRES & ((p) << 8)) +#define MCHP_PIT64B_MODE_TO_PRES(m) ((MCHP_PIT64B_MR_PRES & (m)) >> 8) +#define MCHP_PIT64B_DEF_CS_FREQ 5000000UL /* 5 MHz */ +#define MCHP_PIT64B_DEF_CE_FREQ 32768 /* 32 KHz */ + +#define MCHP_PIT64B_NAME "pit64b" + +/** + * struct mchp_pit64b_timer - PIT64B timer data structure + * @base: base address of PIT64B hardware block + * @pclk: PIT64B's peripheral clock + * @gclk: PIT64B's generic clock + * @mode: precomputed value for mode register + */ +struct mchp_pit64b_timer { + void __iomem *base; + struct clk *pclk; + struct clk *gclk; + u32 mode; +}; + +/** + * mchp_pit64b_clkevt - PIT64B clockevent data structure + * @timer: PIT64B timer + * @clkevt: clockevent + */ +struct mchp_pit64b_clkevt { + struct mchp_pit64b_timer timer; + struct clock_event_device clkevt; +}; + +#define to_mchp_pit64b_timer(x) \ + ((struct mchp_pit64b_timer *)container_of(x,\ + struct mchp_pit64b_clkevt, clkevt)) + +/* Base address for clocksource timer. 
*/ +static void __iomem *mchp_pit64b_cs_base; +/* Default cycles for clockevent timer. */ +static u64 mchp_pit64b_ce_cycles; + +static inline u64 mchp_pit64b_cnt_read(void __iomem *base) +{ + unsigned long flags; + u32 low, high; + + raw_local_irq_save(flags); + + /* + * When using a 64 bit period TLSB must be read first, followed by the + * read of TMSB. This sequence generates an atomic read of the 64 bit + * timer value whatever the lapse of time between the accesses. + */ + low = readl_relaxed(base + MCHP_PIT64B_TLSBR); + high = readl_relaxed(base + MCHP_PIT64B_TMSBR); + + raw_local_irq_restore(flags); + + return (((u64)high << 32) | low); +} + +static inline void mchp_pit64b_reset(struct mchp_pit64b_timer *timer, + u64 cycles, u32 mode, u32 irqs) +{ + u32 low, high; + + low = cycles & MCHP_PIT64B_LSBMASK; + high = cycles >> 32; + + writel_relaxed(MCHP_PIT64B_CR_SWRST, timer->base + MCHP_PIT64B_CR); + writel_relaxed(mode | timer->mode, timer->base + MCHP_PIT64B_MR); + writel_relaxed(high, timer->base + MCHP_PIT64B_MSB_PR); + writel_relaxed(low, timer->base + MCHP_PIT64B_LSB_PR); + writel_relaxed(irqs, timer->base + MCHP_PIT64B_IER); + writel_relaxed(MCHP_PIT64B_CR_START, timer->base + MCHP_PIT64B_CR); +} + +static u64 mchp_pit64b_clksrc_read(struct clocksource *cs) +{ + return mchp_pit64b_cnt_read(mchp_pit64b_cs_base); +} + +static u64 mchp_pit64b_sched_read_clk(void) +{ + return mchp_pit64b_cnt_read(mchp_pit64b_cs_base); +} + +static int mchp_pit64b_clkevt_shutdown(struct clock_event_device *cedev) +{ + struct mchp_pit64b_timer *timer = to_mchp_pit64b_timer(cedev); + + writel_relaxed(MCHP_PIT64B_CR_SWRST, timer->base + MCHP_PIT64B_CR); + + return 0; +} + +static int mchp_pit64b_clkevt_set_periodic(struct clock_event_device *cedev) +{ + struct mchp_pit64b_timer *timer = to_mchp_pit64b_timer(cedev); + + mchp_pit64b_reset(timer, mchp_pit64b_ce_cycles, MCHP_PIT64B_MR_CONT, + MCHP_PIT64B_IER_PERIOD); + + return 0; +} + +static int 
mchp_pit64b_clkevt_set_next_event(unsigned long evt, + struct clock_event_device *cedev) +{ + struct mchp_pit64b_timer *timer = to_mchp_pit64b_timer(cedev); + + mchp_pit64b_reset(timer, evt, MCHP_PIT64B_MR_ONE_SHOT, + MCHP_PIT64B_IER_PERIOD); + + return 0; +} + +static void mchp_pit64b_clkevt_suspend(struct clock_event_device *cedev) +{ + struct mchp_pit64b_timer *timer = to_mchp_pit64b_timer(cedev); + + writel_relaxed(MCHP_PIT64B_CR_SWRST, timer->base + MCHP_PIT64B_CR); + if (timer->mode & MCHP_PIT64B_MR_SGCLK) + clk_disable_unprepare(timer->gclk); + clk_disable_unprepare(timer->pclk); +} + +static void mchp_pit64b_clkevt_resume(struct clock_event_device *cedev) +{ + struct mchp_pit64b_timer *timer = to_mchp_pit64b_timer(cedev); + + clk_prepare_enable(timer->pclk); + if (timer->mode & MCHP_PIT64B_MR_SGCLK) + clk_prepare_enable(timer->gclk); +} + +static irqreturn_t mchp_pit64b_interrupt(int irq, void *dev_id) +{ + struct mchp_pit64b_clkevt *irq_data = dev_id; + + /* Need to clear the interrupt. */ + readl_relaxed(irq_data->timer.base + MCHP_PIT64B_ISR); + + irq_data->clkevt.event_handler(&irq_data->clkevt); + + return IRQ_HANDLED; +} + +static void __init mchp_pit64b_pres_compute(u32 *pres, u32 clk_rate, + u32 max_rate) +{ + u32 tmp; + + for (*pres = 0; *pres < MCHP_PIT64B_PRES_MAX; (*pres)++) { + tmp = clk_rate / (*pres + 1); + if (tmp <= max_rate) + break; + } + + /* Use the biggest prescaler if we didn't match one. */ + if (*pres == MCHP_PIT64B_PRES_MAX) + *pres = MCHP_PIT64B_PRES_MAX - 1; +} + +/** + * mchp_pit64b_init_mode - prepare PIT64B mode register value to be used at + * runtime; this includes prescaler and SGCLK bit + * + * PIT64B timer may be fed by gclk or pclk. When gclk is used its rate has to + * be at least 3 times lower than pclk's rate. pclk rate is fixed, gclk rate + * could be changed via clock APIs. The chosen clock (pclk or gclk) could be + * divided by the internal PIT64B's divider. 
+ * + * This function, first tries to use GCLK by requesting the desired rate from + * PMC and then using the internal PIT64B prescaler, if any, to reach the + * requested rate. If PCLK/GCLK < 3 (condition requested by PIT64B hardware) + * then the function falls back on using PCLK as clock source for PIT64B timer + * choosing the highest prescaler in case it doesn't locate one to match the + * requested frequency. + * + * Below is presented the PIT64B block in relation with PMC: + * + * PIT64B + * PMC +------------------------------------+ + * +----+ | +-----+ | + * | |-->gclk -->|-->| | +---------+ +-----+ | + * | | | | MUX |--->| Divider |->|timer| | + * | |-->pclk -->|-->| | +---------+ +-----+ | + * +----+ | +-----+ | + * | ^ | + * | sel | + * +------------------------------------+ + * + * Where: + * - gclk rate <= pclk rate/3 + * - gclk rate could be requested from PMC + * - pclk rate is fixed (cannot be requested from PMC) + */ +static int __init mchp_pit64b_init_mode(struct mchp_pit64b_timer *timer, + unsigned long max_rate) +{ + unsigned long pclk_rate, diff = 0, best_diff = ULONG_MAX; + long gclk_round = 0; + u32 pres, best_pres = 0; + + pclk_rate = clk_get_rate(timer->pclk); + if (!pclk_rate) + return -EINVAL; + + /* Try using GCLK. */ + gclk_round = clk_round_rate(timer->gclk, max_rate); + if (gclk_round < 0) + goto pclk; + + if (pclk_rate / gclk_round < 3) + goto pclk; + + mchp_pit64b_pres_compute(&pres, gclk_round, max_rate); + best_diff = abs(gclk_round / (pres + 1) - max_rate); + best_pres = pres; + + if (!best_diff) { + timer->mode |= MCHP_PIT64B_MR_SGCLK; + goto done; + } + +pclk: + /* Check if requested rate could be obtained using PCLK. */ + mchp_pit64b_pres_compute(&pres, pclk_rate, max_rate); + diff = abs(pclk_rate / (pres + 1) - max_rate); + + if (best_diff > diff) { + /* Use PCLK. */ + best_pres = pres; + } else { + /* Use GCLK. 
*/ + timer->mode |= MCHP_PIT64B_MR_SGCLK; + clk_set_rate(timer->gclk, gclk_round); + } + +done: + timer->mode |= MCHP_PIT64B_PRES_TO_MODE(best_pres); + + pr_info("PIT64B: using clk=%s with prescaler %u, freq=%lu [Hz]\n", + timer->mode & MCHP_PIT64B_MR_SGCLK ? "gclk" : "pclk", best_pres, + timer->mode & MCHP_PIT64B_MR_SGCLK ? + gclk_round / (best_pres + 1) : pclk_rate / (best_pres + 1)); + + return 0; +} + +static int __init mchp_pit64b_init_clksrc(struct mchp_pit64b_timer *timer, + u32 clk_rate) +{ + int ret; + + mchp_pit64b_reset(timer, ULLONG_MAX, MCHP_PIT64B_MR_CONT, 0); + + mchp_pit64b_cs_base = timer->base; + + ret = clocksource_mmio_init(timer->base, MCHP_PIT64B_NAME, clk_rate, + 210, 64, mchp_pit64b_clksrc_read); + if (ret) { + pr_debug("clksrc: Failed to register PIT64B clocksource!\n"); + + /* Stop timer. */ + writel_relaxed(MCHP_PIT64B_CR_SWRST, + timer->base + MCHP_PIT64B_CR); + + return ret; + } + + sched_clock_register(mchp_pit64b_sched_read_clk, 64, clk_rate); + + return 0; +} + +static int __init mchp_pit64b_init_clkevt(struct mchp_pit64b_timer *timer, + u32 clk_rate, u32 irq) +{ + struct mchp_pit64b_clkevt *ce; + int ret; + + ce = kzalloc(sizeof(*ce), GFP_KERNEL); + if (!ce) + return -ENOMEM; + + mchp_pit64b_ce_cycles = DIV_ROUND_CLOSEST(clk_rate, HZ); + + ce->timer.base = timer->base; + ce->timer.pclk = timer->pclk; + ce->timer.gclk = timer->gclk; + ce->timer.mode = timer->mode; + ce->clkevt.name = MCHP_PIT64B_NAME; + ce->clkevt.features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_PERIODIC; + ce->clkevt.rating = 150; + ce->clkevt.set_state_shutdown = mchp_pit64b_clkevt_shutdown; + ce->clkevt.set_state_periodic = mchp_pit64b_clkevt_set_periodic; + ce->clkevt.set_next_event = mchp_pit64b_clkevt_set_next_event; + ce->clkevt.suspend = mchp_pit64b_clkevt_suspend; + ce->clkevt.resume = mchp_pit64b_clkevt_resume; + ce->clkevt.cpumask = cpumask_of(0); + ce->clkevt.irq = irq; + + ret = request_irq(irq, mchp_pit64b_interrupt, IRQF_TIMER, + "pit64b_tick", ce); 
+ if (ret) { + pr_debug("clkevt: Failed to setup PIT64B IRQ\n"); + kfree(ce); + return ret; + } + + clockevents_config_and_register(&ce->clkevt, clk_rate, 1, ULONG_MAX); + + return 0; +} + +static int __init mchp_pit64b_dt_init_timer(struct device_node *node, + bool clkevt) +{ + u32 freq = clkevt ? MCHP_PIT64B_DEF_CE_FREQ : MCHP_PIT64B_DEF_CS_FREQ; + struct mchp_pit64b_timer timer = { 0 }; + unsigned long clk_rate; + u32 irq = 0; + int ret; + + /* Parse DT node. */ + timer.pclk = of_clk_get_by_name(node, "pclk"); + if (IS_ERR(timer.pclk)) + return PTR_ERR(timer.pclk); + + timer.gclk = of_clk_get_by_name(node, "gclk"); + if (IS_ERR(timer.gclk)) + return PTR_ERR(timer.gclk); + + timer.base = of_iomap(node, 0); + if (!timer.base) + return -ENXIO; + + if (clkevt) { + irq = irq_of_parse_and_map(node, 0); + if (!irq) { + ret = -ENODEV; + goto io_unmap; + } + } + + /* Initialize mode (prescaler + SGCK bit). To be used at runtime. */ + ret = mchp_pit64b_init_mode(&timer, freq); + if (ret) + goto irq_unmap; + + ret = clk_prepare_enable(timer.pclk); + if (ret) + goto irq_unmap; + + if (timer.mode & MCHP_PIT64B_MR_SGCLK) { + ret = clk_prepare_enable(timer.gclk); + if (ret) + goto pclk_unprepare; + + clk_rate = clk_get_rate(timer.gclk); + } else { + clk_rate = clk_get_rate(timer.pclk); + } + clk_rate = clk_rate / (MCHP_PIT64B_MODE_TO_PRES(timer.mode) + 1); + + if (clkevt) + ret = mchp_pit64b_init_clkevt(&timer, clk_rate, irq); + else + ret = mchp_pit64b_init_clksrc(&timer, clk_rate); + + if (ret) + goto gclk_unprepare; + + return 0; + +gclk_unprepare: + if (timer.mode & MCHP_PIT64B_MR_SGCLK) + clk_disable_unprepare(timer.gclk); +pclk_unprepare: + clk_disable_unprepare(timer.pclk); +irq_unmap: + irq_dispose_mapping(irq); +io_unmap: + iounmap(timer.base); + + return ret; +} + +static int __init mchp_pit64b_dt_init(struct device_node *node) +{ + static int inits; + + switch (inits++) { + case 0: + /* 1st request, register clockevent. 
*/ + return mchp_pit64b_dt_init_timer(node, true); + case 1: + /* 2nd request, register clocksource. */ + return mchp_pit64b_dt_init_timer(node, false); + } + + /* The rest, don't care. */ + return -EINVAL; +} + +TIMER_OF_DECLARE(mchp_pit64b, "microchip,sam9x60-pit64b", mchp_pit64b_dt_init); -- cgit From f5ac896b6a23eb46681cdbef440c1d991b04e519 Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Thu, 7 Nov 2019 02:36:28 -0800 Subject: clocksource/drivers/cadence-ttc: Use ttc driver as platform driver Currently TTC driver is TIMER_OF_DECLARE type driver. Because of that, TTC driver may be initialized before other clock drivers. If TTC driver is dependent on that clock driver then initialization of TTC driver will fail. So use TTC driver as platform driver instead of using TIMER_OF_DECLARE. Signed-off-by: Rajan Vaja Tested-by: Michal Simek Acked-by: Michal Simek Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/1573122988-18399-1-git-send-email-rajan.vaja@xilinx.com --- drivers/clocksource/timer-cadence-ttc.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/clocksource/timer-cadence-ttc.c b/drivers/clocksource/timer-cadence-ttc.c index 88fe2e9ba9a3..38858e141731 100644 --- a/drivers/clocksource/timer-cadence-ttc.c +++ b/drivers/clocksource/timer-cadence-ttc.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include /* * This driver configures the 2 16/32-bit count-up timers as follows: @@ -464,13 +466,7 @@ static int __init ttc_setup_clockevent(struct clk *clk, return 0; } -/** - * ttc_timer_init - Initialize the timer - * - * Initializes the timer hardware and register the clock source and clock event - * timers with Linux kernal timer framework - */ -static int __init ttc_timer_init(struct device_node *timer) +static int __init ttc_timer_probe(struct platform_device *pdev) { unsigned int irq; void __iomem *timer_baseaddr; @@ -478,6 +474,7 @@ static int __init ttc_timer_init(struct device_node 
*timer) static int initialized; int clksel, ret; u32 timer_width = 16; + struct device_node *timer = pdev->dev.of_node; if (initialized) return 0; @@ -532,4 +529,17 @@ static int __init ttc_timer_init(struct device_node *timer) return 0; } -TIMER_OF_DECLARE(ttc, "cdns,ttc", ttc_timer_init); +static const struct of_device_id ttc_timer_of_match[] = { + {.compatible = "cdns,ttc"}, + {}, +}; + +MODULE_DEVICE_TABLE(of, ttc_timer_of_match); + +static struct platform_driver ttc_timer_driver = { + .driver = { + .name = "cdns_ttc_timer", + .of_match_table = ttc_timer_of_match, + }, +}; +builtin_platform_driver_probe(ttc_timer_driver, ttc_timer_probe); -- cgit From 2052d032c06761330bca4944bb7858b00960e868 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 19 Dec 2019 21:32:46 +0000 Subject: clocksource/drivers/bcm2835_timer: Fix memory leak of timer Currently when setup_irq fails the error exit path will leak the recently allocated timer structure. Originally the code would throw a panic but a later commit changed the behaviour to return via the err_iounmap path and hence we now have a memory leak. Fix this by adding a err_timer_free error path that kfree's timer. 
Addresses-Coverity: ("Resource Leak") Fixes: 524a7f08983d ("clocksource/drivers/bcm2835_timer: Convert init function to return error") Signed-off-by: Colin Ian King Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20191219213246.34437-1-colin.king@canonical.com --- drivers/clocksource/bcm2835_timer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/clocksource/bcm2835_timer.c b/drivers/clocksource/bcm2835_timer.c index 2b196cbfadb6..b235f446ee50 100644 --- a/drivers/clocksource/bcm2835_timer.c +++ b/drivers/clocksource/bcm2835_timer.c @@ -121,7 +121,7 @@ static int __init bcm2835_timer_init(struct device_node *node) ret = setup_irq(irq, &timer->act); if (ret) { pr_err("Can't set up timer IRQ\n"); - goto err_iounmap; + goto err_timer_free; } clockevents_config_and_register(&timer->evt, freq, 0xf, 0xffffffff); @@ -130,6 +130,9 @@ static int __init bcm2835_timer_init(struct device_node *node) return 0; +err_timer_free: + kfree(timer); + err_iounmap: iounmap(base); return ret; -- cgit From 9a97b2fb070d497c683aed9fb86b7ec5245cea86 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 21 Dec 2019 17:30:24 +0000 Subject: clocksource/drivers/em_sti: Convert to devm_platform_ioremap_resource Use devm_platform_ioremap_resource() to simplify code, which wraps 'platform_get_resource' and 'devm_ioremap_resource' in a single helper. 
Signed-off-by: Yangtao Li Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20191221173027.30716-2-tiny.windzz@gmail.com --- drivers/clocksource/em_sti.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/clocksource/em_sti.c b/drivers/clocksource/em_sti.c index 9039df4f90e2..086fd5d80b99 100644 --- a/drivers/clocksource/em_sti.c +++ b/drivers/clocksource/em_sti.c @@ -279,7 +279,6 @@ static void em_sti_register_clockevent(struct em_sti_priv *p) static int em_sti_probe(struct platform_device *pdev) { struct em_sti_priv *p; - struct resource *res; int irq; int ret; @@ -295,8 +294,7 @@ static int em_sti_probe(struct platform_device *pdev) return irq; /* map memory, let base point to the STI instance */ - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - p->base = devm_ioremap_resource(&pdev->dev, res); + p->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(p->base)) return PTR_ERR(p->base); -- cgit From ba25322edd600300e55cd58eb7fbdf9cbdc5a82d Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 21 Dec 2019 17:30:25 +0000 Subject: clocksource/drivers/em_sti: Fix variable declaration in em_sti_probe 'irq' and 'ret' are variables of the same type and there is no need to use two lines. 
Signed-off-by: Yangtao Li Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20191221173027.30716-3-tiny.windzz@gmail.com --- drivers/clocksource/em_sti.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/clocksource/em_sti.c b/drivers/clocksource/em_sti.c index 086fd5d80b99..ab190dffb1ed 100644 --- a/drivers/clocksource/em_sti.c +++ b/drivers/clocksource/em_sti.c @@ -279,8 +279,7 @@ static void em_sti_register_clockevent(struct em_sti_priv *p) static int em_sti_probe(struct platform_device *pdev) { struct em_sti_priv *p; - int irq; - int ret; + int irq, ret; p = devm_kzalloc(&pdev->dev, sizeof(*p), GFP_KERNEL); if (p == NULL) -- cgit From cdab83f9d0fb13926f6633f20c3327545fd6f70f Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 21 Dec 2019 17:30:26 +0000 Subject: clocksource/drivers/timer-ti-dm: Convert to devm_platform_ioremap_resource Use devm_platform_ioremap_resource() to simplify code, which wraps 'platform_get_resource' and 'devm_ioremap_resource' in a single helper. 
Signed-off-by: Yangtao Li Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20191221173027.30716-4-tiny.windzz@gmail.com --- drivers/clocksource/timer-ti-dm.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c index 5394d9dbdfbc..aa2ede266edf 100644 --- a/drivers/clocksource/timer-ti-dm.c +++ b/drivers/clocksource/timer-ti-dm.c @@ -780,7 +780,7 @@ static int omap_dm_timer_probe(struct platform_device *pdev) { unsigned long flags; struct omap_dm_timer *timer; - struct resource *mem, *irq; + struct resource *irq; struct device *dev = &pdev->dev; const struct dmtimer_platform_data *pdata; int ret; @@ -802,18 +802,12 @@ static int omap_dm_timer_probe(struct platform_device *pdev) return -ENODEV; } - mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (unlikely(!mem)) { - dev_err(dev, "%s: no memory resource.\n", __func__); - return -ENODEV; - } - timer = devm_kzalloc(dev, sizeof(*timer), GFP_KERNEL); if (!timer) return -ENOMEM; timer->fclk = ERR_PTR(-ENODEV); - timer->io_base = devm_ioremap_resource(dev, mem); + timer->io_base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(timer->io_base)) return PTR_ERR(timer->io_base); -- cgit From bc83caddf17bd592cc19887e252c4ba416484d79 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 21 Dec 2019 17:30:27 +0000 Subject: clocksource/drivers/timer-ti-dm: Switch to platform_get_irq platform_get_resource(pdev, IORESOURCE_IRQ) is not recommended for requesting IRQ's resources, as they can be not ready yet. Using platform_get_irq() instead is preferred for getting IRQ even if it was not retrieved earlier. 
Signed-off-by: Yangtao Li Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20191221173027.30716-4-tiny.windzz@gmail.com --- drivers/clocksource/timer-ti-dm.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c index 5394d9dbdfbc..aa2ede266edf 100644 --- a/drivers/clocksource/timer-ti-dm.c +++ b/drivers/clocksource/timer-ti-dm.c @@ -780,7 +780,7 @@ static int omap_dm_timer_probe(struct platform_device *pdev) { unsigned long flags; struct omap_dm_timer *timer; - struct resource *mem, *irq; + struct resource *irq; struct device *dev = &pdev->dev; const struct dmtimer_platform_data *pdata; int ret; @@ -802,18 +802,12 @@ static int omap_dm_timer_probe(struct platform_device *pdev) return -ENODEV; } - mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (unlikely(!mem)) { - dev_err(dev, "%s: no memory resource.\n", __func__); - return -ENODEV; - } - timer = devm_kzalloc(dev, sizeof(*timer), GFP_KERNEL); if (!timer) return -ENOMEM; timer->fclk = ERR_PTR(-ENODEV); - timer->io_base = devm_ioremap_resource(dev, mem); + timer->io_base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(timer->io_base)) return PTR_ERR(timer->io_base); -- cgit From bc83caddf17bd592cc19887e252c4ba416484d79 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 21 Dec 2019 17:30:27 +0000 Subject: clocksource/drivers/timer-ti-dm: Switch to platform_get_irq platform_get_resource(pdev, IORESOURCE_IRQ) is not recommended for requesting IRQ resources, as they may not be ready yet. Using platform_get_irq() instead is preferred for getting IRQ even if it was not retrieved earlier. 
(omap_dm_timer_probe) from [] (platform_drv_probe+0x48/0x98) (platform_drv_probe) from [] (really_probe+0x1dc/0x348) (really_probe) from [] (driver_probe_device+0x5c/0x160) Let's fix the issue by moving platform_get_irq to happen after timer has been allocated. Fixes: bc83caddf17b ("clocksource/drivers/timer-ti-dm: Switch to platform_get_irq") Cc: Yangtao Li Signed-off-by: Tony Lindgren Acked-by: Olof Johansson Acked-by: Yangtao Li Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20200106203700.21009-1-tony@atomide.com --- drivers/clocksource/timer-ti-dm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c index bd16efb2740b..269a994d6a99 100644 --- a/drivers/clocksource/timer-ti-dm.c +++ b/drivers/clocksource/timer-ti-dm.c @@ -795,14 +795,14 @@ static int omap_dm_timer_probe(struct platform_device *pdev) return -ENODEV; } - timer->irq = platform_get_irq(pdev, 0); - if (timer->irq < 0) - return timer->irq; - timer = devm_kzalloc(dev, sizeof(*timer), GFP_KERNEL); if (!timer) return -ENOMEM; + timer->irq = platform_get_irq(pdev, 0); + if (timer->irq < 0) + return timer->irq; + timer->fclk = ERR_PTR(-ENODEV); timer->io_base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(timer->io_base)) -- cgit From 4ad35346da9cb99c02a4c5e99633c36f8f344be0 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 4 Jan 2020 16:20:58 +0100 Subject: clocksource/drivers/exynos_mct: Rename Exynos to lowercase Fix up inconsistent usage of upper and lowercase letters in "Exynos" name. "EXYNOS" is not an abbreviation but a regular trademarked name. Therefore it should be written with lowercase letters starting with capital letter. The lowercase "Exynos" name is promoted by its manufacturer Samsung Electronics Co., Ltd., in advertisement materials and on website. 
Signed-off-by: Krzysztof Kozlowski Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20200104152107.11407-12-krzk@kernel.org --- drivers/clocksource/exynos_mct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/exynos_mct.c b/drivers/clocksource/exynos_mct.c index 74cb299f5089..a267fe31ef13 100644 --- a/drivers/clocksource/exynos_mct.c +++ b/drivers/clocksource/exynos_mct.c @@ -4,7 +4,7 @@ * Copyright (c) 2011 Samsung Electronics Co., Ltd. * http://www.samsung.com * - * EXYNOS4 MCT(Multi-Core Timer) support + * Exynos4 MCT(Multi-Core Timer) support */ #include -- cgit From b9c60a741f06eda56d12c7216accb317b74266b4 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Mon, 6 Jan 2020 11:58:08 +0200 Subject: clocksource/drivers/timer-microchip-pit64b: Fix sparse warning Fix sparse warning: "warning: Using plain integer as NULL pointer" Reported-by: kbuild test robot Signed-off-by: Claudiu Beznea Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/1578304688-14882-1-git-send-email-claudiu.beznea@microchip.com --- drivers/clocksource/timer-microchip-pit64b.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/clocksource/timer-microchip-pit64b.c b/drivers/clocksource/timer-microchip-pit64b.c index 27a389a7e078..bd63d3484838 100644 --- a/drivers/clocksource/timer-microchip-pit64b.c +++ b/drivers/clocksource/timer-microchip-pit64b.c @@ -248,6 +248,8 @@ static int __init mchp_pit64b_init_mode(struct mchp_pit64b_timer *timer, if (!pclk_rate) return -EINVAL; + timer->mode = 0; + /* Try using GCLK. */ gclk_round = clk_round_rate(timer->gclk, max_rate); if (gclk_round < 0) @@ -360,7 +362,7 @@ static int __init mchp_pit64b_dt_init_timer(struct device_node *node, bool clkevt) { u32 freq = clkevt ? 
MCHP_PIT64B_DEF_CE_FREQ : MCHP_PIT64B_DEF_CS_FREQ; - struct mchp_pit64b_timer timer = { 0 }; + struct mchp_pit64b_timer timer; unsigned long clk_rate; u32 irq = 0; int ret; -- cgit From 0af3e137c144377fbaf5025ba784ff5ba7ad40c9 Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Thu, 9 Jan 2020 17:06:49 +0100 Subject: clocksource/drivers/hyper-v: Untangle stimers and timesync from clocksources hyperv_timer.c exports hyperv_cs, which is used by stimers and the timesync mechanism. However, the clocksource dependency is not needed: these mechanisms only depend on the partition reference counter (which can be read via a MSR or via the TSC Reference Page). Introduce the (function) pointer hv_read_reference_counter, as an embodiment of the partition reference counter read, and export it in place of the hyperv_cs pointer. The latter can be removed. This should clarify that there's no relationship between Hyper-V stimers & timesync and the Linux clocksource abstractions. No functional or semantic change. Suggested-by: Michael Kelley Signed-off-by: Andrea Parri Reviewed-by: Michael Kelley Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20200109160650.16150-2-parri.andrea@gmail.com --- drivers/clocksource/hyperv_timer.c | 36 +++++++++++++++++++++++------------- drivers/hv/hv_util.c | 8 ++++---- include/clocksource/hyperv_timer.h | 2 +- 3 files changed, 28 insertions(+), 18 deletions(-) diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c index 12d75b50a317..42748adccc98 100644 --- a/drivers/clocksource/hyperv_timer.c +++ b/drivers/clocksource/hyperv_timer.c @@ -66,7 +66,7 @@ static int hv_ce_set_next_event(unsigned long delta, { u64 current_tick; - current_tick = hyperv_cs->read(NULL); + current_tick = hv_read_reference_counter(); current_tick += delta; hv_init_timer(0, current_tick); return 0; @@ -304,8 +304,8 @@ EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup); * Hyper-V and 32-bit x86. The TSC reference page version is preferred. 
*/ -struct clocksource *hyperv_cs; -EXPORT_SYMBOL_GPL(hyperv_cs); +u64 (*hv_read_reference_counter)(void); +EXPORT_SYMBOL_GPL(hv_read_reference_counter); static union { struct ms_hyperv_tsc_page page; @@ -318,7 +318,7 @@ struct ms_hyperv_tsc_page *hv_get_tsc_page(void) } EXPORT_SYMBOL_GPL(hv_get_tsc_page); -static u64 notrace read_hv_clock_tsc(struct clocksource *arg) +static u64 notrace read_hv_clock_tsc(void) { u64 current_tick = hv_read_tsc_page(hv_get_tsc_page()); @@ -328,9 +328,14 @@ static u64 notrace read_hv_clock_tsc(struct clocksource *arg) return current_tick; } +static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg) +{ + return read_hv_clock_tsc(); +} + static u64 read_hv_sched_clock_tsc(void) { - return read_hv_clock_tsc(NULL) - hv_sched_clock_offset; + return read_hv_clock_tsc() - hv_sched_clock_offset; } static void suspend_hv_clock_tsc(struct clocksource *arg) @@ -359,14 +364,14 @@ static void resume_hv_clock_tsc(struct clocksource *arg) static struct clocksource hyperv_cs_tsc = { .name = "hyperv_clocksource_tsc_page", .rating = 400, - .read = read_hv_clock_tsc, + .read = read_hv_clock_tsc_cs, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, .suspend= suspend_hv_clock_tsc, .resume = resume_hv_clock_tsc, }; -static u64 notrace read_hv_clock_msr(struct clocksource *arg) +static u64 notrace read_hv_clock_msr(void) { u64 current_tick; /* @@ -378,15 +383,20 @@ static u64 notrace read_hv_clock_msr(struct clocksource *arg) return current_tick; } +static u64 notrace read_hv_clock_msr_cs(struct clocksource *arg) +{ + return read_hv_clock_msr(); +} + static u64 read_hv_sched_clock_msr(void) { - return read_hv_clock_msr(NULL) - hv_sched_clock_offset; + return read_hv_clock_msr() - hv_sched_clock_offset; } static struct clocksource hyperv_cs_msr = { .name = "hyperv_clocksource_msr", .rating = 400, - .read = read_hv_clock_msr, + .read = read_hv_clock_msr_cs, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ 
-399,7 +409,7 @@ static bool __init hv_init_tsc_clocksource(void) if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) return false; - hyperv_cs = &hyperv_cs_tsc; + hv_read_reference_counter = read_hv_clock_tsc; phys_addr = virt_to_phys(hv_get_tsc_page()); /* @@ -417,7 +427,7 @@ static bool __init hv_init_tsc_clocksource(void) hv_set_clocksource_vdso(hyperv_cs_tsc); clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100); - hv_sched_clock_offset = hyperv_cs->read(hyperv_cs); + hv_sched_clock_offset = hv_read_reference_counter(); hv_setup_sched_clock(read_hv_sched_clock_tsc); return true; @@ -439,10 +449,10 @@ void __init hv_init_clocksource(void) if (!(ms_hyperv.features & HV_MSR_TIME_REF_COUNT_AVAILABLE)) return; - hyperv_cs = &hyperv_cs_msr; + hv_read_reference_counter = read_hv_clock_msr; clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); - hv_sched_clock_offset = hyperv_cs->read(hyperv_cs); + hv_sched_clock_offset = hv_read_reference_counter(); hv_setup_sched_clock(read_hv_sched_clock_msr); } EXPORT_SYMBOL_GPL(hv_init_clocksource); diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c index 766bd8457346..296f9098c9e4 100644 --- a/drivers/hv/hv_util.c +++ b/drivers/hv/hv_util.c @@ -211,7 +211,7 @@ static struct timespec64 hv_get_adj_host_time(void) unsigned long flags; spin_lock_irqsave(&host_ts.lock, flags); - reftime = hyperv_cs->read(hyperv_cs); + reftime = hv_read_reference_counter(); newtime = host_ts.host_time + (reftime - host_ts.ref_time); ts = ns_to_timespec64((newtime - WLTIMEDELTA) * 100); spin_unlock_irqrestore(&host_ts.lock, flags); @@ -250,7 +250,7 @@ static inline void adj_guesttime(u64 hosttime, u64 reftime, u8 adj_flags) */ spin_lock_irqsave(&host_ts.lock, flags); - cur_reftime = hyperv_cs->read(hyperv_cs); + cur_reftime = hv_read_reference_counter(); host_ts.host_time = hosttime; host_ts.ref_time = cur_reftime; @@ -315,7 +315,7 @@ static void timesync_onchannelcallback(void *context) sizeof(struct vmbuspipe_hdr) + 
sizeof(struct icmsg_hdr)]; adj_guesttime(timedatap->parenttime, - hyperv_cs->read(hyperv_cs), + hv_read_reference_counter(), timedatap->flags); } } @@ -524,7 +524,7 @@ static struct ptp_clock *hv_ptp_clock; static int hv_timesync_init(struct hv_util_service *srv) { /* TimeSync requires Hyper-V clocksource. */ - if (!hyperv_cs) + if (!hv_read_reference_counter) return -ENODEV; spin_lock_init(&host_ts.lock); diff --git a/include/clocksource/hyperv_timer.h b/include/clocksource/hyperv_timer.h index 553e539469f0..34eef083c988 100644 --- a/include/clocksource/hyperv_timer.h +++ b/include/clocksource/hyperv_timer.h @@ -30,7 +30,7 @@ extern void hv_stimer_global_cleanup(void); extern void hv_stimer0_isr(void); #ifdef CONFIG_HYPERV_TIMER -extern struct clocksource *hyperv_cs; +extern u64 (*hv_read_reference_counter)(void); extern void hv_init_clocksource(void); extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void); -- cgit From 9e0333ae38eeb42249e10f95d209244a6e22ac9f Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Thu, 9 Jan 2020 17:06:50 +0100 Subject: clocksource/drivers/hyper-v: Set TSC clocksource as default w/ InvariantTSC Change the Hyper-V clocksource ratings to 250, below the TSC clocksource rating of 300. In configurations where Hyper-V offers an InvariantTSC, the TSC is not marked "unstable", so the TSC clocksource is available and preferred. With the higher rating, it will be the default. On older hardware and Hyper-V versions, the TSC is marked "unstable", so no TSC clocksource is created and the selected Hyper-V clocksource will be the default. 
Signed-off-by: Andrea Parri Reviewed-by: Michael Kelley Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20200109160650.16150-3-parri.andrea@gmail.com --- drivers/clocksource/hyperv_timer.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c index 42748adccc98..9d808d595ca8 100644 --- a/drivers/clocksource/hyperv_timer.c +++ b/drivers/clocksource/hyperv_timer.c @@ -302,6 +302,14 @@ EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup); * the other that uses the TSC reference page feature as defined in the * TLFS. The MSR version is for compatibility with old versions of * Hyper-V and 32-bit x86. The TSC reference page version is preferred. + * + * The Hyper-V clocksource ratings of 250 are chosen to be below the + * TSC clocksource rating of 300. In configurations where Hyper-V offers + * an InvariantTSC, the TSC is not marked "unstable", so the TSC clocksource + * is available and preferred. With the higher rating, it will be the + * default. On older hardware and Hyper-V versions, the TSC is marked + * "unstable", so no TSC clocksource is created and the selected Hyper-V + * clocksource will be the default. 
*/ u64 (*hv_read_reference_counter)(void); @@ -363,7 +371,7 @@ static void resume_hv_clock_tsc(struct clocksource *arg) static struct clocksource hyperv_cs_tsc = { .name = "hyperv_clocksource_tsc_page", - .rating = 400, + .rating = 250, .read = read_hv_clock_tsc_cs, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, @@ -395,7 +403,7 @@ static u64 read_hv_sched_clock_msr(void) static struct clocksource hyperv_cs_msr = { .name = "hyperv_clocksource_msr", - .rating = 400, + .rating = 250, .read = read_hv_clock_msr_cs, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, -- cgit From 99570c3da96a0f7aa11c6ad4981776f3adabf3b5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Jan 2020 20:43:41 +0100 Subject: MIPS: vdso: Define BUILD_VDSO32 when building a 32bit kernel The confinement of the 32bit specific VDSO functions failed to define BUILD_VDSO32 when building a 32bit MIPS kernel: arch/mips/vdso/vgettimeofday.c: In function __vdso_clock_gettime: arch/mips/vdso/vgettimeofday.c:17:9: error: implicit declaration of function __cvdso_clock_gettime32 arch/mips/vdso/vgettimeofday.c: In function __vdso_clock_getres: arch/mips/vdso/vgettimeofday.c:39:9: error: implicit declaration of function __cvdso_clock_getres_time32 Force the define for 32bit builds in the VDSO Makefile. 
Fixes: bf279849ad59 ("lib/vdso: Build 32 bit specific functions in the right context") Reported-by: kbuild test robot Signed-off-by: Thomas Gleixner Acked-by: Paul Burton Link: https://lore.kernel.org/r/87d0bjfaqa.fsf@nanos.tec.linutronix.de --- arch/mips/vdso/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/mips/vdso/Makefile b/arch/mips/vdso/Makefile index e05938997e69..b2a2e032dc99 100644 --- a/arch/mips/vdso/Makefile +++ b/arch/mips/vdso/Makefile @@ -18,6 +18,10 @@ ccflags-vdso := \ $(filter -mno-loongson-%,$(KBUILD_CFLAGS)) \ -D__VDSO__ +ifndef CONFIG_64BIT +ccflags-vdso += -DBUILD_VDSO32 +endif + ifdef CONFIG_CC_IS_CLANG ccflags-vdso += $(filter --target=%,$(KBUILD_CFLAGS)) endif -- cgit From 49a101d7169c7729c7bab6b2f896faae34bd6c3d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 16 Jan 2020 17:58:27 +0000 Subject: lib/vdso: Only read hrtimer_res when needed in __cvdso_clock_getres() Only perform READ_ONCE(vd[CS_HRES_COARSE].hrtimer_res) for HRES and RAW clocks. 
Signed-off-by: Christophe Leroy Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/7ac2f0d21652f95e2bbdfa6bd514ae6c7caf53ab.1579196675.git.christophe.leroy@c-s.fr --- lib/vdso/gettimeofday.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index f342ac1fce77..f8b8ec5e63ac 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -312,7 +312,6 @@ static __maybe_unused int __cvdso_clock_getres_common(clockid_t clock, struct __kernel_timespec *res) { const struct vdso_data *vd = __arch_get_vdso_data(); - u64 hrtimer_res; u32 msk; u64 ns; @@ -323,7 +322,6 @@ int __cvdso_clock_getres_common(clockid_t clock, struct __kernel_timespec *res) if (IS_ENABLED(CONFIG_TIME_NS) && vd->clock_mode == VCLOCK_TIMENS) vd = __arch_get_timens_vdso_data(); - hrtimer_res = READ_ONCE(vd[CS_HRES_COARSE].hrtimer_res); /* * Convert the clockid to a bitmask and use it to check which * clocks are handled in the VDSO directly. @@ -333,7 +331,7 @@ int __cvdso_clock_getres_common(clockid_t clock, struct __kernel_timespec *res) /* * Preserves the behaviour of posix_get_hrtimer_res(). */ - ns = hrtimer_res; + ns = READ_ONCE(vd[CS_HRES_COARSE].hrtimer_res); } else if (msk & VDSO_COARSE) { /* * Preserves the behaviour of posix_get_coarse_res(). -- cgit From eb5a4d0a9ee976008d1add75e3d64545399e80a3 Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Mon, 20 Jan 2020 22:43:47 +0000 Subject: hrtimer: Add missing sparse annotation for __run_hrtimer() Sparse reports a warning at __run_hrtimer() |warning: context imbalance in __run_hrtimer() - unexpected unlock Add the missing __must_hold() annotation. 
Signed-off-by: Jules Irenge Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200120224347.51843-1-jbi.octave@gmail.com --- kernel/time/hrtimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d8b62f93fc8d..3a609e7344f3 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1477,7 +1477,7 @@ EXPORT_SYMBOL_GPL(hrtimer_active); static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base, struct hrtimer *timer, ktime_t *now, - unsigned long flags) + unsigned long flags) __must_hold(&cpu_base->lock) { enum hrtimer_restart (*fn)(struct hrtimer *); int restart; -- cgit From 6b088cefbeaa87ba48bf838edfc1e19c9ff3976b Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 23 Jan 2020 21:58:49 -0800 Subject: alarmtimer: Update alarmtimer_get_rtcdev() docs to reflect reality This function doesn't do anything like this comment says when an RTC device hasn't been chosen. It looks like we used to do something like that before commit 8bc0dafb5cf3 ("alarmtimers: Rework RTC device selection using class interface") but that's long gone now. Remove this sentence to avoid confusing the reader. Signed-off-by: Stephen Boyd Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200124055849.154411-5-swboyd@chromium.org --- kernel/time/alarmtimer.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 9dc7a0913190..564ff5df2b0b 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -69,8 +69,6 @@ static DEFINE_SPINLOCK(rtcdev_lock); * alarmtimer_get_rtcdev - Return selected rtcdevice * * This function returns the rtc device to use for wakealarms. - * If one has not already been chosen, it checks to see if a - * functional rtc device is available. 
*/ struct rtc_device *alarmtimer_get_rtcdev(void) { -- cgit From c79108bd19a8490315847e0c95ac6526fcd8e770 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 23 Jan 2020 21:58:46 -0800 Subject: alarmtimer: Make alarmtimer platform device child of RTC device The alarmtimer_suspend() function will fail if an RTC device is on a bus such as SPI or i2c and that RTC device registers and probes after alarmtimer_init() registers and probes the 'alarmtimer' platform device. This is because system wide suspend suspends devices in the reverse order of their probe. When alarmtimer_suspend() attempts to program the RTC for a wakeup it will try to program an RTC device on a bus that has already been suspended. Move the alarmtimer device registration to happen when the RTC which is used for wakeup is registered. Register the 'alarmtimer' platform device as a child of the RTC device too, so that it can be guaranteed that the RTC device won't be suspended when alarmtimer_suspend() is called. Reported-by: Douglas Anderson Signed-off-by: Stephen Boyd Signed-off-by: Thomas Gleixner Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20200124055849.154411-2-swboyd@chromium.org --- kernel/time/alarmtimer.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 564ff5df2b0b..f0469ccc84ee 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -89,6 +89,7 @@ static int alarmtimer_rtc_add_device(struct device *dev, unsigned long flags; struct rtc_device *rtc = to_rtc_device(dev); struct wakeup_source *__ws; + struct platform_device *pdev; int ret = 0; if (rtcdev) @@ -100,9 +101,11 @@ static int alarmtimer_rtc_add_device(struct device *dev, return -1; __ws = wakeup_source_register(dev, "alarmtimer"); + pdev = platform_device_register_data(dev, "alarmtimer", + PLATFORM_DEVID_AUTO, NULL, 0); spin_lock_irqsave(&rtcdev_lock, flags); - if (!rtcdev) { + if (__ws && !IS_ERR(pdev) 
&& !rtcdev) { if (!try_module_get(rtc->owner)) { ret = -1; goto unlock; @@ -113,10 +116,14 @@ static int alarmtimer_rtc_add_device(struct device *dev, get_device(dev); ws = __ws; __ws = NULL; + pdev = NULL; + } else { + ret = -1; } unlock: spin_unlock_irqrestore(&rtcdev_lock, flags); + platform_device_unregister(pdev); wakeup_source_unregister(__ws); return ret; @@ -903,8 +910,7 @@ static void get_boottime_timespec(struct timespec64 *tp) */ static int __init alarmtimer_init(void) { - struct platform_device *pdev; - int error = 0; + int error; int i; alarmtimer_rtc_timer_init(); @@ -929,15 +935,7 @@ static int __init alarmtimer_init(void) if (error) goto out_if; - pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0); - if (IS_ERR(pdev)) { - error = PTR_ERR(pdev); - goto out_drv; - } return 0; - -out_drv: - platform_driver_unregister(&alarmtimer_driver); out_if: alarmtimer_rtc_interface_remove(); return error; -- cgit From 7c94caca877b0feeca6f5f7b07d48c508e20d58f Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 23 Jan 2020 21:58:47 -0800 Subject: alarmtimer: Use wakeup source from alarmtimer platform device Use the wakeup source that can be associated with the 'alarmtimer' platform device instead of registering another one by hand. 
Signed-off-by: Stephen Boyd Signed-off-by: Thomas Gleixner Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20200124055849.154411-3-swboyd@chromium.org --- kernel/time/alarmtimer.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index f0469ccc84ee..685ff57a1d87 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -58,8 +58,6 @@ static DEFINE_SPINLOCK(freezer_delta_lock); #endif #ifdef CONFIG_RTC_CLASS -static struct wakeup_source *ws; - /* rtc timer and device for setting alarm wakeups at suspend */ static struct rtc_timer rtctimer; static struct rtc_device *rtcdev; @@ -88,7 +86,6 @@ static int alarmtimer_rtc_add_device(struct device *dev, { unsigned long flags; struct rtc_device *rtc = to_rtc_device(dev); - struct wakeup_source *__ws; struct platform_device *pdev; int ret = 0; @@ -100,12 +97,13 @@ static int alarmtimer_rtc_add_device(struct device *dev, if (!device_may_wakeup(rtc->dev.parent)) return -1; - __ws = wakeup_source_register(dev, "alarmtimer"); pdev = platform_device_register_data(dev, "alarmtimer", PLATFORM_DEVID_AUTO, NULL, 0); + if (!IS_ERR(pdev)) + device_init_wakeup(&pdev->dev, true); spin_lock_irqsave(&rtcdev_lock, flags); - if (__ws && !IS_ERR(pdev) && !rtcdev) { + if (!IS_ERR(pdev) && !rtcdev) { if (!try_module_get(rtc->owner)) { ret = -1; goto unlock; @@ -114,8 +112,6 @@ static int alarmtimer_rtc_add_device(struct device *dev, rtcdev = rtc; /* hold a reference so it doesn't go away */ get_device(dev); - ws = __ws; - __ws = NULL; pdev = NULL; } else { ret = -1; @@ -124,7 +120,6 @@ unlock: spin_unlock_irqrestore(&rtcdev_lock, flags); platform_device_unregister(pdev); - wakeup_source_unregister(__ws); return ret; } @@ -291,7 +286,7 @@ static int alarmtimer_suspend(struct device *dev) return 0; if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) { - __pm_wakeup_event(ws, 2 * MSEC_PER_SEC); + pm_wakeup_event(dev, 2 * MSEC_PER_SEC); return 
-EBUSY; } @@ -306,7 +301,7 @@ static int alarmtimer_suspend(struct device *dev) /* Set alarm, if in the past reject suspend briefly to handle */ ret = rtc_timer_start(rtc, &rtctimer, now, 0); if (ret < 0) - __pm_wakeup_event(ws, MSEC_PER_SEC); + pm_wakeup_event(dev, MSEC_PER_SEC); return ret; } -- cgit From fd928f3e32ba09381b287f8b732418434d932855 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 23 Jan 2020 21:58:48 -0800 Subject: alarmtimer: Make alarmtimer_get_rtcdev() a stub when CONFIG_RTC_CLASS=n The stubbed version of alarmtimer_get_rtcdev() is not exported, so this won't work if this function is used in a module when CONFIG_RTC_CLASS=n. Move the stub function to the header file and make it inline so that callers don't have to worry about linking against this symbol. rtcdev isn't used outside of this ifdef so it's not required to be redefined to NULL. Drop that while touching this area. Signed-off-by: Stephen Boyd Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200124055849.154411-4-swboyd@chromium.org --- include/linux/alarmtimer.h | 4 ++++ kernel/time/alarmtimer.c | 5 ----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h index 74748e306f4b..05e758b8b894 100644 --- a/include/linux/alarmtimer.h +++ b/include/linux/alarmtimer.h @@ -60,7 +60,11 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval); u64 alarm_forward_now(struct alarm *alarm, ktime_t interval); ktime_t alarm_expires_remaining(const struct alarm *alarm); +#ifdef CONFIG_RTC_CLASS /* Provide way to access the rtc device being used by alarmtimers */ struct rtc_device *alarmtimer_get_rtcdev(void); +#else +static inline struct rtc_device *alarmtimer_get_rtcdev(void) { return NULL; } +#endif #endif diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 685ff57a1d87..2ffb466af77e 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -143,11 +143,6 @@ static 
void alarmtimer_rtc_interface_remove(void) class_interface_unregister(&alarmtimer_rtc_interface); } #else -struct rtc_device *alarmtimer_get_rtcdev(void) -{ - return NULL; -} -#define rtcdev (NULL) static inline int alarmtimer_rtc_interface_setup(void) { return 0; } static inline void alarmtimer_rtc_interface_remove(void) { } static inline void alarmtimer_rtc_timer_init(void) { } -- cgit