diff options
Diffstat (limited to 'arch/x86/kernel/tsc.c')
| -rw-r--r-- | arch/x86/kernel/tsc.c | 746 |
1 files changed, 495 insertions, 251 deletions
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 796d96bb0821..7d3e13e14eab 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> @@ -10,10 +11,13 @@ #include <linux/cpufreq.h> #include <linux/delay.h> #include <linux/clocksource.h> +#include <linux/kvm_types.h> #include <linux/percpu.h> #include <linux/timex.h> #include <linux/static_key.h> +#include <linux/static_call.h> +#include <asm/cpuid/api.h> #include <asm/hpet.h> #include <asm/timer.h> #include <asm/vgtod.h> @@ -24,7 +28,12 @@ #include <asm/x86_init.h> #include <asm/geode.h> #include <asm/apic.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> +#include <asm/i8259.h> +#include <asm/msr.h> +#include <asm/topology.h> +#include <asm/uv/uv.h> +#include <asm/sev.h> unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -32,51 +41,61 @@ EXPORT_SYMBOL(cpu_khz); unsigned int __read_mostly tsc_khz; EXPORT_SYMBOL(tsc_khz); +#define KHZ 1000 + /* * TSC can be unstable due to cpufreq or due to unsynced TSCs */ static int __read_mostly tsc_unstable; +static unsigned int __initdata tsc_early_khz; -/* native_sched_clock() is called before tsc_init(), so - we must start with the TSC soft disabled to prevent - erroneous rdtsc usage on !boot_cpu_has(X86_FEATURE_TSC) processors */ -static int __read_mostly tsc_disabled = -1; - -static DEFINE_STATIC_KEY_FALSE(__use_tsc); +static DEFINE_STATIC_KEY_FALSE_RO(__use_tsc); int tsc_clocksource_reliable; -static u32 art_to_tsc_numerator; -static u32 art_to_tsc_denominator; -static u64 art_to_tsc_offset; -struct clocksource *art_related_clocksource; +static int __read_mostly tsc_force_recalibrate; + +static struct clocksource_base art_base_clk = { + .id = CSID_X86_ART, +}; +static bool have_art; struct cyc2ns { struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */ - seqcount_t seq; /* 32 + 4 = 36 */ + seqcount_latch_t seq; /* 32 + 4 = 36 */ }; /* fits one cacheline */ static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); -void cyc2ns_read_begin(struct cyc2ns_data *data) +static int __init tsc_early_khz_setup(char *buf) { - int seq, idx; + return kstrtouint(buf, 0, &tsc_early_khz); +} +early_param("tsc_early_khz", tsc_early_khz_setup); - preempt_disable_notrace(); +__always_inline void __cyc2ns_read(struct cyc2ns_data *data) +{ + int seq, idx; do { - seq = this_cpu_read(cyc2ns.seq.sequence); + seq = this_cpu_read(cyc2ns.seq.seqcount.sequence); idx = seq & 1; data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset); data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul); data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift); - } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence))); + } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence))); +} + +__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) +{ + preempt_disable_notrace(); + __cyc2ns_read(data); } -void cyc2ns_read_end(void) +__always_inline void cyc2ns_read_end(void) { preempt_enable_notrace(); } @@ -105,50 +124,33 @@ void cyc2ns_read_end(void) * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -static void cyc2ns_data_init(struct cyc2ns_data *data) -{ - data->cyc2ns_mul = 0; - data->cyc2ns_shift = 0; - data->cyc2ns_offset = 0; -} - -static void cyc2ns_init(int cpu) -{ - struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); - - cyc2ns_data_init(&c2n->data[0]); - cyc2ns_data_init(&c2n->data[1]); - - seqcount_init(&c2n->seq); -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) +static __always_inline unsigned long long __cycles_2_ns(unsigned long long cyc) { struct cyc2ns_data data; unsigned long long ns; - cyc2ns_read_begin(&data); + __cyc2ns_read(&data); ns = data.cyc2ns_offset; ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift); - cyc2ns_read_end(); + return ns; +} +static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + unsigned long long ns; + preempt_disable_notrace(); + ns = __cycles_2_ns(cyc); + preempt_enable_notrace(); return ns; } -static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) +static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) { unsigned long long ns_now; struct cyc2ns_data data; struct cyc2ns *c2n; - unsigned long flags; - - local_irq_save(flags); - sched_clock_idle_sleep_event(); - - if (!khz) - goto done; ns_now = cycles_2_ns(tsc_now); @@ -176,26 +178,69 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_ c2n = per_cpu_ptr(&cyc2ns, cpu); - raw_write_seqcount_latch(&c2n->seq); + write_seqcount_latch_begin(&c2n->seq); c2n->data[0] = data; - raw_write_seqcount_latch(&c2n->seq); + write_seqcount_latch(&c2n->seq); c2n->data[1] = data; + write_seqcount_latch_end(&c2n->seq); +} + +static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) +{ + unsigned long flags; + + local_irq_save(flags); + sched_clock_idle_sleep_event(); + + if (khz) + __set_cyc2ns_scale(khz, cpu, tsc_now); -done: sched_clock_idle_wakeup_event(); local_irq_restore(flags); } /* + * Initialize cyc2ns for boot cpu + */ +static void __init cyc2ns_init_boot_cpu(void) +{ + struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); + + seqcount_latch_init(&c2n->seq); + __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc()); +} + +/* + * Secondary CPUs do not run through tsc_init(), so set up + * all the scale factors for all CPUs, assuming the same + * speed as the bootup CPU. + */ +static void __init cyc2ns_init_secondary_cpus(void) +{ + unsigned int cpu, this_cpu = smp_processor_id(); + struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); + struct cyc2ns_data *data = c2n->data; + + for_each_possible_cpu(cpu) { + if (cpu != this_cpu) { + seqcount_latch_init(&c2n->seq); + c2n = per_cpu_ptr(&cyc2ns, cpu); + c2n->data[0] = data[0]; + c2n->data[1] = data[1]; + } + } +} + +/* * Scheduler clock - returns current time in nanosec units. */ -u64 native_sched_clock(void) +noinstr u64 native_sched_clock(void) { if (static_branch_likely(&__use_tsc)) { u64 tsc_now = rdtsc(); /* return the value in ns */ - return cycles_2_ns(tsc_now); + return __cycles_2_ns(tsc_now); } /* @@ -222,22 +267,30 @@ u64 native_sched_clock_from_tsc(u64 tsc) /* We need to define a real function for sched_clock, to override the weak default version */ #ifdef CONFIG_PARAVIRT -unsigned long long sched_clock(void) +noinstr u64 sched_clock_noinstr(void) { return paravirt_sched_clock(); } bool using_native_sched_clock(void) { - return pv_time_ops.sched_clock == native_sched_clock; + return static_call_query(pv_sched_clock) == native_sched_clock; } #else -unsigned long long -sched_clock(void) __attribute__((alias("native_sched_clock"))); +u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock"))); bool using_native_sched_clock(void) { return true; } #endif +notrace u64 sched_clock(void) +{ + u64 now; + preempt_disable_notrace(); + now = sched_clock_noinstr(); + preempt_enable_notrace(); + return now; +} + int check_tsc_unstable(void) { return tsc_unstable; @@ -247,8 +300,7 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable); #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { - pr_warn("Kernel compiled with CONFIG_X86_TSC, cannot disable TSC completely\n"); - tsc_disabled = 1; + mark_tsc_unstable("boot parameter notsc"); return 1; } #else @@ -266,6 +318,8 @@ int __init notsc_setup(char *str) __setup("notsc", notsc_setup); static int no_sched_irq_time; +static int no_tsc_watchdog; +static int tsc_as_watchdog; static int __init tsc_setup(char *str) { @@ -275,20 +329,37 @@ static int __init tsc_setup(char *str) no_sched_irq_time = 1; if (!strcmp(str, "unstable")) mark_tsc_unstable("boot parameter"); + if (!strcmp(str, "nowatchdog")) { + no_tsc_watchdog = 1; + if (tsc_as_watchdog) + pr_alert("%s: Overriding earlier tsc=watchdog with tsc=nowatchdog\n", + __func__); + tsc_as_watchdog = 0; + } + if (!strcmp(str, "recalibrate")) + tsc_force_recalibrate = 1; + if (!strcmp(str, "watchdog")) { + if (no_tsc_watchdog) + pr_alert("%s: tsc=watchdog overridden by earlier tsc=nowatchdog\n", + __func__); + else + tsc_as_watchdog = 1; + } return 1; } __setup("tsc=", tsc_setup); -#define MAX_RETRIES 5 -#define SMI_TRESHOLD 50000 +#define MAX_RETRIES 5 +#define TSC_DEFAULT_THRESHOLD 0x20000 /* - * Read TSC and the reference counters. Take care of SMI disturbance + * Read TSC and the reference counters. Take care of any disturbances */ static u64 tsc_read_refs(u64 *p, int hpet) { u64 t1, t2; + u64 thresh = tsc_khz ? tsc_khz >> 5 : TSC_DEFAULT_THRESHOLD; int i; for (i = 0; i < MAX_RETRIES; i++) { @@ -298,7 +369,7 @@ static u64 tsc_read_refs(u64 *p, int hpet) else *p = acpi_pm_read_early(); t2 = get_cycles(); - if ((t2 - t1) < SMI_TRESHOLD) + if ((t2 - t1) < thresh) return t2; } return ULLONG_MAX; @@ -316,7 +387,7 @@ static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2) hpet2 -= hpet1; tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); do_div(tmp, 1000000); - do_div(deltatsc, tmp); + deltatsc = div64_u64(deltatsc, tmp); return (unsigned long) deltatsc; } @@ -363,6 +434,20 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) unsigned long tscmin, tscmax; int pitcnt; + if (!has_legacy_pic()) { + /* + * Relies on tsc_early_delay_calibrate() to have given us semi + * usable udelay(), wait for the same 50ms we would have with + * the PIT loop below. + */ + udelay(10 * USEC_PER_MSEC); + udelay(10 * USEC_PER_MSEC); + udelay(10 * USEC_PER_MSEC); + udelay(10 * USEC_PER_MSEC); + udelay(10 * USEC_PER_MSEC); + return ULONG_MAX; + } + /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); @@ -442,7 +527,7 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) * transition from one expected value to another with a fairly * high accuracy, and we didn't miss any events. We can thus * use the TSC value at the transitions to calculate a pretty - * good value for the TSC frequencty. + * good value for the TSC frequency. */ static inline int pit_verify_msb(unsigned char val) { @@ -487,6 +572,9 @@ static unsigned long quick_pit_calibrate(void) u64 tsc, delta; unsigned long d1, d2; + if (!has_legacy_pic()) + return 0; + /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); @@ -570,7 +658,7 @@ success: } /** - * native_calibrate_tsc + * native_calibrate_tsc - determine TSC frequency * Determine TSC frequency via CPUID, else return 0. */ unsigned long native_calibrate_tsc(void) @@ -581,51 +669,69 @@ unsigned long native_calibrate_tsc(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; - if (boot_cpu_data.cpuid_level < 0x15) + if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC) return 0; eax_denominator = ebx_numerator = ecx_hz = edx = 0; /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */ - cpuid(0x15, &eax_denominator, &ebx_numerator, &ecx_hz, &edx); + cpuid(CPUID_LEAF_TSC, &eax_denominator, &ebx_numerator, &ecx_hz, &edx); if (ebx_numerator == 0 || eax_denominator == 0) return 0; crystal_khz = ecx_hz / 1000; - if (crystal_khz == 0) { - switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_SKYLAKE_MOBILE: - case INTEL_FAM6_SKYLAKE_DESKTOP: - case INTEL_FAM6_KABYLAKE_MOBILE: - case INTEL_FAM6_KABYLAKE_DESKTOP: - crystal_khz = 24000; /* 24.0 MHz */ - break; - case INTEL_FAM6_SKYLAKE_X: - case INTEL_FAM6_ATOM_DENVERTON: - crystal_khz = 25000; /* 25.0 MHz */ - break; - case INTEL_FAM6_ATOM_GOLDMONT: - crystal_khz = 19200; /* 19.2 MHz */ - break; - } - } + /* + * Denverton SoCs don't report crystal clock, and also don't support + * CPUID_LEAF_FREQ for the calculation below, so hardcode the 25MHz + * crystal clock. + */ + if (crystal_khz == 0 && + boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT_D) + crystal_khz = 25000; /* - * TSC frequency determined by CPUID is a "hardware reported" + * TSC frequency reported directly by CPUID is a "hardware reported" * frequency and is the most accurate one so far we have. This * is considered a known frequency. */ - setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + if (crystal_khz != 0) + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + + /* + * Some Intel SoCs like Skylake and Kabylake don't report the crystal + * clock, but we can easily calculate it to a high degree of accuracy + * by considering the crystal ratio and the CPU speed. + */ + if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= CPUID_LEAF_FREQ) { + unsigned int eax_base_mhz, ebx, ecx, edx; + + cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx, &ecx, &edx); + crystal_khz = eax_base_mhz * 1000 * + eax_denominator / ebx_numerator; + } + + if (crystal_khz == 0) + return 0; /* * For Atom SoCs TSC is the only reliable clocksource. * Mark TSC reliable so no watchdog on it. */ - if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT) + if (boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT) setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); +#ifdef CONFIG_X86_LOCAL_APIC + /* + * The local APIC appears to be fed by the core crystal clock + * (which sounds entirely sensible). We can set the global + * lapic_timer_period here to avoid having to calibrate the APIC + * timer later. + */ + lapic_timer_period = crystal_khz * 1000 / HZ; +#endif + return crystal_khz * ebx_numerator / eax_denominator; } @@ -636,40 +742,27 @@ static unsigned long cpu_khz_from_cpuid(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; - if (boot_cpu_data.cpuid_level < 0x16) + if (boot_cpu_data.cpuid_level < CPUID_LEAF_FREQ) return 0; eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0; - cpuid(0x16, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx); + cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx); return eax_base_mhz * 1000; } -/** - * native_calibrate_cpu - calibrate the cpu on boot +/* + * calibrate cpu using pit, hpet, and ptimer methods. They are available + * later in boot after acpi is initialized. */ -unsigned long native_calibrate_cpu(void) +static unsigned long pit_hpet_ptimer_calibrate_cpu(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate; + unsigned long flags, latch, ms; int hpet = is_hpet_enabled(), i, loopmin; - fast_calibrate = cpu_khz_from_cpuid(); - if (fast_calibrate) - return fast_calibrate; - - fast_calibrate = cpu_khz_from_msr(); - if (fast_calibrate) - return fast_calibrate; - - local_irq_save(flags); - fast_calibrate = quick_pit_calibrate(); - local_irq_restore(flags); - if (fast_calibrate) - return fast_calibrate; - /* * Run 5 calibration loops to get the lowest frequency value * (the best estimate). We use two different calibration modes @@ -681,15 +774,15 @@ unsigned long native_calibrate_cpu(void) * zero. In each wait loop iteration we read the TSC and check * the delta to the previous read. We keep track of the min * and max values of that delta. The delta is mostly defined - * by the IO time of the PIT access, so we can detect when a - * SMI/SMM disturbance happened between the two reads. If the + * by the IO time of the PIT access, so we can detect when + * any disturbance happened between the two reads. If the * maximum time is significantly larger than the minimum time, * then we discard the result and have another try. * * 2) Reference counter. If available we use the HPET or the * PMTIMER as a reference to check the sanity of that value. * We use separate TSC readouts and check inside of the - * reference read for a SMI/SMM disturbance. We dicard + * reference read for any possible disturbance. We discard * disturbed values here as well. We do that around the PIT * calibration delay loop as we have to wait for a certain * amount of time anyway. @@ -722,7 +815,7 @@ unsigned long native_calibrate_cpu(void) if (ref1 == ref2) continue; - /* Check, whether the sampling was disturbed by an SMI */ + /* Check, whether the sampling was disturbed */ if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) continue; @@ -812,13 +905,44 @@ unsigned long native_calibrate_cpu(void) return tsc_pit_min; } -int recalibrate_cpu_khz(void) +/** + * native_calibrate_cpu_early - can calibrate the cpu early in boot + */ +unsigned long native_calibrate_cpu_early(void) +{ + unsigned long flags, fast_calibrate = cpu_khz_from_cpuid(); + + if (!fast_calibrate) + fast_calibrate = cpu_khz_from_msr(); + if (!fast_calibrate) { + local_irq_save(flags); + fast_calibrate = quick_pit_calibrate(); + local_irq_restore(flags); + } + return fast_calibrate; +} + + +/** + * native_calibrate_cpu - calibrate the cpu + */ +static unsigned long native_calibrate_cpu(void) +{ + unsigned long tsc_freq = native_calibrate_cpu_early(); + + if (!tsc_freq) + tsc_freq = pit_hpet_ptimer_calibrate_cpu(); + + return tsc_freq; +} + +void recalibrate_cpu_khz(void) { #ifndef CONFIG_SMP unsigned long cpu_khz_old = cpu_khz; if (!boot_cpu_has(X86_FEATURE_TSC)) - return -ENODEV; + return; cpu_khz = x86_platform.calibrate_cpu(); tsc_khz = x86_platform.calibrate_tsc(); @@ -828,21 +952,16 @@ int recalibrate_cpu_khz(void) cpu_khz = tsc_khz; cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, cpu_khz_old, cpu_khz); - - return 0; -#else - return -ENODEV; #endif } - -EXPORT_SYMBOL(recalibrate_cpu_khz); +EXPORT_SYMBOL_GPL(recalibrate_cpu_khz); static unsigned long long cyc2ns_suspend; void tsc_save_sched_clock_state(void) { - if (!sched_clock_stable()) + if (!static_branch_likely(&__use_tsc) && !sched_clock_stable()) return; cyc2ns_suspend = sched_clock(); @@ -862,7 +981,7 @@ void tsc_restore_sched_clock_state(void) unsigned long flags; int cpu; - if (!sched_clock_stable()) + if (!static_branch_likely(&__use_tsc) && !sched_clock_stable()) return; local_irq_save(flags); @@ -887,12 +1006,12 @@ void tsc_restore_sched_clock_state(void) } #ifdef CONFIG_CPU_FREQ -/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency +/* + * Frequency scaling support. Adjust the TSC based timer when the CPU frequency * changes. * - * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's - * not that important because current Opteron setups do not support - * scaling on SMP anyroads. + * NOTE: On SMP the situation is not fixable in general, so simply mark the TSC + * as unstable and give up in those cases. * * Should fix up last_tsc too. Currently gettimeofday in the * first tick after the change will be slightly wrong. @@ -906,28 +1025,28 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; - unsigned long *lpj; - lpj = &boot_cpu_data.loops_per_jiffy; -#ifdef CONFIG_SMP - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - lpj = &cpu_data(freq->cpu).loops_per_jiffy; -#endif + if (num_online_cpus() > 1) { + mark_tsc_unstable("cpufreq changes on SMP"); + return 0; + } if (!ref_freq) { ref_freq = freq->old; - loops_per_jiffy_ref = *lpj; + loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy; tsc_khz_ref = tsc_khz; } + if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) { - *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) { + boot_cpu_data.loops_per_jiffy = + cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) mark_tsc_unstable("cpufreq changes"); - set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc()); + set_cyc2ns_scale(tsc_khz, freq->policy->cpu, rdtsc()); } return 0; @@ -952,33 +1071,36 @@ core_initcall(cpufreq_register_tsc_scaling); #endif /* CONFIG_CPU_FREQ */ -#define ART_CPUID_LEAF (0x15) #define ART_MIN_DENOMINATOR (1) - /* * If ART is present detect the numerator:denominator to convert to TSC */ -static void detect_art(void) +static void __init detect_art(void) { - unsigned int unused[2]; + unsigned int unused; - if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF) + if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC) return; - /* Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required */ + /* + * Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required, + * and the TSC counter resets must not occur asynchronously. + */ if (boot_cpu_has(X86_FEATURE_HYPERVISOR) || !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) || - !boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + !boot_cpu_has(X86_FEATURE_TSC_ADJUST) || + tsc_async_resets) return; - cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator, - &art_to_tsc_numerator, unused, unused+1); + cpuid(CPUID_LEAF_TSC, &art_base_clk.denominator, + &art_base_clk.numerator, &art_base_clk.freq_khz, &unused); - if (art_to_tsc_denominator < ART_MIN_DENOMINATOR) + art_base_clk.freq_khz /= KHZ; + if (art_base_clk.denominator < ART_MIN_DENOMINATOR) return; - rdmsrl(MSR_IA32_TSC_ADJUST, art_to_tsc_offset); + rdmsrq(MSR_IA32_TSC_ADJUST, art_base_clk.offset); /* Make this sticky over multiple CPU init calls */ setup_force_cpu_cap(X86_FEATURE_ART); @@ -987,8 +1109,6 @@ static void detect_art(void) /* clocksource code */ -static struct clocksource clocksource_tsc; - static void tsc_resume(struct clocksource *cs) { tsc_verify_tsc_adjust(true); @@ -1000,7 +1120,7 @@ static void tsc_resume(struct clocksource *cs) * very small window right after one CPU updated cycle_last under * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which * is smaller than the cycle_last reference value due to a TSC which - * is slighty behind. This delta is nowhere else observable, but in + * is slightly behind. This delta is nowhere else observable, but in * that case it results in a forward time jump in the range of hours * due to the unsigned delta calculation of the time keeping core * code, which is necessary to support wrapping clocksources like pm @@ -1036,20 +1156,53 @@ static void tsc_cs_tick_stable(struct clocksource *cs) sched_clock_tick_stable(); } +static int tsc_cs_enable(struct clocksource *cs) +{ + vclocks_set_used(VDSO_CLOCKMODE_TSC); + return 0; +} + /* * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc() */ -static struct clocksource clocksource_tsc = { - .name = "tsc", - .rating = 300, - .read = read_tsc, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS | +static struct clocksource clocksource_tsc_early = { + .name = "tsc-early", + .rating = 299, + .uncertainty_margin = 32 * NSEC_PER_MSEC, + .read = read_tsc, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, - .archdata = { .vclock_mode = VCLOCK_TSC }, + .id = CSID_X86_TSC_EARLY, + .vdso_clock_mode = VDSO_CLOCKMODE_TSC, + .enable = tsc_cs_enable, .resume = tsc_resume, .mark_unstable = tsc_cs_mark_unstable, .tick_stable = tsc_cs_tick_stable, + .list = LIST_HEAD_INIT(clocksource_tsc_early.list), +}; + +/* + * Must mark VALID_FOR_HRES early such that when we unregister tsc_early + * this one will immediately take over. We will only register if TSC has + * been found good. + */ +static struct clocksource clocksource_tsc = { + .name = "tsc", + .rating = 300, + .read = read_tsc, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS | + CLOCK_SOURCE_VALID_FOR_HRES | + CLOCK_SOURCE_MUST_VERIFY | + CLOCK_SOURCE_VERIFY_PERCPU, + .id = CSID_X86_TSC, + .vdso_clock_mode = VDSO_CLOCKMODE_TSC, + .enable = tsc_cs_enable, + .resume = tsc_resume, + .mark_unstable = tsc_cs_mark_unstable, + .tick_stable = tsc_cs_tick_stable, + .list = LIST_HEAD_INIT(clocksource_tsc.list), }; void mark_tsc_unstable(char *reason) @@ -1062,17 +1215,25 @@ void mark_tsc_unstable(char *reason) clear_sched_clock_stable(); disable_sched_clock_irqtime(); pr_info("Marking TSC unstable due to %s\n", reason); - /* Change only the rating, when not registered */ - if (clocksource_tsc.mult) { - clocksource_mark_unstable(&clocksource_tsc); - } else { - clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE; - clocksource_tsc.rating = 0; - } + + clocksource_mark_unstable(&clocksource_tsc_early); + clocksource_mark_unstable(&clocksource_tsc); } EXPORT_SYMBOL_GPL(mark_tsc_unstable); +static void __init tsc_disable_clocksource_watchdog(void) +{ + clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY; + clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; +} + +bool tsc_clocksource_watchdog_disabled(void) +{ + return !(clocksource_tsc.flags & CLOCK_SOURCE_MUST_VERIFY) && + tsc_as_watchdog && !no_tsc_watchdog; +} + static void __init check_system_tsc_reliable(void) { #if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC) @@ -1089,6 +1250,20 @@ static void __init check_system_tsc_reliable(void) #endif if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) tsc_clocksource_reliable = 1; + + /* + * Disable the clocksource watchdog when the system has: + * - TSC running at constant frequency + * - TSC which does not stop in C-States + * - the TSC_ADJUST register which allows to detect even minimal + * modifications + * - not more than four packages + */ + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && + boot_cpu_has(X86_FEATURE_NONSTOP_TSC) && + boot_cpu_has(X86_FEATURE_TSC_ADJUST) && + topology_max_packages() <= 4) + tsc_disable_clocksource_watchdog(); } /* @@ -1116,38 +1291,18 @@ int unsynchronized_tsc(void) */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { /* assume multi socket systems are not synchronized: */ - if (num_possible_cpus() > 1) + if (topology_max_packages() > 1) return 1; } return 0; } -/* - * Convert ART to TSC given numerator/denominator found in detect_art() - */ -struct system_counterval_t convert_art_to_tsc(u64 art) -{ - u64 tmp, res, rem; - - rem = do_div(art, art_to_tsc_denominator); - - res = art * art_to_tsc_numerator; - tmp = rem * art_to_tsc_numerator; - - do_div(tmp, art_to_tsc_denominator); - res += tmp + art_to_tsc_offset; - - return (struct system_counterval_t) {.cs = art_related_clocksource, - .cycles = res}; -} -EXPORT_SYMBOL(convert_art_to_tsc); - static void tsc_refine_calibration_work(struct work_struct *work); static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); /** * tsc_refine_calibration_work - Further refine tsc freq calibration - * @work - ignored. + * @work: ignored. * * This functions uses delayed work over a period of a * second to further refine the TSC freq value. Since this is @@ -1161,29 +1316,30 @@ static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); */ static void tsc_refine_calibration_work(struct work_struct *work) { - static u64 tsc_start = -1, ref_start; + static u64 tsc_start = ULLONG_MAX, ref_start; static int hpet; u64 tsc_stop, ref_stop, delta; unsigned long freq; int cpu; /* Don't bother refining TSC on unstable systems */ - if (check_tsc_unstable()) - goto out; + if (tsc_unstable) + goto unreg; /* * Since the work is started early in boot, we may be * delayed the first time we expire. So set the workqueue * again once we know timers are working. */ - if (tsc_start == -1) { + if (tsc_start == ULLONG_MAX) { +restart: /* * Only set hpet once, to avoid mixing hardware * if the hpet becomes enabled later. */ hpet = is_hpet_enabled(); - schedule_delayed_work(&tsc_irqwork, HZ); tsc_start = tsc_read_refs(&ref_start, hpet); + schedule_delayed_work(&tsc_irqwork, HZ); return; } @@ -1193,9 +1349,9 @@ static void tsc_refine_calibration_work(struct work_struct *work) if (ref_start == ref_stop) goto out; - /* Check, whether the sampling was disturbed by an SMI */ - if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX) - goto out; + /* Check, whether the sampling was disturbed */ + if (tsc_stop == ULLONG_MAX) + goto restart; delta = tsc_stop - tsc_start; delta *= 1000000LL; @@ -1204,6 +1360,25 @@ static void tsc_refine_calibration_work(struct work_struct *work) else freq = calc_pmtimer_ref(delta, ref_start, ref_stop); + /* Will hit this only if tsc_force_recalibrate has been set */ + if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { + + /* Warn if the deviation exceeds 500 ppm */ + if (abs(tsc_khz - freq) > (tsc_khz >> 11)) { + pr_warn("Warning: TSC freq calibrated by CPUID/MSR differs from what is calibrated by HW timer, please check with vendor!!\n"); + pr_info("Previous calibrated TSC freq:\t %lu.%03lu MHz\n", + (unsigned long)tsc_khz / 1000, + (unsigned long)tsc_khz % 1000); + } + + pr_info("TSC freq recalibrated by [%s]:\t %lu.%03lu MHz\n", + hpet ? "HPET" : "PM_TIMER", + (unsigned long)freq / 1000, + (unsigned long)freq % 1000); + + return; + } + /* Make sure we're within 1% */ if (abs(tsc_khz - freq) > tsc_khz/100) goto out; @@ -1221,23 +1396,27 @@ static void tsc_refine_calibration_work(struct work_struct *work) set_cyc2ns_scale(tsc_khz, cpu, tsc_stop); out: - if (boot_cpu_has(X86_FEATURE_ART)) - art_related_clocksource = &clocksource_tsc; + if (tsc_unstable) + goto unreg; + + if (boot_cpu_has(X86_FEATURE_ART)) { + have_art = true; + clocksource_tsc.base = &art_base_clk; + } clocksource_register_khz(&clocksource_tsc, tsc_khz); +unreg: + clocksource_unregister(&clocksource_tsc_early); } static int __init init_tsc_clocksource(void) { - if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_disabled > 0 || !tsc_khz) + if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz) return 0; - if (tsc_clocksource_reliable) - clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; - /* lower the rating if we already know its unstable: */ - if (check_tsc_unstable()) { - clocksource_tsc.rating = 0; - clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; + if (tsc_unstable) { + clocksource_unregister(&clocksource_tsc_early); + return 0; } if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3)) @@ -1248,10 +1427,15 @@ static int __init init_tsc_clocksource(void) * the refined calibration and directly register it as a clocksource. */ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { - if (boot_cpu_has(X86_FEATURE_ART)) - art_related_clocksource = &clocksource_tsc; + if (boot_cpu_has(X86_FEATURE_ART)) { + have_art = true; + clocksource_tsc.base = &art_base_clk; + } clocksource_register_khz(&clocksource_tsc, tsc_khz); - return 0; + clocksource_unregister(&clocksource_tsc_early); + + if (!tsc_force_recalibrate) + return 0; } schedule_delayed_work(&tsc_irqwork, 0); @@ -1263,21 +1447,27 @@ static int __init init_tsc_clocksource(void) */ device_initcall(init_tsc_clocksource); -void __init tsc_init(void) +static bool __init determine_cpu_tsc_frequencies(bool early) { - u64 lpj, cyc; - int cpu; - - if (!boot_cpu_has(X86_FEATURE_TSC)) { - setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); - return; + /* Make sure that cpu and tsc are not already calibrated */ + WARN_ON(cpu_khz || tsc_khz); + + if (early) { + cpu_khz = x86_platform.calibrate_cpu(); + if (tsc_early_khz) { + tsc_khz = tsc_early_khz; + } else { + tsc_khz = x86_platform.calibrate_tsc(); + clocksource_tsc.freq_khz = tsc_khz; + } + } else { + /* We should not be here with non-native cpu calibration */ + WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu); + cpu_khz = pit_hpet_ptimer_calibrate_cpu(); } - cpu_khz = x86_platform.calibrate_cpu(); - tsc_khz = x86_platform.calibrate_tsc(); - /* - * Trust non-zero tsc_khz as authorative, + * Trust non-zero tsc_khz as authoritative, * and use it to sanity check cpu_khz, * which will be off if system timer is off. */ @@ -1286,72 +1476,126 @@ void __init tsc_init(void) else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) cpu_khz = tsc_khz; - if (!tsc_khz) { - mark_tsc_unstable("could not calculate TSC khz"); - setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); - return; - } + if (tsc_khz == 0) + return false; pr_info("Detected %lu.%03lu MHz processor\n", - (unsigned long)cpu_khz / 1000, - (unsigned long)cpu_khz % 1000); + (unsigned long)cpu_khz / KHZ, + (unsigned long)cpu_khz % KHZ); + + if (cpu_khz != tsc_khz) { + pr_info("Detected %lu.%03lu MHz TSC", + (unsigned long)tsc_khz / KHZ, + (unsigned long)tsc_khz % KHZ); + } + return true; +} + +static unsigned long __init get_loops_per_jiffy(void) +{ + u64 lpj = (u64)tsc_khz * KHZ; + + do_div(lpj, HZ); + return lpj; +} + +static void __init tsc_enable_sched_clock(void) +{ + loops_per_jiffy = get_loops_per_jiffy(); + use_tsc_delay(); /* Sanitize TSC ADJUST before cyc2ns gets initialized */ tsc_store_and_check_tsc_adjust(true); + cyc2ns_init_boot_cpu(); + static_branch_enable(&__use_tsc); +} - /* - * Secondary CPUs do not run through tsc_init(), so set up - * all the scale factors for all CPUs, assuming the same - * speed as the bootup CPU. (cpufreq notifiers will fix this - * up if their speed diverges) - */ - cyc = rdtsc(); - for_each_possible_cpu(cpu) { - cyc2ns_init(cpu); - set_cyc2ns_scale(tsc_khz, cpu, cyc); - } +void __init tsc_early_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_TSC)) + return; + /* Don't change UV TSC multi-chassis synchronization */ + if (is_early_uv_system()) + return; - if (tsc_disabled > 0) + snp_secure_tsc_init(); + + if (!determine_cpu_tsc_frequencies(true)) return; + tsc_enable_sched_clock(); +} - /* now allow native_sched_clock() to use rdtsc */ +void __init tsc_init(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_TSC)) { + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); + return; + } - tsc_disabled = 0; - static_branch_enable(&__use_tsc); + /* + * native_calibrate_cpu_early can only calibrate using methods that are + * available early in boot. + */ + if (x86_platform.calibrate_cpu == native_calibrate_cpu_early) + x86_platform.calibrate_cpu = native_calibrate_cpu; + + if (!tsc_khz) { + /* We failed to determine frequencies earlier, try again */ + if (!determine_cpu_tsc_frequencies(false)) { + mark_tsc_unstable("could not calculate TSC khz"); + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); + return; + } + tsc_enable_sched_clock(); + } + + cyc2ns_init_secondary_cpus(); if (!no_sched_irq_time) enable_sched_clock_irqtime(); - lpj = ((u64)tsc_khz * 1000); - do_div(lpj, HZ); - lpj_fine = lpj; - - use_tsc_delay(); + lpj_fine = get_loops_per_jiffy(); check_system_tsc_reliable(); - if (unsynchronized_tsc()) + if (unsynchronized_tsc()) { mark_tsc_unstable("TSCs unsynchronized"); + return; + } + if (tsc_clocksource_reliable || no_tsc_watchdog) + tsc_disable_clocksource_watchdog(); + + clocksource_register_khz(&clocksource_tsc_early, tsc_khz); detect_art(); } #ifdef CONFIG_SMP /* - * If we have a constant TSC and are using the TSC for the delay loop, - * we can skip clock calibration if another cpu in the same socket has already - * been calibrated. This assumes that CONSTANT_TSC applies to all - * cpus in the socket - this should be a safe assumption. + * Check whether existing calibration data can be reused. */ unsigned long calibrate_delay_is_known(void) { int sibling, cpu = smp_processor_id(); - struct cpumask *mask = topology_core_cpumask(cpu); + int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC); + const struct cpumask *mask = topology_core_cpumask(cpu); - if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC)) - return 0; + /* + * If TSC has constant frequency and TSC is synchronized across + * sockets then reuse CPU0 calibration. + */ + if (constant_tsc && !tsc_unstable) + return cpu_data(0).loops_per_jiffy; - if (!mask) + /* + * If TSC has constant frequency and TSC is not synchronized across + * sockets and this is not the first CPU in the socket, then reuse + * the calibration value of an already online CPU on that socket. + * + * This assumes that CONSTANT_TSC is consistent for all CPUs in a + * socket. + */ + if (!constant_tsc || !mask) return 0; sibling = cpumask_any_but(mask, cpu); |
