diff options
Diffstat (limited to 'arch/x86/kernel')
37 files changed, 1094 insertions, 173 deletions
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index d2b7f27781bc..e69182fd01cf 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -150,29 +150,6 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, } EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); -/* - * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, - * which can obviate IPI to trigger checking of need_resched. - * We execute MONITOR against need_resched and enter optimized wait state - * through MWAIT. Whenever someone changes need_resched, we would be woken - * up from MWAIT (without an IPI). - * - * New with Core Duo processors, MWAIT can take some hints based on CPU - * capability. - */ -void mwait_idle_with_hints(unsigned long ax, unsigned long cx) -{ - if (!need_resched()) { - if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) - clflush((void *)¤t_thread_info()->flags); - - __monitor((void *)¤t_thread_info()->flags, 0, 0); - smp_mb(); - if (!need_resched()) - __mwait(ax, cx); - } -} - void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) { unsigned int cpu = smp_processor_id(); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index d278736bf774..7f26c9a70a9e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -75,6 +75,13 @@ unsigned int max_physical_apicid; physid_mask_t phys_cpu_present_map; /* + * Processor to be disabled specified by kernel parameter + * disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to + * avoid undefined behaviour caused by sending INIT from AP to BSP. + */ +static unsigned int disabled_cpu_apicid __read_mostly = BAD_APICID; + +/* * Map cpu index to physical APIC ID */ DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); @@ -1968,7 +1975,7 @@ __visible void smp_trace_spurious_interrupt(struct pt_regs *regs) */ static inline void __smp_error_interrupt(struct pt_regs *regs) { - u32 v0, v1; + u32 v; u32 i = 0; static const char * const error_interrupt_reason[] = { "Send CS error", /* APIC Error Bit 0 */ @@ -1982,21 +1989,20 @@ static inline void __smp_error_interrupt(struct pt_regs *regs) }; /* First tickle the hardware, only then report what went on. -- REW */ - v0 = apic_read(APIC_ESR); apic_write(APIC_ESR, 0); - v1 = apic_read(APIC_ESR); + v = apic_read(APIC_ESR); ack_APIC_irq(); atomic_inc(&irq_err_count); - apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)", - smp_processor_id(), v0 , v1); + apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x", + smp_processor_id(), v); - v1 = v1 & 0xff; - while (v1) { - if (v1 & 0x1) + v &= 0xff; + while (v) { + if (v & 0x1) apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]); i++; - v1 >>= 1; + v >>= 1; } apic_printk(APIC_DEBUG, KERN_CONT "\n"); @@ -2115,6 +2121,39 @@ int generic_processor_info(int apicid, int version) phys_cpu_present_map); /* + * boot_cpu_physical_apicid is designed to have the apicid + * returned by read_apic_id(), i.e, the apicid of the + * currently booting-up processor. However, on some platforms, + * it is temporarily modified by the apicid reported as BSP + * through MP table. Concretely: + * + * - arch/x86/kernel/mpparse.c: MP_processor_info() + * - arch/x86/mm/amdtopology.c: amd_numa_init() + * - arch/x86/platform/visws/visws_quirks.c: MP_processor_info() + * + * This function is executed with the modified + * boot_cpu_physical_apicid. So, disabled_cpu_apicid kernel + * parameter doesn't work to disable APs on kdump 2nd kernel. + * + * Since fixing handling of boot_cpu_physical_apicid requires + * another discussion and tests on each platform, we leave it + * for now and here we use read_apic_id() directly in this + * function, generic_processor_info(). + */ + if (disabled_cpu_apicid != BAD_APICID && + disabled_cpu_apicid != read_apic_id() && + disabled_cpu_apicid == apicid) { + int thiscpu = num_processors + disabled_cpus; + + pr_warning("APIC: Disabling requested cpu." + " Processor %d/0x%x ignored.\n", + thiscpu, apicid); + + disabled_cpus++; + return -ENODEV; + } + + /* * If boot cpu has not been detected yet, then only allow upto * nr_cpu_ids - 1 processors and keep one slot free for boot cpu */ @@ -2592,3 +2631,12 @@ static int __init lapic_insert_resource(void) * that is using request_resource */ late_initcall(lapic_insert_resource); + +static int __init apic_set_disabled_cpu_apicid(char *arg) +{ + if (!arg || !get_option(&arg, &disabled_cpu_apicid)) + return -EINVAL; + + return 0; +} +early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid); diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 00c77cf78e9e..5d5b9eb2b7a4 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -14,7 +14,6 @@ #include <linux/string.h> #include <linux/kernel.h> #include <linux/ctype.h> -#include <linux/init.h> #include <linux/hardirq.h> #include <linux/module.h> #include <asm/smp.h> diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index e145f28b4099..191ce75c0e54 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -15,7 +15,6 @@ #include <linux/string.h> #include <linux/kernel.h> #include <linux/ctype.h> -#include <linux/init.h> #include <linux/errno.h> #include <asm/fixmap.h> #include <asm/mpspec.h> diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e63a5bd2a78f..a43f068ebec1 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1142,9 +1142,10 @@ next: if (test_bit(vector, used_vectors)) goto next; - for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) - if (per_cpu(vector_irq, new_cpu)[vector] != -1) + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) { + if (per_cpu(vector_irq, new_cpu)[vector] > VECTOR_UNDEFINED) goto next; + } /* Found one! */ current_vector = vector; current_offset = offset; @@ -1183,7 +1184,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) vector = cfg->vector; for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) - per_cpu(vector_irq, cpu)[vector] = -1; + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; cfg->vector = 0; cpumask_clear(cfg->domain); @@ -1191,11 +1192,10 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) if (likely(!cfg->move_in_progress)) return; for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { - for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; - vector++) { + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { if (per_cpu(vector_irq, cpu)[vector] != irq) continue; - per_cpu(vector_irq, cpu)[vector] = -1; + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; break; } } @@ -1228,12 +1228,12 @@ void __setup_vector_irq(int cpu) /* Mark the free vectors */ for (vector = 0; vector < NR_VECTORS; ++vector) { irq = per_cpu(vector_irq, cpu)[vector]; - if (irq < 0) + if (irq <= VECTOR_UNDEFINED) continue; cfg = irq_cfg(irq); if (!cpumask_test_cpu(cpu, cfg->domain)) - per_cpu(vector_irq, cpu)[vector] = -1; + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; } raw_spin_unlock(&vector_lock); } @@ -2202,13 +2202,13 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) me = smp_processor_id(); for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - unsigned int irq; + int irq; unsigned int irr; struct irq_desc *desc; struct irq_cfg *cfg; irq = __this_cpu_read(vector_irq[vector]); - if (irq == -1) + if (irq <= VECTOR_UNDEFINED) continue; desc = irq_to_desc(irq); diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 7434d8556d09..62071569bd50 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -1,6 +1,5 @@ #include <linux/cpumask.h> #include <linux/interrupt.h> -#include <linux/init.h> #include <linux/mm.h> #include <linux/delay.h> diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 77c95c0e1bf7..00146f9b0254 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -29,7 +29,6 @@ #define pr_fmt(fmt) "summit: %s: " fmt, __func__ #include <linux/mm.h> -#include <linux/init.h> #include <asm/io.h> #include <asm/bios_ebda.h> diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 140e29db478d..cac85ee6913f 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -3,7 +3,6 @@ #include <linux/string.h> #include <linux/kernel.h> #include <linux/ctype.h> -#include <linux/init.h> #include <linux/dmar.h> #include <linux/cpu.h> diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 562a76d433c8..de231e328cae 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -3,7 +3,6 @@ #include <linux/string.h> #include <linux/kernel.h> #include <linux/ctype.h> -#include <linux/init.h> #include <linux/dmar.h> #include <asm/smp.h> diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 47b56a7e99cb..6359506a19ee 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -36,7 +36,7 @@ obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o endif obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o perf_event_intel_rapl.o endif diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index bca023bdd6b2..525712c7ffa2 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1,5 +1,4 @@ #include <linux/export.h> -#include <linux/init.h> #include <linux/bitops.h> #include <linux/elf.h> #include <linux/mm.h> @@ -487,7 +486,7 @@ static void early_init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); if (!check_tsc_unstable()) - sched_clock_stable = 1; + set_sched_clock_stable(); } #ifdef CONFIG_X86_64 @@ -790,14 +789,10 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) } /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ - if (!((eax >> 16) & mask)) { - u32 a, b, c, d; - - cpuid(0x80000005, &a, &b, &c, &d); - tlb_lld_2m[ENTRIES] = (a >> 16) & 0xff; - } else { + if (!((eax >> 16) & mask)) + tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff; + else tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; - } /* a 4M entry uses two 2M entries */ tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 8d5652dc99dd..8779edab684e 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -1,6 +1,5 @@ #include <linux/bitops.h> #include <linux/kernel.h> -#include <linux/init.h> #include <asm/processor.h> #include <asm/e820.h> diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index d0969c75ab54..aaf152e79637 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -1,4 +1,3 @@ -#include <linux/init.h> #include <linux/bitops.h> #include <linux/delay.h> #include <linux/pci.h> diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 5eb7ea5fae15..3db61c644e44 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1,4 +1,3 @@ -#include <linux/init.h> #include <linux/kernel.h> #include <linux/string.h> @@ -93,7 +92,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); if (!check_tsc_unstable()) - sched_clock_stable = 1; + set_sched_clock_stable(); } /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 4cfe0458ca66..fb6156fee6f7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -6,7 +6,6 @@ */ #include <linux/gfp.h> -#include <linux/init.h> #include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/sched.h> diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 1c044b1ccc59..a3042989398c 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -5,7 +5,6 @@ #include <linux/interrupt.h> #include <linux/kernel.h> #include <linux/types.h> -#include <linux/init.h> #include <linux/smp.h> #include <asm/processor.h> diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index e9a701aecaa1..7dc5564d0cdf 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -5,7 +5,6 @@ #include <linux/interrupt.h> #include <linux/kernel.h> #include <linux/types.h> -#include <linux/init.h> #include <asm/processor.h> #include <asm/mce.h> diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 8e132931614d..b88645191fe5 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1883,21 +1883,27 @@ static struct pmu pmu = { void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { + struct cyc2ns_data *data; + userpg->cap_user_time = 0; userpg->cap_user_time_zero = 0; userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; userpg->pmc_width = x86_pmu.cntval_bits; - if (!sched_clock_stable) + if (!sched_clock_stable()) return; + data = cyc2ns_read_begin(); + userpg->cap_user_time = 1; - userpg->time_mult = this_cpu_read(cyc2ns); - userpg->time_shift = CYC2NS_SCALE_FACTOR; - userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; + userpg->time_mult = data->cyc2ns_mul; + userpg->time_shift = data->cyc2ns_shift; + userpg->time_offset = data->cyc2ns_offset - now; userpg->cap_user_time_zero = 1; - userpg->time_zero = this_cpu_read(cyc2ns_offset); + userpg->time_zero = data->cyc2ns_offset; + + cyc2ns_read_end(data); } /* diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index e09f0bfb7b8f..4b8e4d3cd6ea 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/pci.h> #include <linux/ptrace.h> +#include <linux/syscore_ops.h> #include <asm/apic.h> @@ -816,6 +817,18 @@ out: return ret; } +static void ibs_eilvt_setup(void) +{ + /* + * Force LVT offset assignment for family 10h: The offsets are + * not assigned by the BIOS for this family, so the OS is + * responsible for doing it. If the OS assignment fails, fall + * back to BIOS settings and try to setup this. + */ + if (boot_cpu_data.x86 == 0x10) + force_ibs_eilvt_setup(); +} + static inline int get_ibs_lvt_offset(void) { u64 val; @@ -851,6 +864,36 @@ static void clear_APIC_ibs(void *dummy) setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); } +#ifdef CONFIG_PM + +static int perf_ibs_suspend(void) +{ + clear_APIC_ibs(NULL); + return 0; +} + +static void perf_ibs_resume(void) +{ + ibs_eilvt_setup(); + setup_APIC_ibs(NULL); +} + +static struct syscore_ops perf_ibs_syscore_ops = { + .resume = perf_ibs_resume, + .suspend = perf_ibs_suspend, +}; + +static void perf_ibs_pm_init(void) +{ + register_syscore_ops(&perf_ibs_syscore_ops); +} + +#else + +static inline void perf_ibs_pm_init(void) { } + +#endif + static int perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -877,18 +920,12 @@ static __init int amd_ibs_init(void) if (!caps) return -ENODEV; /* ibs not supported by the cpu */ - /* - * Force LVT offset assignment for family 10h: The offsets are - * not assigned by the BIOS for this family, so the OS is - * responsible for doing it. If the OS assignment fails, fall - * back to BIOS settings and try to setup this. - */ - if (boot_cpu_data.x86 == 0x10) - force_ibs_eilvt_setup(); + ibs_eilvt_setup(); if (!ibs_eilvt_valid()) goto out; + perf_ibs_pm_init(); get_online_cpus(); ibs_caps = caps; /* make ibs_caps visible to other cpus: */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c new file mode 100644 index 000000000000..5ad35ad94d0f --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -0,0 +1,679 @@ +/* + * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters + * Copyright (C) 2013 Google, Inc., Stephane Eranian + * + * Intel RAPL interface is specified in the IA-32 Manual Vol3b + * section 14.7.1 (September 2013) + * + * RAPL provides more controls than just reporting energy consumption + * however here we only expose the 3 energy consumption free running + * counters (pp0, pkg, dram). + * + * Each of those counters increments in a power unit defined by the + * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules + * but it can vary. + * + * Counter to rapl events mappings: + * + * pp0 counter: consumption of all physical cores (power plane 0) + * event: rapl_energy_cores + * perf code: 0x1 + * + * pkg counter: consumption of the whole processor package + * event: rapl_energy_pkg + * perf code: 0x2 + * + * dram counter: consumption of the dram domain (servers only) + * event: rapl_energy_dram + * perf code: 0x3 + * + * dram counter: consumption of the builtin-gpu domain (client only) + * event: rapl_energy_gpu + * perf code: 0x4 + * + * We manage those counters as free running (read-only). They may be + * use simultaneously by other tools, such as turbostat. + * + * The events only support system-wide mode counting. There is no + * sampling support because it does not make sense and is not + * supported by the RAPL hardware. + * + * Because we want to avoid floating-point operations in the kernel, + * the events are all reported in fixed point arithmetic (32.32). + * Tools must adjust the counts to convert them to Watts using + * the duration of the measurement. Tools may use a function such as + * ldexp(raw_count, -32); + */ +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/perf_event.h> +#include <asm/cpu_device_id.h> +#include "perf_event.h" + +/* + * RAPL energy status counters + */ +#define RAPL_IDX_PP0_NRG_STAT 0 /* all cores */ +#define INTEL_RAPL_PP0 0x1 /* pseudo-encoding */ +#define RAPL_IDX_PKG_NRG_STAT 1 /* entire package */ +#define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */ +#define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */ +#define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */ +#define RAPL_IDX_PP1_NRG_STAT 3 /* DRAM */ +#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ + +/* Clients have PP0, PKG */ +#define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\ + 1<<RAPL_IDX_PKG_NRG_STAT|\ + 1<<RAPL_IDX_PP1_NRG_STAT) + +/* Servers have PP0, PKG, RAM */ +#define RAPL_IDX_SRV (1<<RAPL_IDX_PP0_NRG_STAT|\ + 1<<RAPL_IDX_PKG_NRG_STAT|\ + 1<<RAPL_IDX_RAM_NRG_STAT) + +/* + * event code: LSB 8 bits, passed in attr->config + * any other bit is reserved + */ +#define RAPL_EVENT_MASK 0xFFULL + +#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __rapl_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __rapl_##_var##_show, NULL) + +#define RAPL_EVENT_DESC(_name, _config) \ +{ \ + .attr = __ATTR(_name, 0444, rapl_event_show, NULL), \ + .config = _config, \ +} + +#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */ + +struct rapl_pmu { + spinlock_t lock; + int hw_unit; /* 1/2^hw_unit Joule */ + int n_active; /* number of active events */ + struct list_head active_list; + struct pmu *pmu; /* pointer to rapl_pmu_class */ + ktime_t timer_interval; /* in ktime_t unit */ + struct hrtimer hrtimer; +}; + +static struct pmu rapl_pmu_class; +static cpumask_t rapl_cpu_mask; +static int rapl_cntr_mask; + +static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); +static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); + +static inline u64 rapl_read_counter(struct perf_event *event) +{ + u64 raw; + rdmsrl(event->hw.event_base, raw); + return raw; +} + +static inline u64 rapl_scale(u64 v) +{ + /* + * scale delta to smallest unit (1/2^32) + * users must then scale back: count * 1/(1e9*2^32) to get Joules + * or use ldexp(count, -32). + * Watts = Joules/Time delta + */ + return v << (32 - __get_cpu_var(rapl_pmu)->hw_unit); +} + +static u64 rapl_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev_raw_count, new_raw_count; + s64 delta, sdelta; + int shift = RAPL_CNTR_WIDTH; + +again: + prev_raw_count = local64_read(&hwc->prev_count); + rdmsrl(event->hw.event_base, new_raw_count); + + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) { + cpu_relax(); + goto again; + } + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + * + * Careful, not all hw sign-extends above the physical width + * of the count. + */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + sdelta = rapl_scale(delta); + + local64_add(sdelta, &event->count); + + return new_raw_count; +} + +static void rapl_start_hrtimer(struct rapl_pmu *pmu) +{ + __hrtimer_start_range_ns(&pmu->hrtimer, + pmu->timer_interval, 0, + HRTIMER_MODE_REL_PINNED, 0); +} + +static void rapl_stop_hrtimer(struct rapl_pmu *pmu) +{ + hrtimer_cancel(&pmu->hrtimer); +} + +static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) +{ + struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + struct perf_event *event; + unsigned long flags; + + if (!pmu->n_active) + return HRTIMER_NORESTART; + + spin_lock_irqsave(&pmu->lock, flags); + + list_for_each_entry(event, &pmu->active_list, active_entry) { + rapl_event_update(event); + } + + spin_unlock_irqrestore(&pmu->lock, flags); + + hrtimer_forward_now(hrtimer, pmu->timer_interval); + + return HRTIMER_RESTART; +} + +static void rapl_hrtimer_init(struct rapl_pmu *pmu) +{ + struct hrtimer *hr = &pmu->hrtimer; + + hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hr->function = rapl_hrtimer_handle; +} + +static void __rapl_pmu_event_start(struct rapl_pmu *pmu, + struct perf_event *event) +{ + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + event->hw.state = 0; + + list_add_tail(&event->active_entry, &pmu->active_list); + + local64_set(&event->hw.prev_count, rapl_read_counter(event)); + + pmu->n_active++; + if (pmu->n_active == 1) + rapl_start_hrtimer(pmu); +} + +static void rapl_pmu_event_start(struct perf_event *event, int mode) +{ + struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + unsigned long flags; + + spin_lock_irqsave(&pmu->lock, flags); + __rapl_pmu_event_start(pmu, event); + spin_unlock_irqrestore(&pmu->lock, flags); +} + +static void rapl_pmu_event_stop(struct perf_event *event, int mode) +{ + struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + + spin_lock_irqsave(&pmu->lock, flags); + + /* mark event as deactivated and stopped */ + if (!(hwc->state & PERF_HES_STOPPED)) { + WARN_ON_ONCE(pmu->n_active <= 0); + pmu->n_active--; + if (pmu->n_active == 0) + rapl_stop_hrtimer(pmu); + + list_del(&event->active_entry); + + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } + + /* check if update of sw counter is necessary */ + if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + rapl_event_update(event); + hwc->state |= PERF_HES_UPTODATE; + } + + spin_unlock_irqrestore(&pmu->lock, flags); +} + +static int rapl_pmu_event_add(struct perf_event *event, int mode) +{ + struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + + spin_lock_irqsave(&pmu->lock, flags); + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (mode & PERF_EF_START) + __rapl_pmu_event_start(pmu, event); + + spin_unlock_irqrestore(&pmu->lock, flags); + + return 0; +} + +static void rapl_pmu_event_del(struct perf_event *event, int flags) +{ + rapl_pmu_event_stop(event, PERF_EF_UPDATE); +} + +static int rapl_pmu_event_init(struct perf_event *event) +{ + u64 cfg = event->attr.config & RAPL_EVENT_MASK; + int bit, msr, ret = 0; + + /* only look at RAPL events */ + if (event->attr.type != rapl_pmu_class.type) + return -ENOENT; + + /* check only supported bits are set */ + if (event->attr.config & ~RAPL_EVENT_MASK) + return -EINVAL; + + /* + * check event is known (determines counter) + */ + switch (cfg) { + case INTEL_RAPL_PP0: + bit = RAPL_IDX_PP0_NRG_STAT; + msr = MSR_PP0_ENERGY_STATUS; + break; + case INTEL_RAPL_PKG: + bit = RAPL_IDX_PKG_NRG_STAT; + msr = MSR_PKG_ENERGY_STATUS; + break; + case INTEL_RAPL_RAM: + bit = RAPL_IDX_RAM_NRG_STAT; + msr = MSR_DRAM_ENERGY_STATUS; + break; + case INTEL_RAPL_PP1: + bit = RAPL_IDX_PP1_NRG_STAT; + msr = MSR_PP1_ENERGY_STATUS; + break; + default: + return -EINVAL; + } + /* check event supported */ + if (!(rapl_cntr_mask & (1 << bit))) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + /* must be done before validate_group */ + event->hw.event_base = msr; + event->hw.config = cfg; + event->hw.idx = bit; + + return ret; +} + +static void rapl_pmu_event_read(struct perf_event *event) +{ + rapl_event_update(event); +} + +static ssize_t rapl_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask); + + buf[n++] = '\n'; + buf[n] = '\0'; + return n; +} + +static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); + +static struct attribute *rapl_pmu_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group rapl_pmu_attr_group = { + .attrs = rapl_pmu_attrs, +}; + +EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); +EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); +EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); +EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); + +EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); +EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); +EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); +EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); + +/* + * we compute in 0.23 nJ increments regardless of MSR + */ +EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10"); +EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10"); +EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); +EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); + +static struct attribute *rapl_events_srv_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_ram), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_ram_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_ram_scale), + NULL, +}; + +static struct attribute *rapl_events_cln_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_gpu), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_gpu_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_gpu_scale), + NULL, +}; + +static struct attribute_group rapl_pmu_events_group = { + .name = "events", + .attrs = NULL, /* patched at runtime */ +}; + +DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7"); +static struct attribute *rapl_formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group rapl_pmu_format_group = { + .name = "format", + .attrs = rapl_formats_attr, +}; + +const struct attribute_group *rapl_attr_groups[] = { + &rapl_pmu_attr_group, + &rapl_pmu_format_group, + &rapl_pmu_events_group, + NULL, +}; + +static struct pmu rapl_pmu_class = { + .attr_groups = rapl_attr_groups, + .task_ctx_nr = perf_invalid_context, /* system-wide only */ + .event_init = rapl_pmu_event_init, + .add = rapl_pmu_event_add, /* must have */ + .del = rapl_pmu_event_del, /* must have */ + .start = rapl_pmu_event_start, + .stop = rapl_pmu_event_stop, + .read = rapl_pmu_event_read, +}; + +static void rapl_cpu_exit(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); + int i, phys_id = topology_physical_package_id(cpu); + int target = -1; + + /* find a new cpu on same package */ + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (phys_id == topology_physical_package_id(i)) { + target = i; + break; + } + } + /* + * clear cpu from cpumask + * if was set in cpumask and still some cpu on package, + * then move to new cpu + */ + if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0) + cpumask_set_cpu(target, &rapl_cpu_mask); + + WARN_ON(cpumask_empty(&rapl_cpu_mask)); + /* + * migrate events and context to new cpu + */ + if (target >= 0) + perf_pmu_migrate_context(pmu->pmu, cpu, target); + + /* cancel overflow polling timer for CPU */ + rapl_stop_hrtimer(pmu); +} + +static void rapl_cpu_init(int cpu) +{ + int i, phys_id = topology_physical_package_id(cpu); + + /* check if phys_is is already covered */ + for_each_cpu(i, &rapl_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) + return; + } + /* was not found, so add it */ + cpumask_set_cpu(cpu, &rapl_cpu_mask); +} + +static int rapl_cpu_prepare(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); + int phys_id = topology_physical_package_id(cpu); + u64 ms; + + if (pmu) + return 0; + + if (phys_id < 0) + return -1; + + pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); + if (!pmu) + return -1; + + spin_lock_init(&pmu->lock); + + INIT_LIST_HEAD(&pmu->active_list); + + /* + * grab power unit as: 1/2^unit Joules + * + * we cache in local PMU instance + */ + rdmsrl(MSR_RAPL_POWER_UNIT, pmu->hw_unit); + pmu->hw_unit = (pmu->hw_unit >> 8) & 0x1FULL; + pmu->pmu = &rapl_pmu_class; + + /* + * use reference of 200W for scaling the timeout + * to avoid missing counter overflows. + * 200W = 200 Joules/sec + * divide interval by 2 to avoid lockstep (2 * 100) + * if hw unit is 32, then we use 2 ms 1/200/2 + */ + if (pmu->hw_unit < 32) + ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1)); + else + ms = 2; + + pmu->timer_interval = ms_to_ktime(ms); + + rapl_hrtimer_init(pmu); + + /* set RAPL pmu for this cpu for now */ + per_cpu(rapl_pmu, cpu) = pmu; + per_cpu(rapl_pmu_to_free, cpu) = NULL; + + return 0; +} + +static void rapl_cpu_kfree(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu); + + kfree(pmu); + + per_cpu(rapl_pmu_to_free, cpu) = NULL; +} + +static int rapl_cpu_dying(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); + + if (!pmu) + return 0; + + per_cpu(rapl_pmu, cpu) = NULL; + + per_cpu(rapl_pmu_to_free, cpu) = pmu; + + return 0; +} + +static int rapl_cpu_notifier(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + rapl_cpu_prepare(cpu); + break; + case CPU_STARTING: + rapl_cpu_init(cpu); + break; + case CPU_UP_CANCELED: + case CPU_DYING: + rapl_cpu_dying(cpu); + break; + case CPU_ONLINE: + case CPU_DEAD: + rapl_cpu_kfree(cpu); + break; + case CPU_DOWN_PREPARE: + rapl_cpu_exit(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static const struct x86_cpu_id rapl_cpu_match[] = { + [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, + [1] = {}, +}; + +static int __init rapl_pmu_init(void) +{ + struct rapl_pmu *pmu; + int cpu, ret; + + /* + * check for Intel processor family 6 + */ + if (!x86_match_cpu(rapl_cpu_match)) + return 0; + + /* check supported CPU */ + switch (boot_cpu_data.x86_model) { + case 42: /* Sandy Bridge */ + case 58: /* Ivy Bridge */ + case 60: /* Haswell */ + case 69: /* Haswell-Celeron */ + rapl_cntr_mask = RAPL_IDX_CLN; + rapl_pmu_events_group.attrs = rapl_events_cln_attr; + break; + case 45: /* Sandy Bridge-EP */ + case 62: /* IvyTown */ + rapl_cntr_mask = RAPL_IDX_SRV; + rapl_pmu_events_group.attrs = rapl_events_srv_attr; + break; + + default: + /* unsupported */ + return 0; + } + get_online_cpus(); + + for_each_online_cpu(cpu) { + rapl_cpu_prepare(cpu); + rapl_cpu_init(cpu); + } + + perf_cpu_notifier(rapl_cpu_notifier); + + ret = perf_pmu_register(&rapl_pmu_class, "power", -1); + if (WARN_ON(ret)) { + pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret); + put_online_cpus(); + return -1; + } + + pmu = __get_cpu_var(rapl_pmu); + + pr_info("RAPL PMU detected, hw unit 2^-%d Joules," + " API unit is 2^-32 Joules," + " %d fixed counters" + " %llu ms ovfl timer\n", + pmu->hw_unit, + hweight32(rapl_cntr_mask), + ktime_to_ms(pmu->timer_interval)); + + put_online_cpus(); + + return 0; +} +device_initcall(rapl_pmu_init); diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index aa0430d69b90..3fa0e5ad86b4 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -1,6 +1,5 @@ #include <linux/kernel.h> #include <linux/mm.h> -#include <linux/init.h> #include <asm/processor.h> #include <asm/msr.h> #include "cpu.h" diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c index 75c5ad5d35cc..ef9c2a0078bd 100644 --- a/arch/x86/kernel/cpu/umc.c +++ b/arch/x86/kernel/cpu/umc.c @@ -1,5 +1,4 @@ #include <linux/kernel.h> -#include <linux/init.h> #include <asm/processor.h> #include "cpu.h" diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 18677a90d6a3..a57902efe2d5 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -7,7 +7,6 @@ * */ -#include <linux/init.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/smp.h> diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c index 5d3fe8d36e4a..f6dfd9334b67 100644 --- a/arch/x86/kernel/doublefault.c +++ b/arch/x86/kernel/doublefault.c @@ -1,6 +1,5 @@ #include <linux/mm.h> #include <linux/sched.h> -#include <linux/init.h> #include <linux/init_task.h> #include <linux/fs.h> diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 51e2988c5728..a2a4f4697889 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1082,7 +1082,7 @@ ENTRY(ftrace_caller) pushl $0 /* Pass NULL as regs pointer */ movl 4*4(%esp), %eax movl 0x4(%ebp), %edx - leal function_trace_op, %ecx + movl function_trace_op, %ecx subl $MCOUNT_INSN_SIZE, %eax .globl ftrace_call @@ -1140,7 +1140,7 @@ ENTRY(ftrace_regs_caller) movl 12*4(%esp), %eax /* Load ip (1st parameter) */ subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ - leal function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ + movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ pushl %esp /* Save pt_regs as 4th parameter */ GLOBAL(ftrace_regs_call) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e21b0785a85b..1e96c3628bf2 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -88,7 +88,7 @@ END(function_hook) MCOUNT_SAVE_FRAME \skip /* Load the ftrace_ops into the 3rd parameter */ - leaq function_trace_op, %rdx + movq function_trace_op(%rip), %rdx /* Load ip into the first parameter */ movq RIP(%rsp), %rdi diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index f66ff162dce8..a67b47c31314 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -38,7 +38,6 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/sched.h> -#include <linux/init.h> #include <linux/smp.h> #include <asm/hw_breakpoint.h> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 22d0687e7fda..884d875c1434 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -193,9 +193,13 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) if (!handle_irq(irq, regs)) { ack_APIC_irq(); - if (printk_ratelimit()) - pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n", - __func__, smp_processor_id(), vector, irq); + if (irq != VECTOR_RETRIGGERED) { + pr_emerg_ratelimited("%s: %d.%d No irq handler for vector (irq %d)\n", + __func__, smp_processor_id(), + vector, irq); + } else { + __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED); + } } irq_exit(); @@ -344,7 +348,7 @@ void fixup_irqs(void) for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { unsigned int irr; - if (__this_cpu_read(vector_irq[vector]) < 0) + if (__this_cpu_read(vector_irq[vector]) <= VECTOR_UNDEFINED) continue; irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); @@ -355,11 +359,14 @@ void fixup_irqs(void) data = irq_desc_get_irq_data(desc); chip = irq_data_get_irq_chip(data); raw_spin_lock(&desc->lock); - if (chip->irq_retrigger) + if (chip->irq_retrigger) { chip->irq_retrigger(data); + __this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED); + } raw_spin_unlock(&desc->lock); } - __this_cpu_write(vector_irq[vector], -1); + if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED) + __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED); } } #endif diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index a2a1fbc594ff..7f50156542fb 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -52,7 +52,7 @@ static struct irqaction irq2 = { }; DEFINE_PER_CPU(vector_irq_t, vector_irq) = { - [0 ... NR_VECTORS - 1] = -1, + [0 ... NR_VECTORS - 1] = VECTOR_UNDEFINED, }; int vector_used_by_percpu_irq(unsigned int vector) @@ -60,7 +60,7 @@ int vector_used_by_percpu_irq(unsigned int vector) int cpu; for_each_online_cpu(cpu) { - if (per_cpu(vector_irq, cpu)[vector] != -1) + if (per_cpu(vector_irq, cpu)[vector] > VECTOR_UNDEFINED) return 1; } diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 836f8322960e..7ec1d5f8d283 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -39,7 +39,6 @@ #include <linux/sched.h> #include <linux/delay.h> #include <linux/kgdb.h> -#include <linux/init.h> #include <linux/smp.h> #include <linux/nmi.h> #include <linux/hw_breakpoint.h> diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 5b19e4d78b00..1667b1de8d5d 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -9,7 +9,6 @@ #include <linux/mm.h> #include <linux/kexec.h> #include <linux/delay.h> -#include <linux/init.h> #include <linux/numa.h> #include <linux/ftrace.h> #include <linux/suspend.h> diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 871be4a84c7d..da15918d1c81 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -3,7 +3,6 @@ #include <linux/dma-mapping.h> #include <linux/scatterlist.h> #include <linux/string.h> -#include <linux/init.h> #include <linux/gfp.h> #include <linux/pci.h> #include <linux/mm.h> diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 6f1236c29c4b..0de43e98ce08 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -24,7 +24,6 @@ #include <linux/interrupt.h> #include <linux/delay.h> #include <linux/reboot.h> -#include <linux/init.h> #include <linux/mc146818rtc.h> #include <linux/module.h> #include <linux/kallsyms.h> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 85dc05a3aa02..f5252c4eec8c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1417,7 +1417,9 @@ static inline void mwait_play_dead(void) * The WBINVD is insufficient due to the spurious-wakeup * case where we return around the loop. */ + mb(); clflush(mwait_ptr); + mb(); __monitor(mwait_ptr, 0, 0); mb(); __mwait(eax, 0); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b857ed890b4c..57409f6b8c62 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -211,21 +211,17 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ exception_exit(prev_state); \ } -DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, - regs->ip) -DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) -DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, - regs->ip) -DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", - coprocessor_segment_overrun) -DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) +DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip ) +DO_ERROR (X86_TRAP_OF, SIGSEGV, "overflow", overflow ) +DO_ERROR (X86_TRAP_BR, SIGSEGV, "bounds", bounds ) +DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip ) +DO_ERROR (X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun ) +DO_ERROR (X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS ) +DO_ERROR (X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present ) #ifdef CONFIG_X86_32 -DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) +DO_ERROR (X86_TRAP_SS, SIGBUS, "stack segment", stack_segment ) #endif -DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, - BUS_ADRALN, 0) +DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0 ) #ifdef CONFIG_X86_64 /* Runs on IST stack */ diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 930e5d48f560..6377fb28b958 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -11,6 +11,7 @@ #include <linux/clocksource.h> #include <linux/percpu.h> #include <linux/timex.h> +#include <linux/static_key.h> #include <asm/hpet.h> #include <asm/timer.h> @@ -37,13 +38,244 @@ static int __read_mostly tsc_unstable; erroneous rdtsc usage on !cpu_has_tsc processors */ static int __read_mostly tsc_disabled = -1; +static struct static_key __use_tsc = STATIC_KEY_INIT; + int tsc_clocksource_reliable; + +/* + * Use a ring-buffer like data structure, where a writer advances the head by + * writing a new data entry and a reader advances the tail when it observes a + * new entry. + * + * Writers are made to wait on readers until there's space to write a new + * entry. + * + * This means that we can always use an {offset, mul} pair to compute a ns + * value that is 'roughly' in the right direction, even if we're writing a new + * {offset, mul} pair during the clock read. + * + * The down-side is that we can no longer guarantee strict monotonicity anymore + * (assuming the TSC was that to begin with), because while we compute the + * intersection point of the two clock slopes and make sure the time is + * continuous at the point of switching; we can no longer guarantee a reader is + * strictly before or after the switch point. + * + * It does mean a reader no longer needs to disable IRQs in order to avoid + * CPU-Freq updates messing with his times, and similarly an NMI reader will + * no longer run the risk of hitting half-written state. + */ + +struct cyc2ns { + struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */ + struct cyc2ns_data *head; /* 48 + 8 = 56 */ + struct cyc2ns_data *tail; /* 56 + 8 = 64 */ +}; /* exactly fits one cacheline */ + +static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); + +struct cyc2ns_data *cyc2ns_read_begin(void) +{ + struct cyc2ns_data *head; + + preempt_disable(); + + head = this_cpu_read(cyc2ns.head); + /* + * Ensure we observe the entry when we observe the pointer to it. + * matches the wmb from cyc2ns_write_end(). + */ + smp_read_barrier_depends(); + head->__count++; + barrier(); + + return head; +} + +void cyc2ns_read_end(struct cyc2ns_data *head) +{ + barrier(); + /* + * If we're the outer most nested read; update the tail pointer + * when we're done. This notifies possible pending writers + * that we've observed the head pointer and that the other + * entry is now free. + */ + if (!--head->__count) { + /* + * x86-TSO does not reorder writes with older reads; + * therefore once this write becomes visible to another + * cpu, we must be finished reading the cyc2ns_data. + * + * matches with cyc2ns_write_begin(). + */ + this_cpu_write(cyc2ns.tail, head); + } + preempt_enable(); +} + +/* + * Begin writing a new @data entry for @cpu. + * + * Assumes some sort of write side lock; currently 'provided' by the assumption + * that cpufreq will call its notifiers sequentially. + */ +static struct cyc2ns_data *cyc2ns_write_begin(int cpu) +{ + struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); + struct cyc2ns_data *data = c2n->data; + + if (data == c2n->head) + data++; + + /* XXX send an IPI to @cpu in order to guarantee a read? */ + + /* + * When we observe the tail write from cyc2ns_read_end(), + * the cpu must be done with that entry and its safe + * to start writing to it. + */ + while (c2n->tail == data) + cpu_relax(); + + return data; +} + +static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data) +{ + struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); + + /* + * Ensure the @data writes are visible before we publish the + * entry. Matches the data-depencency in cyc2ns_read_begin(). + */ + smp_wmb(); + + ACCESS_ONCE(c2n->head) = data; +} + +/* + * Accelerators for sched_clock() + * convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_khz * 10^3)) + * ns = cycles * (10^6 / cpu_khz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^6 * SC / cpu_khz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. + * + * We can use khz divisor instead of mhz to keep a better precision, since + * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + * (mathieu.desnoyers@polymtl.ca) + * + * -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ + +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ + +static void cyc2ns_data_init(struct cyc2ns_data *data) +{ + data->cyc2ns_mul = 1U << CYC2NS_SCALE_FACTOR; + data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; + data->cyc2ns_offset = 0; + data->__count = 0; +} + +static void cyc2ns_init(int cpu) +{ + struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); + + cyc2ns_data_init(&c2n->data[0]); + cyc2ns_data_init(&c2n->data[1]); + + c2n->head = c2n->data; + c2n->tail = c2n->data; +} + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + struct cyc2ns_data *data, *tail; + unsigned long long ns; + + /* + * See cyc2ns_read_*() for details; replicated in order to avoid + * an extra few instructions that came with the abstraction. + * Notable, it allows us to only do the __count and tail update + * dance when its actually needed. + */ + + preempt_disable(); + data = this_cpu_read(cyc2ns.head); + tail = this_cpu_read(cyc2ns.tail); + + if (likely(data == tail)) { + ns = data->cyc2ns_offset; + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + } else { + data->__count++; + + barrier(); + + ns = data->cyc2ns_offset; + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + + barrier(); + + if (!--data->__count) + this_cpu_write(cyc2ns.tail, data); + } + preempt_enable(); + + return ns; +} + +/* XXX surely we already have this someplace in the kernel?! */ +#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d)) + +static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) +{ + unsigned long long tsc_now, ns_now; + struct cyc2ns_data *data; + unsigned long flags; + + local_irq_save(flags); + sched_clock_idle_sleep_event(); + + if (!cpu_khz) + goto done; + + data = cyc2ns_write_begin(cpu); + + rdtscll(tsc_now); + ns_now = cycles_2_ns(tsc_now); + + /* + * Compute a new multiplier as per the above comment and ensure our + * time function is continuous; see the comment near struct + * cyc2ns_data. + */ + data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz); + data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; + data->cyc2ns_offset = ns_now - + mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + + cyc2ns_write_end(cpu, data); + +done: + sched_clock_idle_wakeup_event(0); + local_irq_restore(flags); +} /* * Scheduler clock - returns current time in nanosec units. */ u64 native_sched_clock(void) { - u64 this_offset; + u64 tsc_now; /* * Fall back to jiffies if there's no TSC available: @@ -53,16 +285,16 @@ u64 native_sched_clock(void) * very important for it to be as fast as the platform * can achieve it. ) */ - if (unlikely(tsc_disabled)) { + if (!static_key_false(&__use_tsc)) { /* No locking but a rare wrong value is not a big deal: */ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); } /* read the Time Stamp Counter: */ - rdtscll(this_offset); + rdtscll(tsc_now); /* return the value in ns */ - return __cycles_2_ns(this_offset); + return cycles_2_ns(tsc_now); } /* We need to define a real function for sched_clock, to override the @@ -589,61 +821,11 @@ int recalibrate_cpu_khz(void) EXPORT_SYMBOL(recalibrate_cpu_khz); -/* Accelerators for sched_clock() - * convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better precision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ - -DEFINE_PER_CPU(unsigned long, cyc2ns); -DEFINE_PER_CPU(unsigned long long, cyc2ns_offset); - -static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) -{ - unsigned long long tsc_now, ns_now, *offset; - unsigned long flags, *scale; - - local_irq_save(flags); - sched_clock_idle_sleep_event(); - - scale = &per_cpu(cyc2ns, cpu); - offset = &per_cpu(cyc2ns_offset, cpu); - - rdtscll(tsc_now); - ns_now = __cycles_2_ns(tsc_now); - - if (cpu_khz) { - *scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) + - cpu_khz / 2) / cpu_khz; - *offset = ns_now - mult_frac(tsc_now, *scale, - (1UL << CYC2NS_SCALE_FACTOR)); - } - - sched_clock_idle_wakeup_event(0); - local_irq_restore(flags); -} - static unsigned long long cyc2ns_suspend; void tsc_save_sched_clock_state(void) { - if (!sched_clock_stable) + if (!sched_clock_stable()) return; cyc2ns_suspend = sched_clock(); @@ -663,16 +845,26 @@ void tsc_restore_sched_clock_state(void) unsigned long flags; int cpu; - if (!sched_clock_stable) + if (!sched_clock_stable()) return; local_irq_save(flags); - __this_cpu_write(cyc2ns_offset, 0); + /* + * We're comming out of suspend, there's no concurrency yet; don't + * bother being nice about the RCU stuff, just write to both + * data fields. + */ + + this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0); + this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0); + offset = cyc2ns_suspend - sched_clock(); - for_each_possible_cpu(cpu) - per_cpu(cyc2ns_offset, cpu) = offset; + for_each_possible_cpu(cpu) { + per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset; + per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset; + } local_irq_restore(flags); } @@ -795,7 +987,7 @@ void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; - sched_clock_stable = 0; + clear_sched_clock_stable(); disable_sched_clock_irqtime(); pr_info("Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ @@ -995,14 +1187,18 @@ void __init tsc_init(void) * speed as the bootup CPU. (cpufreq notifiers will fix this * up if their speed diverges) */ - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { + cyc2ns_init(cpu); set_cyc2ns_scale(cpu_khz, cpu); + } if (tsc_disabled > 0) return; /* now allow native_sched_clock() to use rdtsc */ + tsc_disabled = 0; + static_key_slow_inc(&__use_tsc); if (!no_sched_irq_time) enable_sched_clock_irqtime(); diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index adfdf56a3714..26488487bc61 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -16,7 +16,6 @@ */ #include <linux/spinlock.h> #include <linux/kernel.h> -#include <linux/init.h> #include <linux/smp.h> #include <linux/nmi.h> #include <asm/tsc.h> |