From e97df76377b8b3b1f7dfd5d6f8a1d5a31438b140 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 5 Feb 2014 20:48:51 +0100 Subject: perf/x86/intel/p6: Add userspace RDPMC quirk for PPro PPro machines can die hard when PCE gets enabled due to a CPU erratum. The safe way it so disable it by default and keep it disabled. See erratum 26 in: http://download.intel.com/design/archives/processors/pro/docs/24268935.pdf Reported-and-Tested-by: Mark Davies Cc: Alan Cox Cc: Stephane Eranian Cc: Vince Weaver Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140206170815.GW2936@laptop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 6 ++++- arch/x86/kernel/cpu/perf_event.h | 1 + arch/x86/kernel/cpu/perf_event_p6.c | 48 +++++++++++++++++++++++++------------ 3 files changed, 39 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b88645191fe5..1246b853c4e0 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1521,6 +1521,8 @@ static int __init init_hw_perf_events(void) pr_cont("%s PMU driver.\n", x86_pmu.name); + x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ + for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) quirk->func(); @@ -1534,7 +1536,6 @@ static int __init init_hw_perf_events(void) __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, 0, x86_pmu.num_counters, 0, 0); - x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ x86_pmu_format_group.attrs = x86_pmu.format_attrs; if (x86_pmu.event_attrs) @@ -1820,6 +1821,9 @@ static ssize_t set_attr_rdpmc(struct device *cdev, if (ret) return ret; + if (x86_pmu.attr_rdpmc_broken) + return -ENOTSUPP; + if (!!val != !!x86_pmu.attr_rdpmc) { x86_pmu.attr_rdpmc = !!val; smp_call_function(change_rdpmc, (void *)val, 1); diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index c1a861829d81..4972c244d0bc 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -409,6 +409,7 @@ struct x86_pmu { /* * sysfs attrs */ + int attr_rdpmc_broken; int attr_rdpmc; struct attribute **format_attrs; struct attribute **event_attrs; diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index b1e2fe115323..7c1a0c07b607 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -231,31 +231,49 @@ static __initconst const struct x86_pmu p6_pmu = { }; +static __init void p6_pmu_rdpmc_quirk(void) +{ + if (boot_cpu_data.x86_mask < 9) { + /* + * PPro erratum 26; fixed in stepping 9 and above. + */ + pr_warn("Userspace RDPMC support disabled due to a CPU erratum\n"); + x86_pmu.attr_rdpmc_broken = 1; + x86_pmu.attr_rdpmc = 0; + } +} + __init int p6_pmu_init(void) { + x86_pmu = p6_pmu; + switch (boot_cpu_data.x86_model) { - case 1: - case 3: /* Pentium Pro */ - case 5: - case 6: /* Pentium II */ - case 7: - case 8: - case 11: /* Pentium III */ - case 9: - case 13: - /* Pentium M */ + case 1: /* Pentium Pro */ + x86_add_quirk(p6_pmu_rdpmc_quirk); + break; + + case 3: /* Pentium II - Klamath */ + case 5: /* Pentium II - Deschutes */ + case 6: /* Pentium II - Mendocino */ break; + + case 7: /* Pentium III - Katmai */ + case 8: /* Pentium III - Coppermine */ + case 10: /* Pentium III Xeon */ + case 11: /* Pentium III - Tualatin */ + break; + + case 9: /* Pentium M - Banias */ + case 13: /* Pentium M - Dothan */ + break; + default: - pr_cont("unsupported p6 CPU model %d ", - boot_cpu_data.x86_model); + pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model); return -ENODEV; } - x86_pmu = p6_pmu; - memcpy(hw_cache_event_ids, p6_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - return 0; } -- cgit From 0e9f2204cfa6d79abe3e525ddf7c4ab5792cc751 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 5 Feb 2014 11:19:56 +0100 Subject: perf/x86: Fix Userspace RDPMC switch The current code forgets to change the CR4 state on the current CPU. Use on_each_cpu() instead of smp_call_function(). Reported-by: Mark Davies Suggested-by: Mark Davies Signed-off-by: Peter Zijlstra Cc: fweisbec@gmail.com Link: http://lkml.kernel.org/n/tip-69efsat90ibhnd577zy3z9gh@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 1246b853c4e0..895604f2e916 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1826,7 +1826,7 @@ static ssize_t set_attr_rdpmc(struct device *cdev, if (!!val != !!x86_pmu.attr_rdpmc) { x86_pmu.attr_rdpmc = !!val; - smp_call_function(change_rdpmc, (void *)val, 1); + on_each_cpu(change_rdpmc, (void *)val, 1); } return count; -- cgit From c091c71ad2218fc50a07b3d1dab85783f3b77efd Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Fri, 24 Jan 2014 14:49:58 +0100 Subject: x86: dma-mapping: fix GFP_ATOMIC macro usage GFP_ATOMIC is not a single gfp flag, but a macro which expands to the other flags, where meaningful is the LACK of __GFP_WAIT flag. To check if caller wants to perform an atomic allocation, the code must test for a lack of the __GFP_WAIT flag. This patch fixes the issue introduced in v3.5-rc1. CC: stable@vger.kernel.org Signed-off-by: Marek Szyprowski --- arch/x86/kernel/pci-dma.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 872079a67e4d..f7d0672481fd 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -100,8 +100,10 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, flag |= __GFP_ZERO; again: page = NULL; - if (!(flag & GFP_ATOMIC)) + /* CMA can be used only in the context which permits sleeping */ + if (flag & __GFP_WAIT) page = dma_alloc_from_contiguous(dev, count, get_order(size)); + /* fallback */ if (!page) page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); if (!page) -- cgit From 5f0e030930d715920be4de638084aaf8653867e8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Feb 2014 13:52:29 +0200 Subject: x86, tsc: Fallback to normal calibration if fast MSR calibration fails If we cannot calibrate TSC via MSR based calibration try_msr_calibrate_tsc() stores zero to fast_calibrate and returns that to the caller. This value gets then propagated further to clockevents code resulting division by zero oops like the one below: divide error: 0000 [#1] PREEMPT SMP Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Tainted: G W 3.13.0+ #47 task: ffff880075508000 ti: ffff880075506000 task.ti: ffff880075506000 RIP: 0010:[] [] clockevents_config.part.3+0x24/0xa0 RSP: 0000:ffff880075507e58 EFLAGS: 00010246 RAX: ffffffffffffffff RBX: ffff880079c0cd80 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffffffffffff RBP: ffff880075507e70 R08: 0000000000000001 R09: 00000000000000be R10: 00000000000000bd R11: 0000000000000003 R12: 000000000000b008 R13: 0000000000000008 R14: 000000000000b010 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff880079c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: ffff880079fff000 CR3: 0000000001c0b000 CR4: 00000000001006f0 Stack: ffff880079c0cd80 000000000000b008 0000000000000008 ffff880075507e88 ffffffff810aecb0 ffff880079c0cd80 ffff880075507e98 ffffffff81030168 ffff880075507ed8 ffffffff81d1104f 00000000000000c3 0000000000000000 Call Trace: [] clockevents_config_and_register+0x20/0x30 [] setup_APIC_timer+0xc8/0xd0 [] setup_boot_APIC_clock+0x4cc/0x4d8 [] native_smp_prepare_cpus+0x3dd/0x3f0 [] kernel_init_freeable+0xc3/0x205 [] ? rest_init+0x90/0x90 [] kernel_init+0xe/0x120 [] ret_from_fork+0x7c/0xb0 [] ? rest_init+0x90/0x90 Prevent this from happening by: 1) Modifying try_msr_calibrate_tsc() to return calibration value or zero if it fails. 2) Check this return value in native_calibrate_tsc() and in case of zero fallback to use normal non-MSR based calibration. [mw: Added subject and changelog] Reported-and-tested-by: Mika Westerberg Signed-off-by: Thomas Gleixner Cc: Bin Gao Cc: One Thousand Gnomes Cc: Ingo Molnar Cc: H. Peter Anvin Link: http://lkml.kernel.org/r/1392810750-18660-1-git-send-email-mika.westerberg@linux.intel.com Signed-off-by: Mika Westerberg Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/tsc.h | 2 +- arch/x86/kernel/tsc.c | 7 ++----- arch/x86/kernel/tsc_msr.c | 28 ++++++++++++++-------------- 3 files changed, 17 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 57ae63cd6ee2..94605c0e9cee 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -66,6 +66,6 @@ extern void tsc_save_sched_clock_state(void); extern void tsc_restore_sched_clock_state(void); /* MSR based TSC calibration for Intel Atom SoC platforms */ -int try_msr_calibrate_tsc(unsigned long *fast_calibrate); +unsigned long try_msr_calibrate_tsc(void); #endif /* _ASM_X86_TSC_H */ diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index acb3b606613e..cfbe99f88830 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -653,13 +653,10 @@ unsigned long native_calibrate_tsc(void) /* Calibrate TSC using MSR for Intel Atom SoCs */ local_irq_save(flags); - i = try_msr_calibrate_tsc(&fast_calibrate); + fast_calibrate = try_msr_calibrate_tsc(); local_irq_restore(flags); - if (i >= 0) { - if (i == 0) - pr_warn("Fast TSC calibration using MSR failed\n"); + if (fast_calibrate) return fast_calibrate; - } local_irq_save(flags); fast_calibrate = quick_pit_calibrate(); diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 8b5434f4389f..5dfff5809e74 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -77,21 +77,18 @@ static int match_cpu(u8 family, u8 model) /* * Do MSR calibration only for known/supported CPUs. - * Return values: - * -1: CPU is unknown/unsupported for MSR based calibration - * 0: CPU is known/supported, but calibration failed - * 1: CPU is known/supported, and calibration succeeded + * + * Returns the calibration value or 0 if MSR calibration failed. */ -int try_msr_calibrate_tsc(unsigned long *fast_calibrate) +unsigned long try_msr_calibrate_tsc(void) { - int cpu_index; u32 lo, hi, ratio, freq_id, freq; + unsigned long res; + int cpu_index; cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model); if (cpu_index < 0) - return -1; - - *fast_calibrate = 0; + return 0; if (freq_desc_tables[cpu_index].msr_plat) { rdmsr(MSR_PLATFORM_INFO, lo, hi); @@ -103,7 +100,7 @@ int try_msr_calibrate_tsc(unsigned long *fast_calibrate) pr_info("Maximum core-clock to bus-clock ratio: 0x%x\n", ratio); if (!ratio) - return 0; + goto fail; /* Get FSB FREQ ID */ rdmsr(MSR_FSB_FREQ, lo, hi); @@ -112,16 +109,19 @@ int try_msr_calibrate_tsc(unsigned long *fast_calibrate) pr_info("Resolved frequency ID: %u, frequency: %u KHz\n", freq_id, freq); if (!freq) - return 0; + goto fail; /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */ - *fast_calibrate = freq * ratio; - pr_info("TSC runs at %lu KHz\n", *fast_calibrate); + res = freq * ratio; + pr_info("TSC runs at %lu KHz\n", res); #ifdef CONFIG_X86_LOCAL_APIC lapic_timer_frequency = (freq * 1000) / HZ; pr_info("lapic_timer_frequency = %d\n", lapic_timer_frequency); #endif + return res; - return 1; +fail: + pr_warn("Fast TSC calibration using MSR failed\n"); + return 0; } -- cgit From 3e11e818bfd7bd4a8e1214970337bab73ffed32d Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Wed, 19 Feb 2014 13:52:30 +0200 Subject: x86: tsc: Add missing Baytrail frequency to the table Intel Baytrail is based on Silvermont core so MSR_FSB_FREQ[2:0] == 0 means that the CPU reference clock runs at 83.3MHz. Add this missing frequency to the table. Signed-off-by: Mika Westerberg Cc: Bin Gao Cc: One Thousand Gnomes Cc: Ingo Molnar Cc: H. Peter Anvin Link: http://lkml.kernel.org/r/1392810750-18660-2-git-send-email-mika.westerberg@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc_msr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 5dfff5809e74..92ae6acac8a7 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -53,7 +53,7 @@ static struct freq_desc freq_desc_tables[] = { /* TNG */ { 6, 0x4a, 1, { 0, FREQ_100, FREQ_133, 0, 0, 0, 0, 0 } }, /* VLV2 */ - { 6, 0x37, 1, { 0, FREQ_100, FREQ_133, FREQ_166, 0, 0, 0, 0 } }, + { 6, 0x37, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_166, 0, 0, 0, 0 } }, /* ANN */ { 6, 0x5a, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_100, 0, 0, 0, 0 } }, }; -- cgit From a3ef2229c94ff70998724cb64b9cb4c77db9e950 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 14 Feb 2014 16:44:08 -0800 Subject: perf, nmi: Fix unknown NMI warning When using BTS on Core i7-4*, I get the below kernel warning. $ perf record -c 1 -e branches:u ls Message from syslogd@labpc1501 at Nov 11 15:49:25 ... kernel:[ 438.317893] Uhhuh. NMI received for unknown reason 31 on CPU 2. Message from syslogd@labpc1501 at Nov 11 15:49:25 ... kernel:[ 438.317920] Do you have a strange power saving mode enabled? Message from syslogd@labpc1501 at Nov 11 15:49:25 ... kernel:[ 438.317945] Dazed and confused, but trying to continue Make intel_pmu_handle_irq() take the full exit path when returning early. Cc: eranian@google.com Cc: peterz@infradead.org Cc: mingo@kernel.org Signed-off-by: Markus Metzger Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392425048-5309-1-git-send-email-andi@firstfloor.org Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/perf_event_intel.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 0fa4f242f050..698ae77d6f18 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1361,10 +1361,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) intel_pmu_disable_all(); handled = intel_pmu_drain_bts_buffer(); status = intel_pmu_get_status(); - if (!status) { - intel_pmu_enable_all(0); - return handled; - } + if (!status) + goto done; loops = 0; again: -- cgit From c9b08884c9c98929ec2d8abafd78e89062d01ee7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Feb 2014 14:29:03 +0100 Subject: perf/x86: Correctly use FEATURE_PDCM The current code simply assumes Intel Arch PerfMon v2+ to have the IA32_PERF_CAPABILITIES MSR; the SDM specifies that we should check CPUID[1].ECX[15] (aka, FEATURE_PDCM) instead. This was found by KVM which implements v2+ but didn't provide the capabilities MSR. Change the code to DTRT; KVM will also implement the MSR and return 0. Cc: pbonzini@redhat.com Reported-by: "Michael S. Tsirkin" Suggested-by: Eduardo Habkost Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140203132903.GI8874@twins.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/perf_event_intel.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 698ae77d6f18..aa333d966886 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2308,10 +2308,7 @@ __init int intel_pmu_init(void) if (version > 1) x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); - /* - * v2 and above have a perf capabilities MSR - */ - if (version > 1) { + if (boot_cpu_has(X86_FEATURE_PDCM)) { u64 capabilities; rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities); -- cgit From 337397f3afc911d94d1d71371a36a53ce218b41f Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 19 Feb 2014 14:10:18 +0100 Subject: perf/x86/uncore: Fix IVT/SNB-EP uncore CBOX NID filter table This patch updates the CBOX PMU filters mapping tables for SNB-EP and IVT (model 45 and 62 respectively). The NID umask always comes in addition to another umask. When set, the NID filter is applied. The current mapping tables were missing some code/umask combinations to account for the NID umask. This patch fixes that. Cc: mingo@elte.hu Cc: ak@linux.intel.com Reviewed-by: Yan, Zheng Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140219131018.GA24475@quad Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 29c248799ced..c88f7f4b03ee 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -501,8 +501,11 @@ static struct extra_reg snbep_uncore_cbox_extra_regs[] = { SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, SNBEP_CBO_PMON_CTL_TID_EN, 0x1), SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6), SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6), SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6), SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6), SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8), SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8), @@ -1178,10 +1181,15 @@ static struct extra_reg ivt_uncore_cbox_extra_regs[] = { SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, SNBEP_CBO_PMON_CTL_TID_EN, 0x1), SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc), SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc), SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc), SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc), SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10), SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10), SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10), -- cgit From b6085a865762236bb84934161273cdac6dd11c2d Mon Sep 17 00:00:00 2001 From: Eugene Surovegin Date: Thu, 23 Jan 2014 09:31:20 -0800 Subject: x86, kaslr: export offset in VMCOREINFO ELF notes Include kASLR offset in VMCOREINFO ELF notes to assist in debugging. [ hpa: pushing this for v3.14 to avoid having a kernel version with kASLR where we can't debug output. ] Signed-off-by: Eugene Surovegin Link: http://lkml.kernel.org/r/20140123173120.GA25474@www.outflux.net Signed-off-by: Kees Cook Signed-off-by: H. Peter Anvin --- arch/x86/kernel/machine_kexec_64.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 4eabc160696f..679cef0791cd 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -279,5 +279,7 @@ void arch_crash_save_vmcoreinfo(void) VMCOREINFO_SYMBOL(node_data); VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); #endif + vmcoreinfo_append_str("KERNELOFFSET=%lx\n", + (unsigned long)&_text - __START_KERNEL); } -- cgit From e290e8c59dbc2a15088d868170d799f763202fef Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 9 Feb 2014 13:56:44 -0800 Subject: x86, kaslr: add missed "static" declarations This silences build warnings about unexported variables and functions. Signed-off-by: Kees Cook Link: http://lkml.kernel.org/r/20140209215644.GA30339@www.outflux.net Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/aslr.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index 90a21f430117..4dbf967da50d 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -111,7 +111,7 @@ struct mem_vector { }; #define MEM_AVOID_MAX 5 -struct mem_vector mem_avoid[MEM_AVOID_MAX]; +static struct mem_vector mem_avoid[MEM_AVOID_MAX]; static bool mem_contains(struct mem_vector *region, struct mem_vector *item) { @@ -180,7 +180,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, } /* Does this memory vector overlap a known avoided area? */ -bool mem_avoid_overlap(struct mem_vector *img) +static bool mem_avoid_overlap(struct mem_vector *img) { int i; @@ -192,8 +192,9 @@ bool mem_avoid_overlap(struct mem_vector *img) return false; } -unsigned long slots[CONFIG_RANDOMIZE_BASE_MAX_OFFSET / CONFIG_PHYSICAL_ALIGN]; -unsigned long slot_max = 0; +static unsigned long slots[CONFIG_RANDOMIZE_BASE_MAX_OFFSET / + CONFIG_PHYSICAL_ALIGN]; +static unsigned long slot_max; static void slots_append(unsigned long addr) { -- cgit From 404381c5839d67aa0c275ad1da96ef3d3928ca2c Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 24 Feb 2014 13:59:32 -0300 Subject: KVM: MMU: drop read-only large sptes when creating lower level sptes Read-only large sptes can be created due to read-only faults as follows: - QEMU pagetable entry that maps guest memory is read-only due to COW. - Guest read faults such memory, COW is not broken, because it is a read-only fault. - Enable dirty logging, large spte not nuked because it is read-only. - Write-fault on such memory causes guest to loop endlessly (which must go down to level 1 because dirty logging is enabled). Fix by dropping large spte when necessary. Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e50425d0f5f7..9b531351a587 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2672,6 +2672,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, break; } + drop_large_spte(vcpu, iterator.sptep); if (!is_shadow_present_pte(*iterator.sptep)) { u64 base_addr = iterator.addr; -- cgit From 26e61e8939b1fe8729572dabe9a9e97d930dd4f6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Feb 2014 16:03:12 +0100 Subject: perf/x86: Fix event scheduling Vince "Super Tester" Weaver reported a new round of syscall fuzzing (Trinity) failures, with perf WARN_ON()s triggering. He also provided traces of the failures. This is I think the relevant bit: > pec_1076_warn-2804 [000] d... 147.926153: x86_pmu_disable: x86_pmu_disable > pec_1076_warn-2804 [000] d... 147.926153: x86_pmu_state: Events: { > pec_1076_warn-2804 [000] d... 147.926156: x86_pmu_state: 0: state: .R config: ffffffffffffffff ( (null)) > pec_1076_warn-2804 [000] d... 147.926158: x86_pmu_state: 33: state: AR config: 0 (ffff88011ac99800) > pec_1076_warn-2804 [000] d... 147.926159: x86_pmu_state: } > pec_1076_warn-2804 [000] d... 147.926160: x86_pmu_state: n_events: 1, n_added: 0, n_txn: 1 > pec_1076_warn-2804 [000] d... 147.926161: x86_pmu_state: Assignment: { > pec_1076_warn-2804 [000] d... 147.926162: x86_pmu_state: 0->33 tag: 1 config: 0 (ffff88011ac99800) > pec_1076_warn-2804 [000] d... 147.926163: x86_pmu_state: } > pec_1076_warn-2804 [000] d... 147.926166: collect_events: Adding event: 1 (ffff880119ec8800) So we add the insn:p event (fd[23]). At this point we should have: n_events = 2, n_added = 1, n_txn = 1 > pec_1076_warn-2804 [000] d... 147.926170: collect_events: Adding event: 0 (ffff8800c9e01800) > pec_1076_warn-2804 [000] d... 147.926172: collect_events: Adding event: 4 (ffff8800cbab2c00) We try and add the {BP,cycles,br_insn} group (fd[3], fd[4], fd[15]). These events are 0:cycles and 4:br_insn, the BP event isn't x86_pmu so that's not visible. group_sched_in() pmu->start_txn() /* nop - BP pmu */ event_sched_in() event->pmu->add() So here we should end up with: 0: n_events = 3, n_added = 2, n_txn = 2 4: n_events = 4, n_added = 3, n_txn = 3 But seeing the below state on x86_pmu_enable(), the must have failed, because the 0 and 4 events aren't there anymore. Looking at group_sched_in(), since the BP is the leader, its event_sched_in() must have succeeded, for otherwise we would not have seen the sibling adds. But since neither 0 or 4 are in the below state; their event_sched_in() must have failed; but I don't see why, the complete state: 0,0,1:p,4 fits perfectly fine on a core2. However, since we try and schedule 4 it means the 0 event must have succeeded! Therefore the 4 event must have failed, its failure will have put group_sched_in() into the fail path, which will call: event_sched_out() event->pmu->del() on 0 and the BP event. Now x86_pmu_del() will reduce n_events; but it will not reduce n_added; giving what we see below: n_event = 2, n_added = 2, n_txn = 2 > pec_1076_warn-2804 [000] d... 147.926177: x86_pmu_enable: x86_pmu_enable > pec_1076_warn-2804 [000] d... 147.926177: x86_pmu_state: Events: { > pec_1076_warn-2804 [000] d... 147.926179: x86_pmu_state: 0: state: .R config: ffffffffffffffff ( (null)) > pec_1076_warn-2804 [000] d... 147.926181: x86_pmu_state: 33: state: AR config: 0 (ffff88011ac99800) > pec_1076_warn-2804 [000] d... 147.926182: x86_pmu_state: } > pec_1076_warn-2804 [000] d... 147.926184: x86_pmu_state: n_events: 2, n_added: 2, n_txn: 2 > pec_1076_warn-2804 [000] d... 147.926184: x86_pmu_state: Assignment: { > pec_1076_warn-2804 [000] d... 147.926186: x86_pmu_state: 0->33 tag: 1 config: 0 (ffff88011ac99800) > pec_1076_warn-2804 [000] d... 147.926188: x86_pmu_state: 1->0 tag: 1 config: 1 (ffff880119ec8800) > pec_1076_warn-2804 [000] d... 147.926188: x86_pmu_state: } > pec_1076_warn-2804 [000] d... 147.926190: x86_pmu_enable: S0: hwc->idx: 33, hwc->last_cpu: 0, hwc->last_tag: 1 hwc->state: 0 So the problem is that x86_pmu_del(), when called from a group_sched_in() that fails (for whatever reason), and without x86_pmu TXN support (because the leader is !x86_pmu), will corrupt the n_added state. Reported-and-Tested-by: Vince Weaver Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Cc: Stephane Eranian Cc: Dave Jones Cc: Link: http://lkml.kernel.org/r/20140221150312.GF3104@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 895604f2e916..79f9f848bee4 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1192,6 +1192,9 @@ static void x86_pmu_del(struct perf_event *event, int flags) for (i = 0; i < cpuc->n_events; i++) { if (event == cpuc->event_list[i]) { + if (i >= cpuc->n_events - cpuc->n_added) + --cpuc->n_added; + if (x86_pmu.put_event_constraints) x86_pmu.put_event_constraints(cpuc, event); -- cgit From a08d3b3b99efd509133946056531cdf8f3a0c09b Mon Sep 17 00:00:00 2001 From: Andrew Honig Date: Thu, 27 Feb 2014 19:35:14 +0100 Subject: kvm: x86: fix emulator buffer overflow (CVE-2014-0049) The problem occurs when the guest performs a pusha with the stack address pointing to an mmio address (or an invalid guest physical address) to start with, but then extending into an ordinary guest physical address. When doing repeated emulated pushes emulator_read_write sets mmio_needed to 1 on the first one. On a later push when the stack points to regular memory, mmio_nr_fragments is set to 0, but mmio_is_needed is not set to 0. As a result, KVM exits to userspace, and then returns to complete_emulated_mmio. In complete_emulated_mmio vcpu->mmio_cur_fragment is incremented. The termination condition of vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments is never achieved. The code bounces back and fourth to userspace incrementing mmio_cur_fragment past it's buffer. If the guest does nothing else it eventually leads to a a crash on a memcpy from invalid memory address. However if a guest code can cause the vm to be destroyed in another vcpu with excellent timing, then kvm_clear_async_pf_completion_queue can be used by the guest to control the data that's pointed to by the call to cancel_work_item, which can be used to gain execution. Fixes: f78146b0f9230765c6315b2e14f56112513389ad Signed-off-by: Andrew Honig Cc: stable@vger.kernel.org (3.5+) Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 39c28f09dfd5..2b8578432d5b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6186,7 +6186,7 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) frag->len -= len; } - if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { + if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) { vcpu->mmio_needed = 0; /* FIXME: return into emulator if single-stepping. */ -- cgit From 1b385cbdd74aa803e966e01e5fe49490d6044e30 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 27 Feb 2014 22:54:11 +0100 Subject: kvm, vmx: Really fix lazy FPU on nested guest Commit e504c9098ed6 (kvm, vmx: Fix lazy FPU on nested guest, 2013-11-13) highlighted a real problem, but the fix was subtly wrong. nested_read_cr0 is the CR0 as read by L2, but here we want to look at the CR0 value reflecting L1's setup. In other words, L2 might think that TS=0 (so nested_read_cr0 has the bit clear); but if L1 is actually running it with TS=1, we should inject the fault into L1. The effective value of CR0 in L2 is contained in vmcs12->guest_cr0, use it. Fixes: e504c9098ed6acd9e1079c5e10e4910724ad429f Reported-by: Kashyap Chamarty Reported-by: Stefan Bader Tested-by: Kashyap Chamarty Tested-by: Anthoine Bourgeois Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a06f101ef64b..392752834751 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6688,7 +6688,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) else if (is_page_fault(intr_info)) return enable_ept; else if (is_no_device(intr_info) && - !(nested_read_cr0(vmcs12) & X86_CR0_TS)) + !(vmcs12->guest_cr0 & X86_CR0_TS)) return 0; return vmcs12->exception_bitmap & (1u << (intr_info & INTR_INFO_VECTOR_MASK)); -- cgit From a5d90c923bcfb9632d998ed06e9569216ad695f3 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 4 Mar 2014 17:02:17 +0100 Subject: x86/efi: Quirk out SGI UV Alex reported hitting the following BUG after the EFI 1:1 virtual mapping work was merged, kernel BUG at arch/x86/mm/init_64.c:351! invalid opcode: 0000 [#1] SMP Call Trace: [] init_extra_mapping_uc+0x13/0x15 [] uv_system_init+0x22b/0x124b [] ? clockevents_register_device+0x138/0x13d [] ? setup_APIC_timer+0xc5/0xc7 [] ? clockevent_delta2ns+0xb/0xd [] ? setup_boot_APIC_clock+0x4a8/0x4b7 [] ? printk+0x72/0x74 [] native_smp_prepare_cpus+0x389/0x3d6 [] kernel_init_freeable+0xb7/0x1fb [] ? rest_init+0x74/0x74 [] kernel_init+0x9/0xff [] ret_from_fork+0x7c/0xb0 [] ? rest_init+0x74/0x74 Getting this thing to work with the new mapping scheme would need more work, so automatically switch to the old memmap layout for SGI UV. Acked-by: Russ Anderson Cc: Alex Thorlton Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 1 + arch/x86/kernel/setup.c | 10 ++-------- arch/x86/platform/efi/efi.c | 20 ++++++++++++++++++++ 3 files changed, 23 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 3d6b9f81cc68..acd86c850414 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -134,6 +134,7 @@ extern void efi_setup_page_tables(void); extern void __init old_map_region(efi_memory_desc_t *md); extern void __init runtime_code_page_mkexec(void); extern void __init efi_runtime_mkexec(void); +extern void __init efi_apply_memmap_quirks(void); struct efi_setup_data { u64 fw_vendor; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 06853e670354..ce72964b2f46 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1239,14 +1239,8 @@ void __init setup_arch(char **cmdline_p) register_refined_jiffies(CLOCK_TICK_RATE); #ifdef CONFIG_EFI - /* Once setup is done above, unmap the EFI memory map on - * mismatched firmware/kernel archtectures since there is no - * support for runtime services. - */ - if (efi_enabled(EFI_BOOT) && !efi_is_native()) { - pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n"); - efi_unmap_memmap(); - } + if (efi_enabled(EFI_BOOT)) + efi_apply_memmap_quirks(); #endif } diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 1a201ac7cef8..b97acecf3fd9 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -52,6 +52,7 @@ #include #include #include +#include #define EFI_DEBUG @@ -1210,3 +1211,22 @@ static int __init parse_efi_cmdline(char *str) return 0; } early_param("efi", parse_efi_cmdline); + +void __init efi_apply_memmap_quirks(void) +{ + /* + * Once setup is done earlier, unmap the EFI memory map on mismatched + * firmware/kernel architectures since there is no support for runtime + * services. + */ + if (!efi_is_native()) { + pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n"); + efi_unmap_memmap(); + } + + /* + * UV doesn't support the new EFI pagetable mapping yet. + */ + if (is_uv_system()) + set_bit(EFI_OLD_MEMMAP, &x86_efi_facility); +} -- cgit From 0ac09f9f8cd1fb028a48330edba6023d347d3cea Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 28 Feb 2014 17:05:26 +0100 Subject: x86, trace: Fix CR2 corruption when tracing page faults The trace_do_page_fault function trigger tracepoint and then handles the actual page fault. This could lead to error if the tracepoint caused page fault. The original cr2 value gets lost and the original page fault handler kills current process with SIGSEGV. This happens if you record page faults with callchain data, the user part of it will cause tracepoint handler to page fault: # perf record -g -e exceptions:page_fault_user ls Fixing this by saving the original cr2 value and using it after tracepoint handler is done. v2: Moving the cr2 read before exception_enter, because it could trigger tracepoint as well. Reported-by: Arnaldo Carvalho de Melo Reported-by: Vince Weaver Tested-by: Vince Weaver Acked-by: Steven Rostedt Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Seiji Aguchi Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1402211701380.6395@vincent-weaver-1.um.maine.edu Link: http://lkml.kernel.org/r/20140228160526.GD1133@krava.brq.redhat.com --- arch/x86/mm/fault.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 6dea040cc3a1..e7fa28bf3262 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1022,11 +1022,11 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) * routines. */ static void __kprobes -__do_page_fault(struct pt_regs *regs, unsigned long error_code) +__do_page_fault(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { struct vm_area_struct *vma; struct task_struct *tsk; - unsigned long address; struct mm_struct *mm; int fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; @@ -1034,9 +1034,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) tsk = current; mm = tsk->mm; - /* Get the faulting address: */ - address = read_cr2(); - /* * Detect and handle instructions that would cause a page fault for * both a tracked kernel page and a userspace page. @@ -1252,9 +1249,11 @@ dotraplinkage void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) { enum ctx_state prev_state; + /* Get the faulting address: */ + unsigned long address = read_cr2(); prev_state = exception_enter(); - __do_page_fault(regs, error_code); + __do_page_fault(regs, error_code, address); exception_exit(prev_state); } @@ -1271,9 +1270,16 @@ dotraplinkage void __kprobes trace_do_page_fault(struct pt_regs *regs, unsigned long error_code) { enum ctx_state prev_state; + /* + * The exception_enter and tracepoint processing could + * trigger another page faults (user space callchain + * reading) and destroy the original cr2 value, so read + * the faulting address now. + */ + unsigned long address = read_cr2(); prev_state = exception_enter(); trace_page_fault_entries(regs, error_code); - __do_page_fault(regs, error_code); + __do_page_fault(regs, error_code, address); exception_exit(prev_state); } -- cgit