From 6ec3cfeca04622e3d80c9270191cd7f5f88214af Mon Sep 17 00:00:00 2001
From: "Pallipadi, Venkatesh"
Date: Mon, 13 Apr 2009 15:20:58 -0700
Subject: x86, irq: Remove IRQ_DISABLED check in process context IRQ move

As discussed in the thread here:

  http://marc.info/?l=linux-kernel&m=123964468521142&w=2

Eric W. Biederman observed:

> It looks like some additional bugs have slipped in since last I looked.
>
> set_irq_affinity does this:
>
> #ifdef CONFIG_GENERIC_PENDING_IRQ
>	if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
>		cpumask_copy(desc->affinity, cpumask);
>		desc->chip->set_affinity(irq, cpumask);
>	} else {
>		desc->status |= IRQ_MOVE_PENDING;
>		cpumask_copy(desc->pending_mask, cpumask);
>	}
> #else
>
> That IRQ_DISABLED case is a software state and as such it has nothing
> to do with how safe it is to move an irq in process context.

[...]

> The only reason we migrate MSIs in interrupt context today is that
> there wasn't infrastructure for supporting migration both in interrupt
> context and outside of it.

Yes. The idea here was to force the MSI migration to happen in process
context. One of the patches in the series did

	disable_irq(dev->irq);
	irq_set_affinity(dev->irq, cpumask_of(dev->cpu));
	enable_irq(dev->irq);

with the above patch adding a check in the irq/manage code for the
interrupt being disabled, and moving the interrupt in process context.

IIRC, there was no IRQ_MOVE_PCNTXT when we were developing this HPET
code and we ended up with this ugly hack. IRQ_MOVE_PCNTXT was there by
the time we eventually submitted the patch upstream, but it looks like
I did a blind rebase instead of using IRQ_MOVE_PCNTXT in the hpet MSI
code.

The patch below fixes this: i.e., it reverts commit
932775a4ab622e3c99bd59f14cc and adds PCNTXT to the HPET MSI setup. It
also removes the copying of desc->affinity in generic code, as the
set_affinity routines do that internally.

Reported-by: "Eric W. Biederman"
Signed-off-by: Venkatesh Pallipadi
Acked-by: "Eric W. Biederman"
Cc: "Li Shaohua"
Cc: Gary Hade
Cc: "lcm@us.ibm.com"
Cc: suresh.b.siddha@intel.com
LKML-Reference: <20090413222058.GB8211@linux-os.sc.intel.com>
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/apic/io_apic.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index a2789e42e162..30da617d18e4 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3670,12 +3670,14 @@ int arch_setup_hpet_msi(unsigned int irq)
 {
 	int ret;
 	struct msi_msg msg;
+	struct irq_desc *desc = irq_to_desc(irq);
 
 	ret = msi_compose_msg(NULL, irq, &msg);
 	if (ret < 0)
 		return ret;
 
 	hpet_msi_write(irq, &msg);
+	desc->status |= IRQ_MOVE_PCNTXT;
 	set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq,
 				      "edge");
 
-- cgit
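A note on the mechanism above: with the bogus IRQ_DISABLED test gone,
the generic affinity-setting path reduces to the sketch below (a
simplification based on the code quoted in the commit message, not the
exact tree state):

	#ifdef CONFIG_GENERIC_PENDING_IRQ
		if (desc->status & IRQ_MOVE_PCNTXT) {
			/* The chip supports it: migrate right here,
			 * in process context. (The set_affinity
			 * routine updates desc->affinity itself.) */
			desc->chip->set_affinity(irq, cpumask);
		} else {
			/* Otherwise defer: the move is completed from
			 * the next interrupt, where it is safe. */
			desc->status |= IRQ_MOVE_PENDING;
			cpumask_copy(desc->pending_mask, cpumask);
		}
	#endif

IRQ_MOVE_PCNTXT is thus the only state that legitimately authorizes an
immediate move; a merely-disabled interrupt still has to take the
pending path.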
From 27229ca63269c1be0a710f074fa9de4024f283f1 Mon Sep 17 00:00:00 2001
From: Jack Steiner
Date: Fri, 17 Apr 2009 09:24:47 -0500
Subject: x86/uv: fix init of cpu-less nodes

Fix an endcase in the UV initialization code for the "UV large system
mode" of apicids: if node zero contains no cpus, the boot cpu will be
on another node. The percpu data that contains the extra apicid bits
was not being initialized early enough.

[ Impact: fix potential boot crash on cpu-less UV nodes ]

Signed-off-by: Jack Steiner
LKML-Reference: <20090417142447.GA23759@sgi.com>
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 1248318436e8..49d2f5c739b7 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -34,6 +35,17 @@ DEFINE_PER_CPU(int, x2apic_extra_bits);
 
 static enum uv_system_type uv_system_type;
 
+static int early_get_nodeid(void)
+{
+	union uvh_node_id_u node_id;
+	unsigned long *mmr;
+
+	mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr));
+	node_id.v = *mmr;
+	early_iounmap(mmr, sizeof(*mmr));
+	return node_id.s.node_id;
+}
+
 static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
 	if (!strcmp(oem_id, "SGI")) {
@@ -42,6 +54,8 @@ static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 		else if (!strcmp(oem_table_id, "UVX"))
 			uv_system_type = UV_X2APIC;
 		else if (!strcmp(oem_table_id, "UVH")) {
+			__get_cpu_var(x2apic_extra_bits) =
+				early_get_nodeid() << (UV_APIC_PNODE_SHIFT - 1);
 			uv_system_type = UV_NON_UNIQUE_APIC;
 			return 1;
 		}
-- cgit
From 0300e7f1a525ae4e4ac05344624adf0e5f13ea52 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Fri, 17 Apr 2009 08:33:52 -0400
Subject: lockdep, x86: account for irqs enabled in paranoid_exit

I hit the check_flags error of lockdep:

  WARNING: at kernel/lockdep.c:2893 check_flags+0x1a7/0x1d0()
  [...]
  hardirqs last  enabled at (12567): [] local_bh_enable+0xaa/0x110
  hardirqs last disabled at (12569): [] int3+0x16/0x40
  softirqs last  enabled at (12566): [] lock_sock_nested+0xfb/0x110
  softirqs last disabled at (12568): [] tcp_prequeue_process+0x2e/0xa0

The check_flags warning of lockdep tells me that lockdep thought
interrupts were disabled, but they were really enabled.

The numbers in the parentheses above show the order of events:

  12566: softirqs last enabled:  lock_sock_nested
  12567: hardirqs last enabled:  local_bh_enable
  12568: softirqs last disabled: tcp_prequeue_process
  12569: hardirqs last disabled: int3

int3 is a breakpoint! Examining this further, I have CONFIG_NET_TCPPROBE
enabled, which adds break points into the kernel.

The paranoid_exit of the return of int3 does not account for enabling
interrupts on return to kernel. This code is a bit tricky since it is
also used by the nmi handler (when lockdep is off), and we must be
careful about the swapgs. We can not call kernel code after the swapgs
has been performed.

[ Impact: fix lockdep check_flags warning + self-turn-off ]

Acked-by: Peter Zijlstra
Signed-off-by: Steven Rostedt
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/entry_64.S | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a331ec38af9e..38946c6e8433 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1410,7 +1410,10 @@ ENTRY(paranoid_exit)
 paranoid_swapgs:
 	TRACE_IRQS_IRETQ 0
 	SWAPGS_UNSAFE_STACK
+	RESTORE_ALL 8
+	jmp irq_return
 paranoid_restore:
+	TRACE_IRQS_IRETQ 0
 	RESTORE_ALL 8
 	jmp irq_return
 paranoid_userspace:
-- cgit

From fc61e6636d13eb3a23eb29b4327eeee9de0ef3bc Mon Sep 17 00:00:00 2001
From: Jack Steiner
Date: Mon, 20 Apr 2009 08:25:31 -0500
Subject: x86/uv: fix for no memory at paddr 0

Fix the endcase where the memory at physical address 0 does not really
exist AND one of the sockets on blade 0 has no active cpus.

The memory that _appears_ to be at physical address 0 is actually
memory that is located at a different address but has been remapped by
the chipset so that it appears to be at physical address 0.

When determining the UV pnode, the algorithm incorrectly used the
relocated physical address instead of the actual (global) address.

[ Impact: boot failure on partitioned systems ]

Signed-off-by: Jack Steiner
LKML-Reference: <20090420132530.GA23156@sgi.com>
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index d6712334ce4b..2bda69352976 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -652,6 +652,7 @@ void __init uv_system_init(void)
 		if (uv_node_to_blade[nid] >= 0)
 			continue;
 		paddr = node_start_pfn(nid) << PAGE_SHIFT;
+		paddr = uv_soc_phys_ram_to_gpa(paddr);
 		pnode = (paddr >> m_val) & pnode_mask;
 		blade = boot_pnode_to_blade(pnode);
 		uv_node_to_blade[nid] = blade;
-- cgit

From 06c38d5e36b12d040839ff224e805146c0368556 Mon Sep 17 00:00:00 2001
From: Suresh Siddha
Date: Thu, 9 Apr 2009 15:24:34 -0700
Subject: x86-64: fix FPU corruption with signals and preemption

In the 64bit signal delivery path, clear_used_math() was happening
before saving the current active FPU state onto the user stack for
signal handling. Between clear_used_math() and the state store onto the
user stack, we can potentially get a page fault for the user address
and block. In fact, while testing we were hitting the might_fault() in
__clear_user(), which can do a schedule().

At a later point in time, we will schedule back into this process and
resume the state save (using the "xsave/fxsave" instruction), which can
lead to a DNA fault. And as used_math was cleared before, we will
reinit the FP state in the DNA fault and continue. This reinit results
in losing the FPU state of the process.

Move clear_used_math() to a point after the FPU state has been stored
onto the user stack.

This issue has been present for a long time (even before the xsave
changes and the x86 merge). But it can easily be exposed in the
2.6.28.x and 2.6.29.x series because of the __clear_user() in this
path, which has an explicit __cond_resched() leading to a context
switch with CONFIG_PREEMPT_VOLUNTARY.

[ Impact: fix FPU state corruption ]

Signed-off-by: Suresh Siddha
Cc: [2.6.28.x, 2.6.29.x]
Signed-off-by: H. Peter Anvin
---
 arch/x86/kernel/xsave.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 0a5b04aa98f1..c5ee17e8c6d9 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -89,7 +89,7 @@ int save_i387_xstate(void __user *buf)
 	if (!used_math())
 		return 0;
-	clear_used_math(); /* trigger finit */
+
 	if (task_thread_info(tsk)->status & TS_USEDFPU) {
 		/*
 		 * Start with clearing the user buffer. This will present a
@@ -114,6 +114,8 @@ int save_i387_xstate(void __user *buf)
 			return -1;
 	}
 
+	clear_used_math(); /* trigger finit */
+
 	if (task_thread_info(tsk)->status & TS_XSAVE) {
 		struct _fpstate __user *fx = buf;
 		struct _xstate __user *x = buf;
-- cgit
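The ordering at the heart of the FPU fix above is worth spelling out.
A condensed sketch of the corrected save path (simplified, with
approximated 2.6.29-era field names; not the literal kernel code):

	int save_fpu_state_to_user(void __user *buf, unsigned int size)
	{
		/*
		 * The copy may fault on the user stack and schedule().
		 * The task must still be marked as using math at that
		 * point, or the DNA fault taken when we are scheduled
		 * back in reinits (and thus destroys) the live state.
		 */
		if (__copy_to_user(buf, &current->thread.xstate->fxsave,
				   size))
			return -1;

		clear_used_math();	/* trigger finit - only now safe */
		return 0;
	}

The invariant: used_math must stay set for as long as the in-register
or just-saved state is still the authoritative copy.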
From 2f537a9f8e82f55c241b002c8cfbf34303b45ada Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Tue, 21 Apr 2009 16:00:15 +0930
Subject: x86: fix boot crash in NMI watchdog with CONFIG_CPUMASK_OFFSTACK=y and flat APIC

fcef8576d8a64fc603e719c97d423f9f6d4e0e8b converted backtrace_mask to a
cpumask_var_t, and assumed check_nmi_watchdog was called before
nmi_watchdog_tick was ever called. Steven's oops shows I was wrong.

This is something of a bandaid: I'm not sure we *should* be calling
nmi_watchdog_tick before check_nmi_watchdog. Note that gcc eliminates
this test for the CONFIG_CPUMASK_OFFSTACK=n case.

[ Impact: fix boot crash in rare configs ]

Reported-by: Steven Rostedt
Signed-off-by: Rusty Russell
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/apic/nmi.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index d6bd62407152..2ba52f35a88c 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -414,7 +414,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
 		touched = 1;
 	}
 
-	if (cpumask_test_cpu(cpu, backtrace_mask)) {
+	/* We can be called before check_nmi_watchdog, hence NULL check. */
+	if (backtrace_mask != NULL && cpumask_test_cpu(cpu, backtrace_mask)) {
 		static DEFINE_SPINLOCK(lock);	/* Serialise the printks */
 
 		spin_lock(&lock);
-- cgit

From fcc5c4a2feea3886dc058498b28508b2731720d5 Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Tue, 21 Apr 2009 16:03:41 +0930
Subject: x86: avoid theoretical spurious NMI backtraces with CONFIG_CPUMASK_OFFSTACK=y

In theory (though not shown in practice) alloc_cpumask_var() doesn't
zero memory, so CPUs might print an "NMI backtrace for cpu %d" once on
boot. (The bug was introduced in
fcef8576d8a64fc603e719c97d423f9f6d4e0e8b.)

[ Impact: avoid theoretical syslog noise in rare configs ]

Signed-off-by: Rusty Russell
Cc: Steven Rostedt
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/apic/nmi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 2ba52f35a88c..ce4fbfa315a1 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -138,7 +138,7 @@ int __init check_nmi_watchdog(void)
 	if (!prev_nmi_count)
 		goto error;
 
-	alloc_cpumask_var(&backtrace_mask, GFP_KERNEL);
+	alloc_cpumask_var(&backtrace_mask, GFP_KERNEL|__GFP_ZERO);
 
 	printk(KERN_INFO "Testing NMI watchdog ... ");
 
 #ifdef CONFIG_SMP
-- cgit
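For reference, __GFP_ZERO makes the allocator hand back zeroed memory,
so the fix above is equivalent to clearing the mask by hand after a
plain allocation - a sketch:

	/* Variant used by the patch: zeroed at allocation time. */
	alloc_cpumask_var(&backtrace_mask, GFP_KERNEL | __GFP_ZERO);

	/* Equivalent two-step variant:
	 *
	 *	alloc_cpumask_var(&backtrace_mask, GFP_KERNEL);
	 *	cpumask_clear(backtrace_mask);
	 */

(Around this time a zalloc_cpumask_var() helper also appeared for
exactly this pattern; whether it was available in this tree is not
something this log shows.)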
"); #ifdef CONFIG_SMP -- cgit From 7a6f9cbb37120c745fc187083fb5c3de4dca4f97 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 21 Apr 2009 20:00:37 +0200 Subject: x86: hpet: fix periodic mode programming on AMD 81xx (See http://bugzilla.kernel.org/show_bug.cgi?id=12961) It partially reverts commit c23e253e67c9d8a91a0ffa33c1f571a17f0a2403 (x86: hpet: stop HPET_COUNTER when programming periodic mode) HPET on AMD 81xx chipset needs a second write (with HPET_TN_SETVAL cleared) to T0_CMP register to set the period in periodic mode. With this patch HPET_COUNTER is still stopped but not reset when HPET is programmed in periodic mode. This should help to avoid races when HPET is programmed in periodic mode and fixes a boot time hang that I've observed on a machine when using 1000HZ. [ Impact: fix boot time hang on machines with AMD 81xx chipset ] Reported-by: Jeff Mahoney Signed-off-by: Andreas Herrmann Tested-by: Jeff Mahoney LKML-Reference: <20090421180037.GA2763@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 3f0019e0a229..81408b93f887 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -236,6 +236,10 @@ static void hpet_stop_counter(void) unsigned long cfg = hpet_readl(HPET_CFG); cfg &= ~HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); +} + +static void hpet_reset_counter(void) +{ hpet_writel(0, HPET_COUNTER); hpet_writel(0, HPET_COUNTER + 4); } @@ -250,6 +254,7 @@ static void hpet_start_counter(void) static void hpet_restart_counter(void) { hpet_stop_counter(); + hpet_reset_counter(); hpet_start_counter(); } @@ -309,7 +314,7 @@ static int hpet_setup_msi_irq(unsigned int irq); static void hpet_set_mode(enum clock_event_mode mode, struct clock_event_device *evt, int timer) { - unsigned long cfg; + unsigned long cfg, cmp, now; uint64_t delta; switch (mode) { @@ -317,12 +322,23 @@ static void hpet_set_mode(enum clock_event_mode mode, hpet_stop_counter(); delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; delta >>= evt->shift; + now = hpet_readl(HPET_COUNTER); + cmp = now + (unsigned long) delta; cfg = hpet_readl(HPET_Tn_CFG(timer)); /* Make sure we use edge triggered interrupts */ cfg &= ~HPET_TN_LEVEL; cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | HPET_TN_32BIT; hpet_writel(cfg, HPET_Tn_CFG(timer)); + hpet_writel(cmp, HPET_Tn_CMP(timer)); + udelay(1); + /* + * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL + * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL + * bit is automatically cleared after the first write. + * (See AMD-8111 HyperTransport I/O Hub Data Sheet, + * Publication # 24674) + */ hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer)); hpet_start_counter(); hpet_print_config(); -- cgit From c5428e950ad42640f00092949fd17e356dfdeafa Mon Sep 17 00:00:00 2001 From: Coly Li Date: Wed, 22 Apr 2009 23:21:56 +0800 Subject: uv_time: add parameter to uv_read_rtc() uv_read_rtc() is referenced by read member of struct clocksource clocksource_uv. In include/linux/clocksource.h, read of struct clocksource is declared as: cycle_t (*read)(struct clocksource *cs) This got introduced recently in: 8e19608: clocksource: pass clocksource to read() callback But arch/x86/kernel/uv_time.c was not properly converted by that pach. 
From c5428e950ad42640f00092949fd17e356dfdeafa Mon Sep 17 00:00:00 2001
From: Coly Li
Date: Wed, 22 Apr 2009 23:21:56 +0800
Subject: uv_time: add parameter to uv_read_rtc()

uv_read_rtc() is referenced by the read member of struct clocksource
clocksource_uv. In include/linux/clocksource.h, read of struct
clocksource is declared as:

	cycle_t (*read)(struct clocksource *cs)

This got introduced recently in:

  8e19608: clocksource: pass clocksource to read() callback

But arch/x86/kernel/uv_time.c was not properly converted by that patch.

This patch adds a dummy parameter (of type struct clocksource *) to
uv_read_rtc() to fix the incompatible reference in clocksource_uv, and
passes NULL in all places where uv_read_rtc() gets called.

[ Impact: cleanup, address compiler warning ]

Signed-off-by: Coly Li
Cc: Dimitri Sivanich
Cc: Magnus Damm
Cc: Andrew Morton
Cc: Hugh Dickins
LKML-Reference: <49EF3614.1050806@suse.de>
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/uv_time.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
index 2ffb6c53326e..583f11d5c480 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/kernel/uv_time.c
@@ -29,7 +29,7 @@
 
 #define RTC_NAME	"sgi_rtc"
 
-static cycle_t uv_read_rtc(void);
+static cycle_t uv_read_rtc(struct clocksource *cs);
 static int uv_rtc_next_event(unsigned long, struct clock_event_device *);
 static void uv_rtc_timer_setup(enum clock_event_mode,
 				struct clock_event_device *);
@@ -123,7 +123,7 @@ static int uv_setup_intr(int cpu, u64 expires)
 	/* Initialize comparator value */
 	uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
 
-	return (expires < uv_read_rtc() && !uv_intr_pending(pnode));
+	return (expires < uv_read_rtc(NULL) && !uv_intr_pending(pnode));
 }
 
 /*
@@ -256,7 +256,7 @@ static int uv_rtc_unset_timer(int cpu)
 
 	spin_lock_irqsave(&head->lock, flags);
 
-	if (head->next_cpu == bcpu && uv_read_rtc() >= *t)
+	if (head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t)
 		rc = 1;
 
 	*t = ULLONG_MAX;
@@ -278,7 +278,7 @@ static int uv_rtc_unset_timer(int cpu)
 /*
  * Read the RTC.
  */
-static cycle_t uv_read_rtc(void)
+static cycle_t uv_read_rtc(struct clocksource *cs)
 {
 	return (cycle_t)uv_read_local_mmr(UVH_RTC);
 }
@@ -291,7 +291,7 @@ static int uv_rtc_next_event(unsigned long delta,
 {
 	int ced_cpu = cpumask_first(ced->cpumask);
 
-	return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc());
+	return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc(NULL));
 }
 
 /*
-- cgit
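The shape of the fixed-up registration, for context: every
clocksource's read callback now takes the clocksource pointer, even
when - as here - the callback ignores it. A sketch (field values
illustrative, not taken from this patch):

	static cycle_t uv_read_rtc(struct clocksource *cs)
	{
		return (cycle_t)uv_read_local_mmr(UVH_RTC);
	}

	static struct clocksource clocksource_uv = {
		.name	= RTC_NAME,
		/* must match: cycle_t (*read)(struct clocksource *cs) */
		.read	= uv_read_rtc,
		.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
	};

Passing NULL at the internal call sites is safe only because
uv_read_rtc() never dereferences its argument.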
From 6298c512bc1007c3ff5c9ce20e6996781651cc45 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Thu, 9 Apr 2009 12:28:22 +0200
Subject: x86, mce: make polling timer interval per CPU

The polling timer, while running per CPU, still used a global
next_interval variable, which led to some CPUs polling either too fast
or too slow. This was not a serious problem because all errors get
picked up eventually, but it's still better to avoid it. Turn
next_interval into a per-cpu variable.

v2: Fix check_interval == 0 case (Hidetoshi Seto)

[ Impact: minor bug fix ]

Signed-off-by: Andi Kleen
Reviewed-by: Hidetoshi Seto
Signed-off-by: H. Peter Anvin
---
 arch/x86/kernel/cpu/mcheck/mce_64.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 863f89568b1a..82614f1b923a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -452,13 +452,14 @@ void mce_log_therm_throt_event(__u64 status)
  */
 
 static int check_interval = 5 * 60; /* 5 minutes */
-static int next_interval; /* in jiffies */
+static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
 static void mcheck_timer(unsigned long);
 static DEFINE_PER_CPU(struct timer_list, mce_timer);
 
 static void mcheck_timer(unsigned long data)
 {
 	struct timer_list *t = &per_cpu(mce_timer, data);
+	int *n;
 
 	WARN_ON(smp_processor_id() != data);
 
@@ -470,14 +471,14 @@ static void mcheck_timer(unsigned long data)
 	 * Alert userspace if needed. If we logged an MCE, reduce the
 	 * polling interval, otherwise increase the polling interval.
 	 */
+	n = &__get_cpu_var(next_interval);
 	if (mce_notify_user()) {
-		next_interval = max(next_interval/2, HZ/100);
+		*n = max(*n/2, HZ/100);
 	} else {
-		next_interval = min(next_interval * 2,
-				(int)round_jiffies_relative(check_interval*HZ));
+		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
 	}
 
-	t->expires = jiffies + next_interval;
+	t->expires = jiffies + *n;
 	add_timer(t);
 }
 
@@ -632,14 +633,13 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
 static void mce_init_timer(void)
 {
 	struct timer_list *t = &__get_cpu_var(mce_timer);
+	int *n = &__get_cpu_var(next_interval);
 
-	/* data race harmless because everyone sets to the same value */
-	if (!next_interval)
-		next_interval = check_interval * HZ;
-	if (!next_interval)
+	*n = check_interval * HZ;
+	if (!*n)
 		return;
 	setup_timer(t, mcheck_timer, smp_processor_id());
-	t->expires = round_jiffies(jiffies + next_interval);
+	t->expires = round_jiffies(jiffies + *n);
 	add_timer(t);
 }
 
@@ -907,7 +907,6 @@ static void mce_cpu_restart(void *data)
 /* Reinit MCEs after user configuration changes */
 static void mce_restart(void)
 {
-	next_interval = check_interval * HZ;
 	on_each_cpu(mce_cpu_restart, NULL, 1);
 }
 
@@ -1110,7 +1109,8 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 		break;
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
-		t->expires = round_jiffies(jiffies + next_interval);
+		t->expires = round_jiffies(jiffies +
+				__get_cpu_var(next_interval));
 		add_timer_on(t, cpu);
 		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
 		break;
-- cgit
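The per-cpu conversion above follows the standard pattern; in general
terms (a sketch, not taken from the patch):

	static DEFINE_PER_CPU(int, next_interval);	/* one copy per CPU */

	/* On the owning CPU (preemption must be disabled, as it is
	 * inside a timer callback): */
	int *n = &__get_cpu_var(next_interval);
	*n = max(*n / 2, HZ / 100);

	/* From any CPU, addressing a specific CPU's copy: */
	per_cpu(next_interval, cpu) = check_interval * HZ;

Because each CPU only ever adjusts its own copy from its own timer, no
locking is needed, and one CPU's backoff no longer perturbs another
CPU's polling rate.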
From 5679af4c1625a1534a4321e1ecc3c48a1cf65eb8 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Tue, 7 Apr 2009 17:06:55 +0200
Subject: x86, mce: fix boot logging logic

The earlier patch that changed the poller into a separate function
subtly broke the boot logging logic. This could lead to machine checks
getting logged at boot even when boot logging was disabled, or when it
defaults to off on some systems. Fix that.

[ Impact: bug fix - avoid spurious MCE in log ]

Signed-off-by: Andi Kleen
Reviewed-by: Hidetoshi Seto
Signed-off-by: H. Peter Anvin
---
 arch/x86/kernel/cpu/mcheck/mce_64.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 82614f1b923a..6fb0b359d2a5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -239,9 +239,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		 * Don't get the IP here because it's unlikely to
 		 * have anything to do with the actual error location.
 		 */
-
-		mce_log(&m);
-		add_taint(TAINT_MACHINE_CHECK);
+		if (!(flags & MCP_DONTLOG)) {
+			mce_log(&m);
+			add_taint(TAINT_MACHINE_CHECK);
+		}
 
 		/*
 		 * Clear state for this bank.
@@ -585,7 +586,7 @@ static void mce_init(void *dummy)
 	 * Log the machine checks left over from the previous reset.
 	 */
 	bitmap_fill(all_banks, MAX_NR_BANKS);
-	machine_check_poll(MCP_UC, &all_banks);
+	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
 
 	set_in_cr4(X86_CR4_MCE);
-- cgit

From d2c8604121648b744ebb127991f1c5876931885e Mon Sep 17 00:00:00 2001
From: Len Brown
Date: Thu, 23 Apr 2009 19:19:42 -0400
Subject: x86, hpet: Stop soliciting hpet=force users on ICH4M

The HPET in the ICH4M is not documented in the data sheet because it
was not officially validated. While it is fine for hackers to continue
to use "hpet=force" to enable the hardware that they have, it is not
prudent to solicit additional "hpet=force" users on this hardware.

[ Impact: remove hpet=force syslog message on old-ICH systems ]

Signed-off-by: Len Brown
Acked-by: Venkatesh Pallipadi
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/quirks.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index e95022e4f5d5..7563b31b4f03 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -261,8 +261,6 @@ static void old_ich_force_enable_hpet_user(struct pci_dev *dev)
 {
 	if (hpet_force_user)
 		old_ich_force_enable_hpet(dev);
-	else
-		hpet_print_force_info();
 }
 
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1,
-- cgit

From f9a196b8dceba3c1e5fe885b81e45043ad7c60fc Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Fri, 1 May 2009 20:59:25 +0200
Subject: x86: initialize io_bitmap_base on 32bit

commit db949bba3c7cf2e664ac12e237c6d4c914f0c69d (x86-32: use non-lazy
io bitmap context switching) broke ioperm for 32bit because it removed
the lazy initialization of io_bitmap_base and did not set it to the
real bitmap offset.

[ Impact: fix non-working sys_ioperm() on 32-bit kernels ]

Signed-off-by: Thomas Gleixner
---
 arch/x86/kernel/cpu/common.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c4f667896c28..c1caefc82e62 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1203,6 +1203,8 @@ void __cpuinit cpu_init(void)
 	load_TR_desc();
 	load_LDT(&init_mm.context);
 
+	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+
 #ifdef CONFIG_DOUBLEFAULT
 	/* Set up doublefault TSS pointer in the GDT */
 	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
-- cgit
From 6da7342ff1c5274c51ada084974668d10f769c16 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Mon, 4 May 2009 11:44:38 +0200
Subject: amd-iommu: fix iommu flag masks

The feature bits should be set via bitmasks, not via feature IDs.

[ Impact: fix feature enabling in newer IOMMU versions ]

Signed-off-by: Joerg Roedel
LKML-Reference: <20090504102028.GA30307@amd.com>
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/amd_iommu_init.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 42c33cebf00f..8c0be0902dac 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -49,10 +49,10 @@
 #define IVHD_DEV_EXT_SELECT		0x46
 #define IVHD_DEV_EXT_SELECT_RANGE	0x47
 
-#define IVHD_FLAG_HT_TUN_EN		0x00
-#define IVHD_FLAG_PASSPW_EN		0x01
-#define IVHD_FLAG_RESPASSPW_EN		0x02
-#define IVHD_FLAG_ISOC_EN		0x03
+#define IVHD_FLAG_HT_TUN_EN_MASK	0x01
+#define IVHD_FLAG_PASSPW_EN_MASK	0x02
+#define IVHD_FLAG_RESPASSPW_EN_MASK	0x04
+#define IVHD_FLAG_ISOC_EN_MASK		0x08
 
 #define IVMD_FLAG_EXCL_RANGE		0x08
 #define IVMD_FLAG_UNITY_MAP		0x01
@@ -569,19 +569,19 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
 	 * First set the recommended feature enable bits from ACPI
 	 * into the IOMMU control registers
 	 */
-	h->flags & IVHD_FLAG_HT_TUN_EN ?
+	h->flags & IVHD_FLAG_HT_TUN_EN_MASK ?
 		iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
 		iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
 
-	h->flags & IVHD_FLAG_PASSPW_EN ?
+	h->flags & IVHD_FLAG_PASSPW_EN_MASK ?
 		iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
 		iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
 
-	h->flags & IVHD_FLAG_RESPASSPW_EN ?
+	h->flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
 		iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
 		iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
 
-	h->flags & IVHD_FLAG_ISOC_EN ?
+	h->flags & IVHD_FLAG_ISOC_EN_MASK ?
 		iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
 		iommu_feature_disable(iommu, CONTROL_ISOC_EN);
-- cgit
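The bug class here is worth noting: the old constants were bit
positions (feature IDs), but the code tested them with a bitwise AND,
which needs masks. In particular IVHD_FLAG_HT_TUN_EN was 0x00, so
"h->flags & 0x00" could never be true, and IVHD_FLAG_ISOC_EN was 0x03,
which ANDs against two unrelated bits. Reduced to a hypothetical
minimal example:

	#define MY_FEATURE_ID	3		/* bit position */
	#define MY_FEATURE_MASK	(1 << 3)	/* 0x08: a mask  */

	if (flags & MY_FEATURE_ID)	  /* WRONG: tests bits 0 and 1 */
		...
	if (flags & MY_FEATURE_MASK)	  /* correct */
		...
	if (flags & (1 << MY_FEATURE_ID)) /* also correct */
		...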
From 35d11680a9d82c93eb92f08f9702b72877427b4a Mon Sep 17 00:00:00 2001
From: Andreas Herrmann
Date: Mon, 4 May 2009 20:28:59 +0200
Subject: x86: show number of core_siblings instead of thread_siblings in /proc/cpuinfo

Commit 7ad728f98162cb1af06a85b2a5fc422dddd4fb78 (cpumask: x86: convert
cpu_sibling_map/cpu_core_map to cpumask_var_t) changed the output of
/proc/cpuinfo for siblings.

Example on an AMD Phenom:

  physical id	: 0
  siblings	: 1
  core id	: 3
  cpu cores	: 4

Before that commit it was:

  physical id	: 0
  siblings	: 4
  core id	: 3
  cpu cores	: 4

Instead of cpu_core_mask it now uses cpu_sibling_mask to count
siblings. This is due to the following hunk of the above commit:

|  --- a/arch/x86/kernel/cpu/proc.c
|  +++ b/arch/x86/kernel/cpu/proc.c
|  @@ -14,7 +14,7 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinf
|	if (c->x86_max_cores * smp_num_siblings > 1) {
|		seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
|		seq_printf(m, "siblings\t: %d\n",
|  -			   cpus_weight(per_cpu(cpu_core_map, cpu)));
|  +			   cpumask_weight(cpu_sibling_mask(cpu)));
|		seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
|		seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
|		seq_printf(m, "apicid\t\t: %d\n", c->apicid);

This was a mistake, because the impact line shows that this side-effect
was not anticipated:

  Impact: reduce per-cpu size for CONFIG_CPUMASK_OFFSTACK=y

So revert the respective hunk to restore the old behavior.

[ Impact: fix sibling-info regression in /proc/cpuinfo ]

Signed-off-by: Andreas Herrmann
Cc: Rusty Russell
LKML-Reference: <20090504182859.GA29045@alberich.amd.com>
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/cpu/proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index f93047fed791..d5e30397246b 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -14,7 +14,7 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
 	if (c->x86_max_cores * smp_num_siblings > 1) {
 		seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
 		seq_printf(m, "siblings\t: %d\n",
-			   cpumask_weight(cpu_sibling_mask(cpu)));
+			   cpumask_weight(cpu_core_mask(cpu)));
 		seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
 		seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
 		seq_printf(m, "apicid\t\t: %d\n", c->apicid);
-- cgit

From 61438766514a2d7f191ce1b3cf6812eabbef4ef7 Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Wed, 6 May 2009 13:02:19 +0100
Subject: x86: fix boot hang in early_reserve_e820()

If the first non-reserved (sub-)range doesn't fit the size requested,
an endless loop will be entered. If a range returned from
find_e820_area_size() turns out insufficient in size, the range must be
skipped before calling the function again.

[ Impact: fixes boot hang on some platforms ]

Signed-off-by: Jan Beulich
Signed-off-by: H. Peter Anvin
---
 arch/x86/kernel/e820.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index ef2c3563357d..006281302925 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1074,12 +1074,13 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
 	u64 addr;
 	u64 start;
 
-	start = startt;
-	while (size < sizet && (start + 1))
+	for (start = startt; ; start += size) {
 		start = find_e820_area_size(start, &size, align);
-
-	if (size < sizet)
-		return 0;
+		if (!(start + 1))
+			return 0;
+		if (size >= sizet)
+			break;
+	}
 
 #ifdef CONFIG_X86_32
 	if (start >= MAXMEM)
-- cgit
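Reduced to its skeleton, the fix above changes a scan that could stall
on the same undersized range forever into one that always advances
(sketch; find_area() stands in for find_e820_area_size() and returns
~0ULL when nothing is left):

	for (start = startt; ; start += size) {
		start = find_area(start, &size, align);
		if (!(start + 1))	/* start == ~0ULL: no range left */
			return 0;
		if (size >= sizet)	/* large enough: use it */
			break;
		/* else retry, with start advanced past the small range */
	}

The old while-loop never moved start past a too-small range, so
find_e820_area_size() kept returning the same answer forever.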
From 6407df5ca54a511054200a1eb23f78f723ca1de4 Mon Sep 17 00:00:00 2001
From: Huang Ying
Date: Fri, 8 May 2009 10:51:41 +0800
Subject: x86, kexec: fix crashdump panic with CONFIG_KEXEC_JUMP

Tim Starling reported that crashdump will panic with a kernel compiled
with CONFIG_KEXEC_JUMP, due to a null pointer dereference in
machine_kexec_32.c: machine_kexec(), when dereferencing kexec_image.

Referring to:

  http://bugzilla.kernel.org/show_bug.cgi?id=13265

This patch fixes the bug by replacing the global variable reference
(kexec_image) in machine_kexec() with the local variable reference
(image), which is more appropriate and will not be NULL.

The same bug is in machine_kexec_64.c too, so it is fixed there in the
same way.

[ Impact: fix crash on kexec ]

Reported-by: Tim Starling
Signed-off-by: Huang Ying
LKML-Reference: <1241751101.6259.85.camel@yhuang-dev.sh.intel.com>
Signed-off-by: H. Peter Anvin
---
 arch/x86/kernel/machine_kexec_32.c | 4 ++--
 arch/x86/kernel/machine_kexec_64.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index e7368c1da01d..c1c429d00130 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -194,7 +194,7 @@ void machine_kexec(struct kimage *image)
 				unsigned int preserve_context);
 
 #ifdef CONFIG_KEXEC_JUMP
-	if (kexec_image->preserve_context)
+	if (image->preserve_context)
 		save_processor_state();
 #endif
 
@@ -253,7 +253,7 @@ void machine_kexec(struct kimage *image)
 					   image->preserve_context);
 
 #ifdef CONFIG_KEXEC_JUMP
-	if (kexec_image->preserve_context)
+	if (image->preserve_context)
 		restore_processor_state();
 #endif
 
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 89cea4d44679..84c3bf209e98 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -274,7 +274,7 @@ void machine_kexec(struct kimage *image)
 	int save_ftrace_enabled;
 
 #ifdef CONFIG_KEXEC_JUMP
-	if (kexec_image->preserve_context)
+	if (image->preserve_context)
 		save_processor_state();
 #endif
 
@@ -333,7 +333,7 @@ void machine_kexec(struct kimage *image)
 					   image->preserve_context);
 
 #ifdef CONFIG_KEXEC_JUMP
-	if (kexec_image->preserve_context)
+	if (image->preserve_context)
 		restore_processor_state();
 #endif
-- cgit
From e5299926d7459d9fa7c7f856983147817aedb10e Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto
Date: Fri, 8 May 2009 17:28:40 +0900
Subject: x86: MCE: make cmci_discover_lock irq-safe

Lockdep reports the warning below when Li tries to offline one cpu:

  [  110.835487] =================================
  [  110.835616] [ INFO: inconsistent lock state ]
  [  110.835688] 2.6.30-rc4-00336-g8c9ed89 #52
  [  110.835757] ---------------------------------
  [  110.835828] inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
  [  110.835908] swapper/0 [HC1[1]:SC0[0]:HE0:SE1] takes:
  [  110.835982]  (cmci_discover_lock){?.+...}, at: [] cmci_clear+0x30/0x9b

cmci_clear() can be called via smp_call_function_single().

It is better to disable interrupts while holding cmci_discover_lock,
to turn it into an irq-safe lock - we can deadlock otherwise.

[ Impact: fix possible deadlock in the MCE code ]

Reported-by: Shaohua Li
Signed-off-by: Hidetoshi Seto
Cc: Andi Kleen
Cc: Andrew Morton
LKML-Reference: <4A03ED38.8000700@jp.fujitsu.com>
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index d6b72df89d69..cef3ee30744b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -151,10 +151,11 @@ static void print_update(char *type, int *hdr, int num)
 static void cmci_discover(int banks, int boot)
 {
 	unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
+	unsigned long flags;
 	int hdr = 0;
 	int i;
 
-	spin_lock(&cmci_discover_lock);
+	spin_lock_irqsave(&cmci_discover_lock, flags);
 	for (i = 0; i < banks; i++) {
 		u64 val;
 
@@ -184,7 +185,7 @@ static void cmci_discover(int banks, int boot)
 			WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
 		}
 	}
-	spin_unlock(&cmci_discover_lock);
+	spin_unlock_irqrestore(&cmci_discover_lock, flags);
 	if (hdr)
 		printk(KERN_CONT "\n");
 }
@@ -211,13 +212,14 @@ void cmci_recheck(void)
  */
 void cmci_clear(void)
 {
+	unsigned long flags;
 	int i;
 	int banks;
 	u64 val;
 
 	if (!cmci_supported(&banks))
 		return;
-	spin_lock(&cmci_discover_lock);
+	spin_lock_irqsave(&cmci_discover_lock, flags);
 	for (i = 0; i < banks; i++) {
 		if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
 			continue;
@@ -227,7 +229,7 @@ void cmci_clear(void)
 		wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
 		__clear_bit(i, __get_cpu_var(mce_banks_owned));
 	}
-	spin_unlock(&cmci_discover_lock);
+	spin_unlock_irqrestore(&cmci_discover_lock, flags);
 }
 
 /*
-- cgit
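The rule being applied: any lock that can be taken from interrupt
context - here via the smp_call_function_single() IPI - must be
acquired with interrupts disabled on every path, or an interrupt
arriving while the lock is held can try to take it again and deadlock.
The canonical pattern (generic sketch, not the patch's code):

	static DEFINE_SPINLOCK(my_lock);

	unsigned long flags;

	spin_lock_irqsave(&my_lock, flags);	/* disables local irqs,
						 * saving prior state */
	/* ... touch the shared state ... */
	spin_unlock_irqrestore(&my_lock, flags); /* restores prior state */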
From b74d446f1f337e3fe906169a3266cb65ffa4179e Mon Sep 17 00:00:00 2001
From: Sam Ravnborg
Date: Sat, 9 May 2009 15:35:10 +0600
Subject: x86: Fix false positive section mismatch warnings in the apic code

[ Impact: reduce kernel image size a bit, annotate away warnings ]

Signed-off-by: Sam Ravnborg
[ modified and tested it ]
Signed-off-by: Rakib Mullick
Cc: Marcin Slusarz
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/apic/es7000_32.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 1c11b819f245..302947775575 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -254,7 +254,7 @@ static int parse_unisys_oem(char *oemptr)
 }
 
 #ifdef CONFIG_ACPI
-static int find_unisys_acpi_oem_table(unsigned long *oem_addr)
+static int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
 {
 	struct acpi_table_header *header = NULL;
 	struct es7000_oem_table *table;
@@ -285,7 +285,7 @@ static int find_unisys_acpi_oem_table(unsigned long *oem_addr)
 	return 0;
 }
 
-static void unmap_unisys_acpi_oem_table(unsigned long oem_addr)
+static void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr)
 {
 	if (!oem_addr)
 		return;
@@ -306,7 +306,7 @@ static int es7000_check_dsdt(void)
 static int es7000_acpi_ret;
 
 /* Hook from generic ACPI tables.c */
-static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+static int __init es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
 	unsigned long oem_addr = 0;
 	int check_dsdt;
@@ -717,7 +717,7 @@ struct apic apic_es7000_cluster = {
 	.safe_wait_icr_idle	= native_safe_apic_wait_icr_idle,
 };
 
-struct apic apic_es7000 = {
+struct apic __refdata apic_es7000 = {
 
 	.name			= "es7000",
 	.probe			= probe_es7000,
-- cgit

From 917a0153621572e88aeb2d5df025ad2e81027287 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Wed, 6 May 2009 21:36:16 -0700
Subject: x86: mtrr: Fix high_width computation when phys-addr is >= 44bit

Found one system where the cpu address width is 44 bits, and the mtrr
printout is not right:

  [    0.000000] MTRR variable ranges enabled:
  [    0.000000]   0 base 0   00000000 mask FF0 00000000 write-back
  [    0.000000]   1 base 10  00000000 mask FFF 80000000 write-back
  [    0.000000]   2 base 0   80000000 mask FFF 80000000 uncachable
  [    0.000000]   3 base 0   7F800000 mask FFF FF800000 uncachable

Li Zefan and Frederic pointed out that high_width could somehow be -4.
It turns out that when phys_addr is 44 bits, size_or_mask will be
ffffffff,00000000, so ffs(size_or_mask) will be 0.

Try to check the low 32 bits first, to get a correct high_width.

Signed-off-by: Yinghai Lu
Also-analyzed-by: Frederic Weisbecker
Also-analyzed-by: Li Zefan
Cc: Jeremy Fitzhardinge
Cc: Zhaolei
Cc: Steven Rostedt
Cc: Vegard Nossum
Cc: Andrew Morton
LKML-Reference: <4A026540.8060504@kernel.org>
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/cpu/mtrr/generic.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0b776c09aff3..d21d4fb161f7 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -275,7 +275,11 @@ static void __init print_mtrr_state(void)
 	}
 	printk(KERN_DEBUG "MTRR variable ranges %sabled:\n",
 	       mtrr_state.enabled & 2 ? "en" : "dis");
-	high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
+	if (size_or_mask & 0xffffffffUL)
+		high_width = ffs(size_or_mask & 0xffffffffUL) - 1;
+	else
+		high_width = ffs(size_or_mask>>32) + 32 - 1;
+	high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4;
 	for (i = 0; i < num_var_ranges; ++i) {
 		if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
 			printk(KERN_DEBUG "  %u base %0*X%05X000 mask %0*X%05X000 %s\n",
-- cgit
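The underlying pitfall generalizes: ffs() operates on an int, so
feeding it a 64-bit mask whose low word is zero yields 0 ("no bit
set"). A self-contained sketch of the safe form:

	/* First set bit (1-based) of a 64-bit mask using 32-bit ffs();
	 * ffs(0) == 0, hence the explicit test of the low half. */
	static int ffs64(u64 mask)
	{
		if (mask & 0xffffffffUL)
			return ffs((u32)mask);
		if (mask >> 32)
			return ffs((u32)(mask >> 32)) + 32;
		return 0;
	}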
From aa512a27e9e8ed32f31b15eec67ab1ceca33839b Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Wed, 13 May 2009 13:52:19 -0400
Subject: x86/function-graph: fix constraint for recording old return value

After upgrading from gcc 4.2.2 to 4.4.0, the function graph tracer
broke. Investigating, I found that in the asm that replaces the return
value, gcc was using the same register for the old value as it was for
the new value.

	mov (addr), old
	mov new, (addr)

But if old and new are the same register, we clobber new with old! I
first thought this was a bug in gcc 4.4.0 and reported it:

  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40132

Andrew Pinski responded (quickly), saying that it was correct gcc
behavior and that the code needed to denote old as an "early clobber".

Instead of "=r"(old), we need "=&r"(old).

[ Impact: keep function graph tracer from breaking with gcc 4.4.0 ]

Signed-off-by: Steven Rostedt
---
 arch/x86/kernel/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 18dfa30795c9..b79c5533c421 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -442,7 +442,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
 		_ASM_EXTABLE(1b, 4b)
 		_ASM_EXTABLE(2b, 4b)
 
-		: [old] "=r" (old), [faulted] "=r" (faulted)
+		: [old] "=&r" (old), [faulted] "=r" (faulted)
 		: [parent] "r" (parent), [return_hooker] "r" (return_hooker)
 		: "memory"
 	);
-- cgit
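A minimal reproduction of the constraint issue, independent of ftrace
(illustrative only):

	/* Without the '&', gcc assumes outputs are written only after
	 * all inputs are consumed, so it may give 'old' the same
	 * register as 'val'; the first mov then overwrites the new
	 * value before it is stored. The early clobber tells gcc that
	 * 'old' is written early. */
	asm volatile("mov (%[addr]), %[old]\n\t"
		     "mov %[val], (%[addr])\n\t"
		     : [old] "=&r" (old)
		     : [addr] "r" (addr), [val] "r" (val)
		     : "memory");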
From 33ab1979bc9f719213bc3f392c8fd9d012e4f4e9 Mon Sep 17 00:00:00 2001
From: Jason Wessel
Date: Wed, 11 Feb 2009 18:46:32 -0600
Subject: kgdb,i386: use address that SP register points to in the exception frame

The treatment of the SP register is different on x86_64 and i386. This
is a regression fix that lived outside the mainline kernel from 2.6.27
to now. The regression was a result of the original merge consolidation
of the i386 and x86_64 archs to x86.

The incorrectly reported SP on i386 prevented stack tracebacks from
working correctly in gdb.

Signed-off-by: Jason Wessel
---
 arch/x86/kernel/kgdb.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index eedfaebe1063..b1f4dffb919e 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -88,6 +88,7 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
 	gdb_regs[GDB_SS]	= __KERNEL_DS;
 	gdb_regs[GDB_FS]	= 0xFFFF;
 	gdb_regs[GDB_GS]	= 0xFFFF;
+	gdb_regs[GDB_SP]	= (int)&regs->sp;
 #else
 	gdb_regs[GDB_R8]	= regs->r8;
 	gdb_regs[GDB_R9]	= regs->r9;
@@ -100,8 +101,8 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
 	gdb_regs32[GDB_PS]	= regs->flags;
 	gdb_regs32[GDB_CS]	= regs->cs;
 	gdb_regs32[GDB_SS]	= regs->ss;
-#endif
 	gdb_regs[GDB_SP]	= regs->sp;
+#endif
 }
 
 /**
-- cgit

From b4ecc126991b30fe5f9a59dfacda046aeac124b2 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge
Date: Wed, 13 May 2009 17:16:55 -0700
Subject: x86: Fix performance regression caused by paravirt_ops on native kernels

Xiaohui Xin and some other folks at Intel have been looking into what's
behind the performance hit of paravirt_ops when running native.

It appears that the hit is entirely due to the paravirtualized
spinlocks introduced by:

| commit 8efcbab674de2bee45a2e4cdf97de16b8e609ac8
| Date:   Mon Jul 7 12:07:51 2008 -0700
|
|     paravirt: introduce a "lock-byte" spinlock implementation

The extra call/return in the spinlock path is somehow causing an
increase in the cycles/instruction of somewhere around 2-7% (it seems
to vary quite a lot from test to test). The working theory is that the
CPU's pipeline is getting upset about the
call->call->locked-op->return->return, and seems to be failing to
speculate (though I haven't seen anything definitive about the precise
reasons). This doesn't entirely make sense, because the performance hit
is also visible on unlock and other operations which don't involve
locked instructions. But spinlock operations clearly swamp all the
other pvops operations, even though I can't imagine that they're nearly
as common (there's only a .05% increase in instructions executed).

If I disable just the pv-spinlock calls, my tests show that pvops is
identical to non-pvops performance on native (my measurements show that
it is actually about .1% faster, but Xiaohui shows a .05% slowdown).

Summary of results, averaging 10 runs of the "mmperf" test, using a
no-pvops build as baseline:

			nopv		Pv-nospin	Pv-spin
  CPU cycles		100.00%		99.89%		102.18%
  instructions		100.00%		100.10%		100.15%
  CPI			100.00%		99.79%		102.03%
  cache ref		100.00%		100.84%		100.28%
  cache miss		100.00%		90.47%		88.56%
  cache miss rate	100.00%		89.72%		88.31%
  branches		100.00%		99.93%		100.04%
  branch miss		100.00%		103.66%		107.72%
  branch miss rt	100.00%		103.73%		107.67%
  wallclock		100.00%		99.90%		102.20%

The clear effect here is that the 2% increase in CPI is directly
reflected in the final wallclock time.

(The other interesting effect is that the more ops are out-of-line
calls via pvops, the lower the cache access and miss rates. Not too
surprising, but it suggests that the non-pvops kernel is over-inlined.
On the flipside, the branch misses go up correspondingly...)

So, what's the fix?

Paravirt patching turns all the pvops calls into direct calls, so
_spin_lock etc do end up having direct calls. For example, the compiler
generated code for paravirtualized _spin_lock is:

	<_spin_lock+0>:		mov    %gs:0xb4c8,%rax
	<_spin_lock+9>:		incl   0xffffffffffffe044(%rax)
	<_spin_lock+15>:	callq  *0xffffffff805a5b30
	<_spin_lock+22>:	retq

The indirect call will get patched to:

	<_spin_lock+0>:		mov    %gs:0xb4c8,%rax
	<_spin_lock+9>:		incl   0xffffffffffffe044(%rax)
	<_spin_lock+15>:	callq  <__ticket_spin_lock>
	<_spin_lock+20>:	nop; nop	/* or whatever 2-byte nop */
	<_spin_lock+22>:	retq

One possibility is to inline _spin_lock, etc, when building an
optimised kernel (ie, when there's no spinlock/preempt
instrumentation/debugging enabled). That will remove the outer
call/return pair, returning the instruction stream to a single
call/return, which will presumably execute the same as the non-pvops
case. The downsides are: 1) it will replicate the
preempt_disable/enable code at each lock/unlock callsite; this code is
fairly small, but not nothing; and 2) the spinlock definitions are
already a very heavily tangled mass of #ifdefs and other preprocessor
magic, and making any changes will be non-trivial.

The other obvious answer is to disable pv-spinlocks. Making them a
separate config option is fairly easy, and it would be trivial to
enable them only when Xen is enabled (as the only non-default user).
But it doesn't really address the common case of a distro build which
is going to have Xen support enabled, and it leaves open the question
of whether the native performance cost of pv-spinlocks is worth the
performance improvement on a loaded Xen system (10% saving of overall
system CPU when guests block rather than spin). Still, it is a
reasonable short-term workaround.

[ Impact: fix pvops performance regression when running native ]

Analysed-by: "Xin Xiaohui"
Analysed-by: "Li Xin"
Analysed-by: "Nakajima Jun"
Signed-off-by: Jeremy Fitzhardinge
Acked-by: H. Peter Anvin
Cc: Nick Piggin
Cc: Xen-devel
LKML-Reference: <4A0B62F7.5030802@goop.org>
[ fixed the help text ]
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/Makefile   | 3 ++-
 arch/x86/kernel/paravirt.c | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 145cce75cda7..88d1bfc847d3 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -89,7 +89,8 @@ obj-$(CONFIG_DEBUG_NX_TEST)	+= test_nx.o
 obj-$(CONFIG_VMI)		+= vmi_32.o vmiclock_32.o
 obj-$(CONFIG_KVM_GUEST)		+= kvm.o
 obj-$(CONFIG_KVM_CLOCK)		+= kvmclock.o
-obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirt_patch_$(BITS).o paravirt-spinlocks.o
+obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirt_patch_$(BITS).o
+obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
 obj-$(CONFIG_PARAVIRT_CLOCK)	+= pvclock.o
 obj-$(CONFIG_PCSPKR_PLATFORM)	+= pcspeaker.o
 
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 8e45f4464880..9faf43bea336 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -134,7 +134,9 @@ static void *get_call_destination(u8 type)
 		.pv_irq_ops = pv_irq_ops,
 		.pv_apic_ops = pv_apic_ops,
 		.pv_mmu_ops = pv_mmu_ops,
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
 		.pv_lock_ops = pv_lock_ops,
+#endif
 	};
 	return *((void **)&tmpl + type);
 }
-- cgit