diff options
Diffstat (limited to 'arch/x86/kernel')
66 files changed, 1020 insertions, 1642 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d23d83577d6b..9bba5b79902b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -2,7 +2,7 @@ # Makefile for the linux kernel. # -extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds +extra-y := head_$(BITS).o head$(BITS).o head.o vmlinux.lds CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) @@ -49,7 +49,6 @@ obj-y += cpu/ obj-y += acpi/ obj-y += reboot.o obj-$(CONFIG_X86_32) += reboot_32.o -obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_PCI) += early-quirks.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index a415b1f44365..8afb69319815 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -593,7 +593,7 @@ void __init acpi_set_irq_model_ioapic(void) #ifdef CONFIG_ACPI_HOTPLUG_CPU #include <acpi/processor.h> -static void __cpuinitdata acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +static void __cpuinit acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) { #ifdef CONFIG_ACPI_NUMA int nid; @@ -990,7 +990,7 @@ void __init mp_config_acpi_legacy_irqs(void) int i; struct mpc_intsrc mp_irq; -#if defined (CONFIG_MCA) || defined (CONFIG_EISA) +#ifdef CONFIG_EISA /* * Fabricate the legacy ISA bus (bus #31). */ diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 103b6ab368d3..146a49c763a4 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -24,6 +24,10 @@ unsigned long acpi_realmode_flags; static char temp_stack[4096]; #endif +asmlinkage void acpi_enter_s3(void) +{ + acpi_enter_sleep_state(3, wake_sleep_flags); +} /** * acpi_suspend_lowlevel - save kernel state * diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h index 416d4be13fef..d68677a2a010 100644 --- a/arch/x86/kernel/acpi/sleep.h +++ b/arch/x86/kernel/acpi/sleep.h @@ -3,12 +3,16 @@ */ #include <asm/trampoline.h> +#include <linux/linkage.h> extern unsigned long saved_video_mode; extern long saved_magic; extern int wakeup_pmode_return; +extern u8 wake_sleep_flags; +extern asmlinkage void acpi_enter_s3(void); + extern unsigned long acpi_copy_wakeup_routine(unsigned long); extern void wakeup_long64(void); diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S index 13ab720573e3..72610839f03b 100644 --- a/arch/x86/kernel/acpi/wakeup_32.S +++ b/arch/x86/kernel/acpi/wakeup_32.S @@ -74,9 +74,7 @@ restore_registers: ENTRY(do_suspend_lowlevel) call save_processor_state call save_registers - pushl $3 - call acpi_enter_sleep_state - addl $4, %esp + call acpi_enter_s3 # In case of S3 failure, we'll emerge here. Jump # to ret_point to recover diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 8ea5164cbd04..014d1d28c397 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -71,9 +71,7 @@ ENTRY(do_suspend_lowlevel) movq %rsi, saved_rsi addq $8, %rsp - movl $3, %edi - xorl %eax, %eax - call acpi_enter_sleep_state + call acpi_enter_s3 /* in case something went wrong, restore the machine status and go on */ jmp resume_point diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 11544d8f1e97..39a222e094af 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -35,6 +35,7 @@ #include <linux/smp.h> #include <linux/mm.h> +#include <asm/irq_remapping.h> #include <asm/perf_event.h> #include <asm/x86_init.h> #include <asm/pgalloc.h> @@ -1325,11 +1326,13 @@ void __cpuinit setup_local_APIC(void) acked); break; } - if (cpu_has_tsc) { - rdtscll(ntsc); - max_loops = (cpu_khz << 10) - (ntsc - tsc); - } else - max_loops--; + if (queued) { + if (cpu_has_tsc) { + rdtscll(ntsc); + max_loops = (cpu_khz << 10) - (ntsc - tsc); + } else + max_loops--; + } } while (queued && max_loops > 0); WARN_ON(max_loops <= 0); @@ -1441,8 +1444,8 @@ void __init bsp_end_local_APIC_setup(void) * Now that local APIC setup is completed for BP, configure the fault * handling for interrupt remapping. */ - if (intr_remapping_enabled) - enable_drhd_fault_handling(); + if (irq_remapping_enabled) + irq_remap_enable_fault_handling(); } @@ -1517,7 +1520,7 @@ void enable_x2apic(void) int __init enable_IR(void) { #ifdef CONFIG_IRQ_REMAP - if (!intr_remapping_supported()) { + if (!irq_remapping_supported()) { pr_debug("intr-remapping not supported\n"); return -1; } @@ -1528,7 +1531,7 @@ int __init enable_IR(void) return -1; } - return enable_intr_remapping(); + return irq_remapping_enable(); #endif return -1; } @@ -1537,10 +1540,13 @@ void __init enable_IR_x2apic(void) { unsigned long flags; int ret, x2apic_enabled = 0; - int dmar_table_init_ret; + int hardware_init_ret; + + /* Make sure irq_remap_ops are initialized */ + setup_irq_remapping_ops(); - dmar_table_init_ret = dmar_table_init(); - if (dmar_table_init_ret && !x2apic_supported()) + hardware_init_ret = irq_remapping_prepare(); + if (hardware_init_ret && !x2apic_supported()) return; ret = save_ioapic_entries(); @@ -1556,7 +1562,7 @@ void __init enable_IR_x2apic(void) if (x2apic_preenabled && nox2apic) disable_x2apic(); - if (dmar_table_init_ret) + if (hardware_init_ret) ret = -1; else ret = enable_IR(); @@ -1637,9 +1643,11 @@ static int __init apic_verify(void) mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; /* The BIOS may have set up the APIC at some other address */ - rdmsr(MSR_IA32_APICBASE, l, h); - if (l & MSR_IA32_APICBASE_ENABLE) - mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; + if (boot_cpu_data.x86 >= 6) { + rdmsr(MSR_IA32_APICBASE, l, h); + if (l & MSR_IA32_APICBASE_ENABLE) + mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; + } pr_info("Found and enabled local APIC!\n"); return 0; @@ -1657,13 +1665,15 @@ int __init apic_force_enable(unsigned long addr) * MSR. This can only be done in software for Intel P6 or later * and AMD K7 (Model > 1) or later. */ - rdmsr(MSR_IA32_APICBASE, l, h); - if (!(l & MSR_IA32_APICBASE_ENABLE)) { - pr_info("Local APIC disabled by BIOS -- reenabling.\n"); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | addr; - wrmsr(MSR_IA32_APICBASE, l, h); - enabled_via_apicbase = 1; + if (boot_cpu_data.x86 >= 6) { + rdmsr(MSR_IA32_APICBASE, l, h); + if (!(l & MSR_IA32_APICBASE_ENABLE)) { + pr_info("Local APIC disabled by BIOS -- reenabling.\n"); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | addr; + wrmsr(MSR_IA32_APICBASE, l, h); + enabled_via_apicbase = 1; + } } return apic_verify(); } @@ -2172,8 +2182,8 @@ static int lapic_suspend(void) local_irq_save(flags); disable_local_APIC(); - if (intr_remapping_enabled) - disable_intr_remapping(); + if (irq_remapping_enabled) + irq_remapping_disable(); local_irq_restore(flags); return 0; @@ -2189,7 +2199,7 @@ static void lapic_resume(void) return; local_irq_save(flags); - if (intr_remapping_enabled) { + if (irq_remapping_enabled) { /* * IO-APIC and PIC have their own resume routines. * We just mask them here to make sure the interrupt @@ -2209,10 +2219,12 @@ static void lapic_resume(void) * FIXME! This will be wrong if we ever support suspend on * SMP! We'll need to do this as part of the CPU restore! */ - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; - wrmsr(MSR_IA32_APICBASE, l, h); + if (boot_cpu_data.x86 >= 6) { + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; + wrmsr(MSR_IA32_APICBASE, l, h); + } } maxlvt = lapic_get_maxlvt(); @@ -2239,8 +2251,8 @@ static void lapic_resume(void) apic_write(APIC_ESR, 0); apic_read(APIC_ESR); - if (intr_remapping_enabled) - reenable_intr_remapping(x2apic_mode); + if (irq_remapping_enabled) + irq_remapping_reenable(x2apic_mode); local_irq_restore(flags); } diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 359b6899a36c..0e881c46e8c8 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -227,6 +227,7 @@ static struct apic apic_flat = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, @@ -386,6 +387,7 @@ static struct apic apic_physflat = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 634ae6cdd5c9..a6e4c6e06c08 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -181,6 +181,7 @@ struct apic apic_noop = { .read = noop_apic_read, .write = noop_apic_write, + .eoi_write = noop_apic_write, .icr_read = noop_apic_icr_read, .icr_write = noop_apic_icr_write, .wait_icr_idle = noop_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 899803e03214..6ec6d5d297c3 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -207,8 +207,11 @@ static void __init map_csrs(void) static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) { - c->phys_proc_id = node; - per_cpu(cpu_llc_id, smp_processor_id()) = node; + + if (c->phys_proc_id != node) { + c->phys_proc_id = node; + per_cpu(cpu_llc_id, smp_processor_id()) = node; + } } static int __init numachip_system_init(void) @@ -292,6 +295,7 @@ static struct apic apic_numachip __refconst = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 0cdec7065aff..31fbdbfbf960 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -248,6 +248,7 @@ static struct apic apic_bigsmp = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index e42d1d3b9134..db4ab1be3c79 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -678,6 +678,7 @@ static struct apic __refdata apic_es7000_cluster = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, @@ -742,6 +743,7 @@ static struct apic __refdata apic_es7000 = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e88300d8e80a..ac96561d1a99 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -68,23 +68,21 @@ #define for_each_irq_pin(entry, head) \ for (entry = head; entry; entry = entry->next) -static void __init __ioapic_init_mappings(void); - -static unsigned int __io_apic_read (unsigned int apic, unsigned int reg); -static void __io_apic_write (unsigned int apic, unsigned int reg, unsigned int val); -static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val); - -static struct io_apic_ops io_apic_ops = { - .init = __ioapic_init_mappings, - .read = __io_apic_read, - .write = __io_apic_write, - .modify = __io_apic_modify, -}; - -void __init set_io_apic_ops(const struct io_apic_ops *ops) +#ifdef CONFIG_IRQ_REMAP +static void irq_remap_modify_chip_defaults(struct irq_chip *chip); +static inline bool irq_remapped(struct irq_cfg *cfg) +{ + return cfg->irq_2_iommu.iommu != NULL; +} +#else +static inline bool irq_remapped(struct irq_cfg *cfg) +{ + return false; +} +static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip) { - io_apic_ops = *ops; } +#endif /* * Is the SiS APIC rmw bug present ? @@ -142,7 +140,7 @@ int mp_irq_entries; /* GSI interrupts */ static int nr_irqs_gsi = NR_IRQS_LEGACY; -#if defined (CONFIG_MCA) || defined (CONFIG_EISA) +#ifdef CONFIG_EISA int mp_bus_id_to_type[MAX_MP_BUSSES]; #endif @@ -313,21 +311,6 @@ static void free_irq_at(unsigned int at, struct irq_cfg *cfg) irq_free_desc(at); } -static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) -{ - return io_apic_ops.read(apic, reg); -} - -static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) -{ - io_apic_ops.write(apic, reg, value); -} - -static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) -{ - io_apic_ops.modify(apic, reg, value); -} - struct io_apic { unsigned int index; @@ -349,14 +332,14 @@ static inline void io_apic_eoi(unsigned int apic, unsigned int vector) writel(vector, &io_apic->eoi); } -static unsigned int __io_apic_read(unsigned int apic, unsigned int reg) +unsigned int native_io_apic_read(unsigned int apic, unsigned int reg) { struct io_apic __iomem *io_apic = io_apic_base(apic); writel(reg, &io_apic->index); return readl(&io_apic->data); } -static void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); @@ -370,7 +353,7 @@ static void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int va * * Older SiS APIC requires we rewrite the index register */ -static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) +void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); @@ -379,29 +362,6 @@ static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int v writel(value, &io_apic->data); } -static bool io_apic_level_ack_pending(struct irq_cfg *cfg) -{ - struct irq_pin_list *entry; - unsigned long flags; - - raw_spin_lock_irqsave(&ioapic_lock, flags); - for_each_irq_pin(entry, cfg->irq_2_pin) { - unsigned int reg; - int pin; - - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin*2); - /* Is the remote IRR bit set? */ - if (reg & IO_APIC_REDIR_REMOTE_IRR) { - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return true; - } - } - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - - return false; -} - union entry_union { struct { u32 w1, w2; }; struct IO_APIC_route_entry entry; @@ -875,7 +835,7 @@ static int __init find_isa_irq_apic(int irq, int type) return -1; } -#if defined(CONFIG_EISA) || defined(CONFIG_MCA) +#ifdef CONFIG_EISA /* * EISA Edge/Level control register, ELCR */ @@ -912,12 +872,6 @@ static int EISA_ELCR(unsigned int irq) #define default_PCI_trigger(idx) (1) #define default_PCI_polarity(idx) (1) -/* MCA interrupts are always polarity zero level triggered, - * when listed as conforming in the MP table. */ - -#define default_MCA_trigger(idx) (1) -#define default_MCA_polarity(idx) default_ISA_polarity(idx) - static int irq_polarity(int idx) { int bus = mp_irqs[idx].srcbus; @@ -975,7 +929,7 @@ static int irq_trigger(int idx) trigger = default_ISA_trigger(idx); else trigger = default_PCI_trigger(idx); -#if defined(CONFIG_EISA) || defined(CONFIG_MCA) +#ifdef CONFIG_EISA switch (mp_bus_id_to_type[bus]) { case MP_BUS_ISA: /* ISA pin */ { @@ -992,11 +946,6 @@ static int irq_trigger(int idx) /* set before the switch */ break; } - case MP_BUS_MCA: /* MCA pin */ - { - trigger = default_MCA_trigger(idx); - break; - } default: { printk(KERN_WARNING "broken BIOS!!\n"); @@ -1361,77 +1310,13 @@ static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg, fasteoi ? "fasteoi" : "edge"); } - -static int setup_ir_ioapic_entry(int irq, - struct IR_IO_APIC_route_entry *entry, - unsigned int destination, int vector, - struct io_apic_irq_attr *attr) -{ - int index; - struct irte irte; - int ioapic_id = mpc_ioapic_id(attr->ioapic); - struct intel_iommu *iommu = map_ioapic_to_ir(ioapic_id); - - if (!iommu) { - pr_warn("No mapping iommu for ioapic %d\n", ioapic_id); - return -ENODEV; - } - - index = alloc_irte(iommu, irq, 1); - if (index < 0) { - pr_warn("Failed to allocate IRTE for ioapic %d\n", ioapic_id); - return -ENOMEM; - } - - prepare_irte(&irte, vector, destination); - - /* Set source-id of interrupt request */ - set_ioapic_sid(&irte, ioapic_id); - - modify_irte(irq, &irte); - - apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: " - "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d " - "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X " - "Avail:%X Vector:%02X Dest:%08X " - "SID:%04X SQ:%X SVT:%X)\n", - attr->ioapic, irte.present, irte.fpd, irte.dst_mode, - irte.redir_hint, irte.trigger_mode, irte.dlvry_mode, - irte.avail, irte.vector, irte.dest_id, - irte.sid, irte.sq, irte.svt); - - memset(entry, 0, sizeof(*entry)); - - entry->index2 = (index >> 15) & 0x1; - entry->zero = 0; - entry->format = 1; - entry->index = (index & 0x7fff); - /* - * IO-APIC RTE will be configured with virtual vector. - * irq handler will do the explicit EOI to the io-apic. - */ - entry->vector = attr->ioapic_pin; - entry->mask = 0; /* enable IRQ */ - entry->trigger = attr->trigger; - entry->polarity = attr->polarity; - - /* Mask level triggered irqs. - * Use IRQ_DELAYED_DISABLE for edge triggered irqs. - */ - if (attr->trigger) - entry->mask = 1; - - return 0; -} - static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, unsigned int destination, int vector, struct io_apic_irq_attr *attr) { - if (intr_remapping_enabled) - return setup_ir_ioapic_entry(irq, - (struct IR_IO_APIC_route_entry *)entry, - destination, vector, attr); + if (irq_remapping_enabled) + return setup_ioapic_remapped_entry(irq, entry, destination, + vector, attr); memset(entry, 0, sizeof(*entry)); @@ -1588,7 +1473,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, { struct IO_APIC_route_entry entry; - if (intr_remapping_enabled) + if (irq_remapping_enabled) return; memset(&entry, 0, sizeof(entry)); @@ -1674,7 +1559,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx) printk(KERN_DEBUG ".... IRQ redirection table:\n"); - if (intr_remapping_enabled) { + if (irq_remapping_enabled) { printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR" " Pol Stat Indx2 Zero Vect:\n"); } else { @@ -1683,7 +1568,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx) } for (i = 0; i <= reg_01.bits.entries; i++) { - if (intr_remapping_enabled) { + if (irq_remapping_enabled) { struct IO_APIC_route_entry entry; struct IR_IO_APIC_route_entry *ir_entry; @@ -2050,7 +1935,7 @@ void disable_IO_APIC(void) * IOAPIC RTE as well as interrupt-remapping table entry). * As this gets called during crash dump, keep this simple for now. */ - if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) { + if (ioapic_i8259.pin != -1 && !irq_remapping_enabled) { struct IO_APIC_route_entry entry; memset(&entry, 0, sizeof(entry)); @@ -2074,7 +1959,7 @@ void disable_IO_APIC(void) * Use virtual wire A mode when interrupt remapping is enabled. */ if (cpu_has_apic || apic_from_smp_config()) - disconnect_bsp_APIC(!intr_remapping_enabled && + disconnect_bsp_APIC(!irq_remapping_enabled && ioapic_i8259.pin != -1); } @@ -2390,71 +2275,6 @@ ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, return ret; } -#ifdef CONFIG_IRQ_REMAP - -/* - * Migrate the IO-APIC irq in the presence of intr-remapping. - * - * For both level and edge triggered, irq migration is a simple atomic - * update(of vector and cpu destination) of IRTE and flush the hardware cache. - * - * For level triggered, we eliminate the io-apic RTE modification (with the - * updated vector information), by using a virtual vector (io-apic pin number). - * Real vector that is used for interrupting cpu will be coming from - * the interrupt-remapping table entry. - * - * As the migration is a simple atomic update of IRTE, the same mechanism - * is used to migrate MSI irq's in the presence of interrupt-remapping. - */ -static int -ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) -{ - struct irq_cfg *cfg = data->chip_data; - unsigned int dest, irq = data->irq; - struct irte irte; - - if (!cpumask_intersects(mask, cpu_online_mask)) - return -EINVAL; - - if (get_irte(irq, &irte)) - return -EBUSY; - - if (assign_irq_vector(irq, cfg, mask)) - return -EBUSY; - - dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); - - irte.vector = cfg->vector; - irte.dest_id = IRTE_DEST(dest); - - /* - * Atomically updates the IRTE with the new destination, vector - * and flushes the interrupt entry cache. - */ - modify_irte(irq, &irte); - - /* - * After this point, all the interrupts will start arriving - * at the new destination. So, time to cleanup the previous - * vector allocation. - */ - if (cfg->move_in_progress) - send_cleanup_vector(cfg); - - cpumask_copy(data->affinity, mask); - return 0; -} - -#else -static inline int -ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) -{ - return 0; -} -#endif - asmlinkage void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; @@ -2552,6 +2372,29 @@ static void ack_apic_edge(struct irq_data *data) atomic_t irq_mis_count; #ifdef CONFIG_GENERIC_PENDING_IRQ +static bool io_apic_level_ack_pending(struct irq_cfg *cfg) +{ + struct irq_pin_list *entry; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + for_each_irq_pin(entry, cfg->irq_2_pin) { + unsigned int reg; + int pin; + + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + /* Is the remote IRR bit set? */ + if (reg & IO_APIC_REDIR_REMOTE_IRR) { + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return true; + } + } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return false; +} + static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) { /* If we are moving the irq we need to mask it */ @@ -2699,7 +2542,7 @@ static void irq_remap_modify_chip_defaults(struct irq_chip *chip) chip->irq_eoi = ir_ack_apic_level; #ifdef CONFIG_SMP - chip->irq_set_affinity = ir_ioapic_set_affinity; + chip->irq_set_affinity = set_remapped_irq_affinity; #endif } #endif /* CONFIG_IRQ_REMAP */ @@ -2912,7 +2755,7 @@ static inline void __init check_timer(void) * 8259A. */ if (pin1 == -1) { - if (intr_remapping_enabled) + if (irq_remapping_enabled) panic("BIOS bug: timer not connected to IO-APIC"); pin1 = pin2; apic1 = apic2; @@ -2945,7 +2788,7 @@ static inline void __init check_timer(void) clear_IO_APIC_pin(0, pin1); goto out; } - if (intr_remapping_enabled) + if (irq_remapping_enabled) panic("timer doesn't work through Interrupt-remapped IO-APIC"); local_irq_disable(); clear_IO_APIC_pin(apic1, pin1); @@ -3169,7 +3012,7 @@ void destroy_irq(unsigned int irq) irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); if (irq_remapped(cfg)) - free_irte(irq); + free_remapped_irq(irq); raw_spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq, cfg); raw_spin_unlock_irqrestore(&vector_lock, flags); @@ -3198,54 +3041,34 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); if (irq_remapped(cfg)) { - struct irte irte; - int ir_index; - u16 sub_handle; - - ir_index = map_irq_to_irte_handle(irq, &sub_handle); - BUG_ON(ir_index == -1); - - prepare_irte(&irte, cfg->vector, dest); - - /* Set source-id of interrupt request */ - if (pdev) - set_msi_sid(&irte, pdev); - else - set_hpet_sid(&irte, hpet_id); - - modify_irte(irq, &irte); + compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id); + return err; + } + if (x2apic_enabled()) + msg->address_hi = MSI_ADDR_BASE_HI | + MSI_ADDR_EXT_DEST_ID(dest); + else msg->address_hi = MSI_ADDR_BASE_HI; - msg->data = sub_handle; - msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | - MSI_ADDR_IR_SHV | - MSI_ADDR_IR_INDEX1(ir_index) | - MSI_ADDR_IR_INDEX2(ir_index); - } else { - if (x2apic_enabled()) - msg->address_hi = MSI_ADDR_BASE_HI | - MSI_ADDR_EXT_DEST_ID(dest); - else - msg->address_hi = MSI_ADDR_BASE_HI; - msg->address_lo = - MSI_ADDR_BASE_LO | - ((apic->irq_dest_mode == 0) ? - MSI_ADDR_DEST_MODE_PHYSICAL: - MSI_ADDR_DEST_MODE_LOGICAL) | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU: - MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(dest); + msg->address_lo = + MSI_ADDR_BASE_LO | + ((apic->irq_dest_mode == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL: + MSI_ADDR_DEST_MODE_LOGICAL) | + ((apic->irq_delivery_mode != dest_LowestPrio) ? + MSI_ADDR_REDIRECTION_CPU: + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(dest); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((apic->irq_delivery_mode != dest_LowestPrio) ? + MSI_DATA_DELIVERY_FIXED: + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(cfg->vector); - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - MSI_DATA_DELIVERY_FIXED: - MSI_DATA_DELIVERY_LOWPRI) | - MSI_DATA_VECTOR(cfg->vector); - } return err; } @@ -3288,33 +3111,6 @@ static struct irq_chip msi_chip = { .irq_retrigger = ioapic_retrigger_irq, }; -/* - * Map the PCI dev to the corresponding remapping hardware unit - * and allocate 'nvec' consecutive interrupt-remapping table entries - * in it. - */ -static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) -{ - struct intel_iommu *iommu; - int index; - - iommu = map_dev_to_ir(dev); - if (!iommu) { - printk(KERN_ERR - "Unable to map PCI %s to iommu\n", pci_name(dev)); - return -ENOENT; - } - - index = alloc_irte(iommu, irq, nvec); - if (index < 0) { - printk(KERN_ERR - "Unable to allocate %d IRTE for PCI %s\n", nvec, - pci_name(dev)); - return -ENOSPC; - } - return index; -} - static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) { struct irq_chip *chip = &msi_chip; @@ -3345,7 +3141,6 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) int node, ret, sub_handle, index = 0; unsigned int irq, irq_want; struct msi_desc *msidesc; - struct intel_iommu *iommu = NULL; /* x86 doesn't support multiple MSI yet */ if (type == PCI_CAP_ID_MSI && nvec > 1) @@ -3359,7 +3154,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (irq == 0) return -1; irq_want = irq + 1; - if (!intr_remapping_enabled) + if (!irq_remapping_enabled) goto no_ir; if (!sub_handle) { @@ -3367,23 +3162,16 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) * allocate the consecutive block of IRTE's * for 'nvec' */ - index = msi_alloc_irte(dev, irq, nvec); + index = msi_alloc_remapped_irq(dev, irq, nvec); if (index < 0) { ret = index; goto error; } } else { - iommu = map_dev_to_ir(dev); - if (!iommu) { - ret = -ENOENT; + ret = msi_setup_remapped_irq(dev, irq, index, + sub_handle); + if (ret < 0) goto error; - } - /* - * setup the mapping between the irq and the IRTE - * base index, the sub_handle pointing to the - * appropriate interrupt remap table entry. - */ - set_irte_irq(irq, iommu, index, sub_handle); } no_ir: ret = setup_msi_irq(dev, msidesc, irq); @@ -3501,15 +3289,8 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id) struct msi_msg msg; int ret; - if (intr_remapping_enabled) { - struct intel_iommu *iommu = map_hpet_to_ir(id); - int index; - - if (!iommu) - return -1; - - index = alloc_irte(iommu, irq, 1); - if (index < 0) + if (irq_remapping_enabled) { + if (!setup_hpet_msi_remapped(irq, id)) return -1; } @@ -3888,8 +3669,8 @@ void __init setup_ioapic_dest(void) else mask = apic->target_cpus(); - if (intr_remapping_enabled) - ir_ioapic_set_affinity(idata, mask, false); + if (irq_remapping_enabled) + set_remapped_irq_affinity(idata, mask, false); else ioapic_set_affinity(idata, mask, false); } @@ -3931,12 +3712,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics) return res; } -void __init ioapic_and_gsi_init(void) -{ - io_apic_ops.init(); -} - -static void __init __ioapic_init_mappings(void) +void __init native_io_apic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; struct resource *ioapic_res; diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 00d2422ca7c9..f00a68cca37a 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -530,6 +530,7 @@ static struct apic __refdata apic_numaq = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index ff2c1b9aac4d..1b291da09e60 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -142,6 +142,7 @@ static struct apic apic_default = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index fea000b27f07..659897c00755 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -546,6 +546,7 @@ static struct apic apic_summit = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 48f3103b3c93..ff35cff0e1a7 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -260,6 +260,7 @@ static struct apic apic_x2apic_cluster = { .read = native_apic_msr_read, .write = native_apic_msr_write, + .eoi_write = native_apic_msr_eoi_write, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, .wait_icr_idle = native_x2apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 8a778db45e3a..c17e982db275 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -24,6 +24,12 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { if (x2apic_phys) return x2apic_enabled(); + else if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) && + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) && + x2apic_enabled()) { + printk(KERN_DEBUG "System requires x2apic physical mode\n"); + return 1; + } else return 0; } @@ -166,6 +172,7 @@ static struct apic apic_x2apic_phys = { .read = native_apic_msr_read, .write = native_apic_msr_write, + .eoi_write = native_apic_msr_eoi_write, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, .wait_icr_idle = native_x2apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 87bfa69e216e..c6d03f7a4401 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -404,6 +404,7 @@ static struct apic __refdata apic_x2apic_uv_x = { .read = native_apic_msr_read, .write = native_apic_msr_write, + .eoi_write = native_apic_msr_eoi_write, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, .wait_icr_idle = native_x2apic_wait_icr_idle, diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 459e78cbf61e..07b0c0db466c 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -2401,7 +2401,7 @@ static void __exit apm_exit(void) * (pm_idle), Wait for all processors to update cached/local * copies of pm_idle before proceeding. */ - cpu_idle_wait(); + kick_all_cpus_sync(); } if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0) && (apm_info.connection_version > 0x0100)) { diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 5da1269e8ddc..e2dbcb7dabdd 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -27,21 +27,29 @@ static int num_scan_areas; static __init int set_corruption_check(char *arg) { - char *end; + ssize_t ret; + unsigned long val; - memory_corruption_check = simple_strtol(arg, &end, 10); + ret = kstrtoul(arg, 10, &val); + if (ret) + return ret; - return (*end == 0) ? 0 : -EINVAL; + memory_corruption_check = val; + return 0; } early_param("memory_corruption_check", set_corruption_check); static __init int set_corruption_check_period(char *arg) { - char *end; + ssize_t ret; + unsigned long val; - corruption_check_period = simple_strtoul(arg, &end, 10); + ret = kstrtoul(arg, 10, &val); + if (ret) + return ret; - return (*end == 0) ? 0 : -EINVAL; + corruption_check_period = val; + return 0; } early_param("memory_corruption_check_period", set_corruption_check_period); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 0a44b90602b0..146bb6218eec 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -26,7 +26,8 @@ * contact AMD for precise details and a CPU swap. * * See http://www.multimania.com/poulot/k6bug.html - * http://www.amd.com/K6/k6docs/revgd.html + * and section 2.6.2 of "AMD-K6 Processor Revision Guide - Model 6" + * (Publication # 21266 Issue Date: August 1998) * * The following test is erm.. interesting. AMD neglected to up * the chip setting when fixing the bug but they also tweaked some @@ -94,7 +95,6 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) "system stability may be impaired when more than 32 MB are used.\n"); else printk(KERN_CONT "probably OK (after B9730xxxx).\n"); - printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n"); } /* K6 with old style WHCR */ @@ -353,10 +353,11 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) node = per_cpu(cpu_llc_id, cpu); /* - * If core numbers are inconsistent, it's likely a multi-fabric platform, - * so invoke platform-specific handler + * On multi-fabric platform (e.g. Numascale NumaChip) a + * platform-specific handler needs to be called to fixup some + * IDs of the CPU. */ - if (c->phys_proc_id != node) + if (x86_cpuinit.fixup_cpu_id) x86_cpuinit.fixup_cpu_id(c, node); if (!node_online(node)) { @@ -579,6 +580,24 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } } + /* re-enable TopologyExtensions if switched off by BIOS */ + if ((c->x86 == 0x15) && + (c->x86_model >= 0x10) && (c->x86_model <= 0x1f) && + !cpu_has(c, X86_FEATURE_TOPOEXT)) { + u64 val; + + if (!rdmsrl_amd_safe(0xc0011005, &val)) { + val |= 1ULL << 54; + wrmsrl_amd_safe(0xc0011005, val); + rdmsrl(0xc0011005, val); + if (val & (1ULL << 54)) { + set_cpu_cap(c, X86_FEATURE_TOPOEXT); + printk(KERN_INFO FW_INFO "CPU: Re-enabling " + "disabled Topology Extensions Support\n"); + } + } + } + cpu_detect_cache_sizes(c); /* Multi core CPU? */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 67e258362a3d..82f29e70d058 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1163,15 +1163,6 @@ static void dbg_restore_debug_regs(void) #endif /* ! CONFIG_KGDB */ /* - * Prints an error where the NUMA and configured core-number mismatch and the - * platform didn't override this to fix it up - */ -void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node) -{ - pr_err("NUMA core number %d differs from configured core number %d\n", node, c->phys_proc_id); -} - -/* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT * and IDT. We reload them nevertheless, this function acts as a @@ -1194,7 +1185,7 @@ void __cpuinit cpu_init(void) oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA - if (cpu != 0 && percpu_read(numa_node) == 0 && + if (cpu != 0 && this_cpu_read(numa_node) == 0 && early_cpu_to_node(cpu) != NUMA_NO_NODE) set_numa_node(early_cpu_to_node(cpu)); #endif diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 73d08ed98a64..9a7c90d80bc4 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -433,14 +433,14 @@ int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot, /* check if @slot is already used or the index is already disabled */ ret = amd_get_l3_disable_slot(nb, slot); if (ret >= 0) - return -EINVAL; + return -EEXIST; if (index > nb->l3_cache.indices) return -EINVAL; /* check whether the other slot has disabled the same index already */ if (index == amd_get_l3_disable_slot(nb, !slot)) - return -EINVAL; + return -EEXIST; amd_l3_disable_index(nb, cpu, slot, index); @@ -468,8 +468,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val); if (err) { if (err == -EEXIST) - printk(KERN_WARNING "L3 disable slot %d in use!\n", - slot); + pr_warning("L3 slot %d in use/index already disabled!\n", + slot); return err; } return count; @@ -615,14 +615,14 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) new_l2 = this_leaf.size/1024; num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); - l2_id = c->apicid >> index_msb; + l2_id = c->apicid & ~((1 << index_msb) - 1); break; case 3: new_l3 = this_leaf.size/1024; num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order( num_threads_sharing); - l3_id = c->apicid >> index_msb; + l3_id = c->apicid & ~((1 << index_msb) - 1); break; default: break; diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 5502b289341b..36565373af87 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -23,7 +23,7 @@ * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor) * * Arrays used to match for this should also be declared using - * MODULE_DEVICE_TABLE(x86_cpu, ...) + * MODULE_DEVICE_TABLE(x86cpu, ...) * * This always matches against the boot cpu, assuming models and features are * consistent over all CPUs. diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d086a09c087d..2afcbd253e1d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -583,7 +583,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) struct mce m; int i; - percpu_inc(mce_poll_count); + this_cpu_inc(mce_poll_count); mce_gather_info(&m, NULL); @@ -945,9 +945,10 @@ struct mce_info { atomic_t inuse; struct task_struct *t; __u64 paddr; + int restartable; } mce_info[MCE_INFO_MAX]; -static void mce_save_info(__u64 addr) +static void mce_save_info(__u64 addr, int c) { struct mce_info *mi; @@ -955,6 +956,7 @@ static void mce_save_info(__u64 addr) if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) { mi->t = current; mi->paddr = addr; + mi->restartable = c; return; } } @@ -1015,7 +1017,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) atomic_inc(&mce_entry); - percpu_inc(mce_exception_count); + this_cpu_inc(mce_exception_count); if (!banks) goto out; @@ -1130,7 +1132,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_panic("Fatal machine check on current CPU", &m, msg); if (worst == MCE_AR_SEVERITY) { /* schedule action before return to userland */ - mce_save_info(m.addr); + mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV); set_thread_flag(TIF_MCE_NOTIFY); } else if (kill_it) { force_sig(SIGBUS, current); @@ -1179,7 +1181,13 @@ void mce_notify_process(void) pr_err("Uncorrected hardware memory error in user-access at %llx", mi->paddr); - if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) { + /* + * We must call memory_failure() here even if the current process is + * doomed. We still need to mark the page as poisoned and alert any + * other users of the page. + */ + if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0 || + mi->restartable == 0) { pr_err("Memory error not recovered"); force_sig(SIGBUS, current); } @@ -1423,6 +1431,43 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) */ if (c->x86 == 6 && banks > 0) mce_banks[0].ctl = 0; + + /* + * Turn off MC4_MISC thresholding banks on those models since + * they're not supported there. + */ + if (c->x86 == 0x15 && + (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) { + int i; + u64 val, hwcr; + bool need_toggle; + u32 msrs[] = { + 0x00000413, /* MC4_MISC0 */ + 0xc0000408, /* MC4_MISC1 */ + }; + + rdmsrl(MSR_K7_HWCR, hwcr); + + /* McStatusWrEn has to be set */ + need_toggle = !(hwcr & BIT(18)); + + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); + + for (i = 0; i < ARRAY_SIZE(msrs); i++) { + rdmsrl(msrs[i], val); + + /* CntP bit set? */ + if (val & BIT(62)) { + val &= ~BIT(62); + wrmsrl(msrs[i], val); + } + } + + /* restore old settings */ + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr); + } } if (c->x86_vendor == X86_VENDOR_INTEL) { diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 99b57179f912..f4873a64f46d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -51,6 +51,7 @@ struct threshold_block { unsigned int cpu; u32 address; u16 interrupt_enable; + bool interrupt_capable; u16 threshold_limit; struct kobject kobj; struct list_head miscj; @@ -83,6 +84,21 @@ struct thresh_restart { u16 old_limit; }; +static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits) +{ + /* + * bank 4 supports APIC LVT interrupts implicitly since forever. + */ + if (bank == 4) + return true; + + /* + * IntP: interrupt present; if this bit is set, the thresholding + * bank can generate APIC LVT interrupts + */ + return msr_high_bits & BIT(28); +} + static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) { int msr = (hi & MASK_LVTOFF_HI) >> 20; @@ -104,8 +120,10 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) return 1; }; -/* must be called with correct cpu affinity */ -/* Called via smp_call_function_single() */ +/* + * Called via smp_call_function_single(), must be called with correct + * cpu affinity. + */ static void threshold_restart_bank(void *_tr) { struct thresh_restart *tr = _tr; @@ -128,6 +146,12 @@ static void threshold_restart_bank(void *_tr) (new_count & THRESHOLD_MAX); } + /* clear IntType */ + hi &= ~MASK_INT_TYPE_HI; + + if (!tr->b->interrupt_capable) + goto done; + if (tr->set_lvt_off) { if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) { /* set new lvt offset */ @@ -136,9 +160,10 @@ static void threshold_restart_bank(void *_tr) } } - tr->b->interrupt_enable ? - (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : - (hi &= ~MASK_INT_TYPE_HI); + if (tr->b->interrupt_enable) + hi |= INT_TYPE_APIC; + + done: hi |= MASK_COUNT_EN_HI; wrmsr(tr->b->address, lo, hi); @@ -202,14 +227,17 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (shared_bank[bank] && c->cpu_core_id) break; - offset = setup_APIC_mce(offset, - (high & MASK_LVTOFF_HI) >> 20); - memset(&b, 0, sizeof(b)); - b.cpu = cpu; - b.bank = bank; - b.block = block; - b.address = address; + b.cpu = cpu; + b.bank = bank; + b.block = block; + b.address = address; + b.interrupt_capable = lvt_interrupt_supported(bank, high); + + if (b.interrupt_capable) { + int new = (high & MASK_LVTOFF_HI) >> 20; + offset = setup_APIC_mce(offset, new); + } mce_threshold_block_init(&b, offset); mce_threshold_vector = amd_threshold_interrupt; @@ -309,6 +337,9 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) struct thresh_restart tr; unsigned long new; + if (!b->interrupt_capable) + return -EINVAL; + if (strict_strtoul(buf, 0, &new) < 0) return -EINVAL; @@ -390,10 +421,10 @@ RW_ATTR(threshold_limit); RW_ATTR(error_count); static struct attribute *default_attrs[] = { - &interrupt_enable.attr, &threshold_limit.attr, &error_count.attr, - NULL + NULL, /* possibly interrupt_enable if supported, see below */ + NULL, }; #define to_block(k) container_of(k, struct threshold_block, kobj) @@ -467,8 +498,14 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, b->cpu = cpu; b->address = address; b->interrupt_enable = 0; + b->interrupt_capable = lvt_interrupt_supported(bank, high); b->threshold_limit = THRESHOLD_MAX; + if (b->interrupt_capable) + threshold_ktype.default_attrs[2] = &interrupt_enable.attr; + else + threshold_ktype.default_attrs[2] = NULL; + INIT_LIST_HEAD(&b->miscj); if (per_cpu(threshold_banks, cpu)[bank]->blocks) { diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 65652265fffd..11a4eb9131d5 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -496,6 +496,7 @@ static __initconst const struct x86_pmu amd_pmu = { * 0x023 DE PERF_CTL[2:0] * 0x02D LS PERF_CTL[3] * 0x02E LS PERF_CTL[3,0] + * 0x031 LS PERF_CTL[2:0] (**) * 0x043 CU PERF_CTL[2:0] * 0x045 CU PERF_CTL[2:0] * 0x046 CU PERF_CTL[2:0] @@ -509,10 +510,12 @@ static __initconst const struct x86_pmu amd_pmu = { * 0x0DD LS PERF_CTL[5:0] * 0x0DE LS PERF_CTL[5:0] * 0x0DF LS PERF_CTL[5:0] + * 0x1C0 EX PERF_CTL[5:3] * 0x1D6 EX PERF_CTL[5:0] * 0x1D8 EX PERF_CTL[5:0] * - * (*) depending on the umask all FPU counters may be used + * (*) depending on the umask all FPU counters may be used + * (**) only one unitmask enabled at a time */ static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); @@ -562,6 +565,12 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev return &amd_f15_PMC3; case 0x02E: return &amd_f15_PMC30; + case 0x031: + if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) + return &amd_f15_PMC20; + return &emptyconstraint; + case 0x1C0: + return &amd_f15_PMC53; default: return &amd_f15_PMC50; } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 1b81839b6c88..571246d81edf 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -271,7 +271,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) return 1; - show_registers(regs); + show_regs(regs); #ifdef CONFIG_X86_32 if (user_mode_vm(regs)) { sp = regs->sp; @@ -311,16 +311,33 @@ void die(const char *str, struct pt_regs *regs, long err) static int __init kstack_setup(char *s) { + ssize_t ret; + unsigned long val; + if (!s) return -EINVAL; - kstack_depth_to_print = simple_strtoul(s, NULL, 0); + + ret = kstrtoul(s, 0, &val); + if (ret) + return ret; + kstack_depth_to_print = val; return 0; } early_param("kstack", kstack_setup); static int __init code_bytes_setup(char *s) { - code_bytes = simple_strtoul(s, NULL, 0); + ssize_t ret; + unsigned long val; + + if (!s) + return -EINVAL; + + ret = kstrtoul(s, 0, &val); + if (ret) + return ret; + + code_bytes = val; if (code_bytes > 8192) code_bytes = 8192; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 88ec9129271d..e0b1d783daab 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -82,7 +82,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, } -void show_registers(struct pt_regs *regs) +void show_regs(struct pt_regs *regs) { int i; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 17107bd6e1f0..791b76122aa8 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -245,7 +245,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, show_trace_log_lvl(task, regs, sp, bp, log_lvl); } -void show_registers(struct pt_regs *regs) +void show_regs(struct pt_regs *regs) { int i; unsigned long sp; diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 7b784f4ef1e4..01ccf9b71473 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -56,6 +56,7 @@ #include <asm/irq_vectors.h> #include <asm/cpufeature.h> #include <asm/alternative-asm.h> +#include <asm/asm.h> /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ #include <linux/elf-em.h> @@ -151,10 +152,8 @@ .pushsection .fixup, "ax" 99: movl $0, (%esp) jmp 98b -.section __ex_table, "a" - .align 4 - .long 98b, 99b .popsection + _ASM_EXTABLE(98b,99b) .endm .macro PTGS_TO_GS @@ -164,10 +163,8 @@ .pushsection .fixup, "ax" 99: movl $0, PT_GS(%esp) jmp 98b -.section __ex_table, "a" - .align 4 - .long 98b, 99b .popsection + _ASM_EXTABLE(98b,99b) .endm .macro GS_TO_REG reg @@ -249,12 +246,10 @@ jmp 2b 6: movl $0, (%esp) jmp 3b -.section __ex_table, "a" - .align 4 - .long 1b, 4b - .long 2b, 5b - .long 3b, 6b .popsection + _ASM_EXTABLE(1b,4b) + _ASM_EXTABLE(2b,5b) + _ASM_EXTABLE(3b,6b) POP_GS_EX .endm @@ -415,10 +410,7 @@ sysenter_past_esp: jae syscall_fault 1: movl (%ebp),%ebp movl %ebp,PT_EBP(%esp) -.section __ex_table,"a" - .align 4 - .long 1b,syscall_fault -.previous + _ASM_EXTABLE(1b,syscall_fault) GET_THREAD_INFO(%ebp) @@ -485,10 +477,8 @@ sysexit_audit: .pushsection .fixup,"ax" 2: movl $0,PT_FS(%esp) jmp 1b -.section __ex_table,"a" - .align 4 - .long 1b,2b .popsection + _ASM_EXTABLE(1b,2b) PTGS_TO_GS_EX ENDPROC(ia32_sysenter_target) @@ -543,10 +533,7 @@ ENTRY(iret_exc) pushl $do_iret_error jmp error_code .previous -.section __ex_table,"a" - .align 4 - .long irq_return,iret_exc -.previous + _ASM_EXTABLE(irq_return,iret_exc) CFI_RESTORE_STATE ldt_ss: @@ -901,10 +888,7 @@ END(device_not_available) #ifdef CONFIG_PARAVIRT ENTRY(native_iret) iret -.section __ex_table,"a" - .align 4 - .long native_iret, iret_exc -.previous + _ASM_EXTABLE(native_iret, iret_exc) END(native_iret) ENTRY(native_irq_enable_sysexit) @@ -1093,13 +1077,10 @@ ENTRY(xen_failsafe_callback) movl %eax,16(%esp) jmp 4b .previous -.section __ex_table,"a" - .align 4 - .long 1b,6b - .long 2b,7b - .long 3b,8b - .long 4b,9b -.previous + _ASM_EXTABLE(1b,6b) + _ASM_EXTABLE(2b,7b) + _ASM_EXTABLE(3b,8b) + _ASM_EXTABLE(4b,9b) ENDPROC(xen_failsafe_callback) BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK, diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index cdc79b5cfcd9..320852d02026 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -55,6 +55,7 @@ #include <asm/paravirt.h> #include <asm/ftrace.h> #include <asm/percpu.h> +#include <asm/asm.h> #include <linux/err.h> /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ @@ -900,18 +901,12 @@ restore_args: irq_return: INTERRUPT_RETURN - - .section __ex_table, "a" - .quad irq_return, bad_iret - .previous + _ASM_EXTABLE(irq_return, bad_iret) #ifdef CONFIG_PARAVIRT ENTRY(native_iret) iretq - - .section __ex_table,"a" - .quad native_iret, bad_iret - .previous + _ASM_EXTABLE(native_iret, bad_iret) #endif .section .fixup,"ax" @@ -1181,10 +1176,7 @@ gs_change: CFI_ENDPROC END(native_load_gs_index) - .section __ex_table,"a" - .align 8 - .quad gs_change,bad_gs - .previous + _ASM_EXTABLE(gs_change,bad_gs) .section .fixup,"ax" /* running with kernelgs */ bad_gs: diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 4243e8bbdcb1..32ff36596ab1 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -435,7 +435,7 @@ static void run_sync(void) local_irq_disable(); } -static void ftrace_replace_code(int enable) +void ftrace_replace_code(int enable) { struct ftrace_rec_iter *iter; struct dyn_ftrace *rec; @@ -493,18 +493,7 @@ void arch_ftrace_update_code(int command) { modifying_ftrace_code++; - if (command & FTRACE_UPDATE_CALLS) - ftrace_replace_code(1); - else if (command & FTRACE_DISABLE_CALLS) - ftrace_replace_code(0); - - if (command & FTRACE_UPDATE_TRACE_FUNC) - ftrace_update_ftrace_func(ftrace_trace_function); - - if (command & FTRACE_START_FUNC_RET) - ftrace_enable_ftrace_graph_caller(); - else if (command & FTRACE_STOP_FUNC_RET) - ftrace_disable_ftrace_graph_caller(); + ftrace_modify_all_code(command); modifying_ftrace_code--; } diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index ce0be7cd085e..463c9797ca6a 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -21,6 +21,7 @@ #include <asm/msr-index.h> #include <asm/cpufeature.h> #include <asm/percpu.h> +#include <asm/nops.h> /* Physical address */ #define pa(X) ((X) - __PAGE_OFFSET) @@ -363,28 +364,23 @@ default_entry: pushl $0 popfl -#ifdef CONFIG_SMP - cmpb $0, ready - jnz checkCPUtype -#endif /* CONFIG_SMP */ - /* * start system 32-bit setup. We need to re-do some of the things done * in 16-bit mode for the "real" operations. */ - call setup_idt - -checkCPUtype: - - movl $-1,X86_CPUID # -1 for no CPUID initially - + movl setup_once_ref,%eax + andl %eax,%eax + jz 1f # Did we do this already? + call *%eax +1: + /* check if it is 486 or 386. */ /* * XXX - this does a lot of unnecessary setup. Alignment checks don't * apply at our cpl of 0 and the stack ought to be aligned already, and * we don't need to preserve eflags. */ - + movl $-1,X86_CPUID # -1 for no CPUID initially movb $3,X86 # at least 386 pushfl # push EFLAGS popl %eax # get EFLAGS @@ -450,21 +446,6 @@ is386: movl $2,%ecx # set MP movl $(__KERNEL_PERCPU), %eax movl %eax,%fs # set this cpu's percpu -#ifdef CONFIG_CC_STACKPROTECTOR - /* - * The linker can't handle this by relocation. Manually set - * base address in stack canary segment descriptor. - */ - cmpb $0,ready - jne 1f - movl $gdt_page,%eax - movl $stack_canary,%ecx - movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) - shrl $16, %ecx - movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) - movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax) -1: -#endif movl $(__KERNEL_STACK_CANARY),%eax movl %eax,%gs @@ -473,7 +454,6 @@ is386: movl $2,%ecx # set MP cld # gcc2 wants the direction flag cleared at all times pushl $0 # fake return address for unwinder - movb $1, ready jmp *(initial_code) /* @@ -495,81 +475,122 @@ check_x87: .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */ ret + +#include "verify_cpu.S" + /* - * setup_idt + * setup_once * - * sets up a idt with 256 entries pointing to - * ignore_int, interrupt gates. It doesn't actually load - * idt - that can be done only after paging has been enabled - * and the kernel moved to PAGE_OFFSET. Interrupts - * are enabled elsewhere, when we can be relatively - * sure everything is ok. + * The setup work we only want to run on the BSP. * * Warning: %esi is live across this function. */ -setup_idt: - lea ignore_int,%edx - movl $(__KERNEL_CS << 16),%eax - movw %dx,%ax /* selector = 0x0010 = cs */ - movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ +__INIT +setup_once: + /* + * Set up a idt with 256 entries pointing to ignore_int, + * interrupt gates. It doesn't actually load idt - that needs + * to be done on each CPU. Interrupts are enabled elsewhere, + * when we can be relatively sure everything is ok. + */ - lea idt_table,%edi - mov $256,%ecx -rp_sidt: + movl $idt_table,%edi + movl $early_idt_handlers,%eax + movl $NUM_EXCEPTION_VECTORS,%ecx +1: movl %eax,(%edi) - movl %edx,4(%edi) + movl %eax,4(%edi) + /* interrupt gate, dpl=0, present */ + movl $(0x8E000000 + __KERNEL_CS),2(%edi) + addl $9,%eax addl $8,%edi - dec %ecx - jne rp_sidt + loop 1b -.macro set_early_handler handler,trapno - lea \handler,%edx + movl $256 - NUM_EXCEPTION_VECTORS,%ecx + movl $ignore_int,%edx movl $(__KERNEL_CS << 16),%eax - movw %dx,%ax + movw %dx,%ax /* selector = 0x0010 = cs */ movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ - lea idt_table,%edi - movl %eax,8*\trapno(%edi) - movl %edx,8*\trapno+4(%edi) -.endm +2: + movl %eax,(%edi) + movl %edx,4(%edi) + addl $8,%edi + loop 2b - set_early_handler handler=early_divide_err,trapno=0 - set_early_handler handler=early_illegal_opcode,trapno=6 - set_early_handler handler=early_protection_fault,trapno=13 - set_early_handler handler=early_page_fault,trapno=14 +#ifdef CONFIG_CC_STACKPROTECTOR + /* + * Configure the stack canary. The linker can't handle this by + * relocation. Manually set base address in stack canary + * segment descriptor. + */ + movl $gdt_page,%eax + movl $stack_canary,%ecx + movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) + shrl $16, %ecx + movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) + movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax) +#endif + andl $0,setup_once_ref /* Once is enough, thanks */ ret -early_divide_err: - xor %edx,%edx - pushl $0 /* fake errcode */ - jmp early_fault +ENTRY(early_idt_handlers) + # 36(%esp) %eflags + # 32(%esp) %cs + # 28(%esp) %eip + # 24(%rsp) error code + i = 0 + .rept NUM_EXCEPTION_VECTORS + .if (EXCEPTION_ERRCODE_MASK >> i) & 1 + ASM_NOP2 + .else + pushl $0 # Dummy error code, to make stack frame uniform + .endif + pushl $i # 20(%esp) Vector number + jmp early_idt_handler + i = i + 1 + .endr +ENDPROC(early_idt_handlers) + + /* This is global to keep gas from relaxing the jumps */ +ENTRY(early_idt_handler) + cld + cmpl $2,%ss:early_recursion_flag + je hlt_loop + incl %ss:early_recursion_flag -early_illegal_opcode: - movl $6,%edx - pushl $0 /* fake errcode */ - jmp early_fault + push %eax # 16(%esp) + push %ecx # 12(%esp) + push %edx # 8(%esp) + push %ds # 4(%esp) + push %es # 0(%esp) + movl $(__KERNEL_DS),%eax + movl %eax,%ds + movl %eax,%es -early_protection_fault: - movl $13,%edx - jmp early_fault + cmpl $(__KERNEL_CS),32(%esp) + jne 10f -early_page_fault: - movl $14,%edx - jmp early_fault + leal 28(%esp),%eax # Pointer to %eip + call early_fixup_exception + andl %eax,%eax + jnz ex_entry /* found an exception entry */ -early_fault: - cld +10: #ifdef CONFIG_PRINTK - pusha - movl $(__KERNEL_DS),%eax - movl %eax,%ds - movl %eax,%es - cmpl $2,early_recursion_flag - je hlt_loop - incl early_recursion_flag + xorl %eax,%eax + movw %ax,2(%esp) /* clean up the segment values on some cpus */ + movw %ax,6(%esp) + movw %ax,34(%esp) + leal 40(%esp),%eax + pushl %eax /* %esp before the exception */ + pushl %ebx + pushl %ebp + pushl %esi + pushl %edi movl %cr2,%eax pushl %eax - pushl %edx /* trapno */ + pushl (20+6*4)(%esp) /* trapno */ pushl $fault_msg call printk #endif @@ -578,6 +599,17 @@ hlt_loop: hlt jmp hlt_loop +ex_entry: + pop %es + pop %ds + pop %edx + pop %ecx + pop %eax + addl $8,%esp /* drop vector number and error code */ + decl %ss:early_recursion_flag + iret +ENDPROC(early_idt_handler) + /* This is the default interrupt "handler" :-) */ ALIGN ignore_int: @@ -611,13 +643,18 @@ ignore_int: popl %eax #endif iret +ENDPROC(ignore_int) +__INITDATA + .align 4 +early_recursion_flag: + .long 0 -#include "verify_cpu.S" - - __REFDATA -.align 4 +__REFDATA + .align 4 ENTRY(initial_code) .long i386_start_kernel +ENTRY(setup_once_ref) + .long setup_once /* * BSS section @@ -670,22 +707,19 @@ ENTRY(initial_page_table) ENTRY(stack_start) .long init_thread_union+THREAD_SIZE -early_recursion_flag: - .long 0 - -ready: .byte 0 - +__INITRODATA int_msg: .asciz "Unknown interrupt or fault at: %p %p %p\n" fault_msg: /* fault info: */ .ascii "BUG: Int %d: CR2 %p\n" -/* pusha regs: */ - .ascii " EDI %p ESI %p EBP %p ESP %p\n" - .ascii " EBX %p EDX %p ECX %p EAX %p\n" +/* regs pushed in early_idt_handler: */ + .ascii " EDI %p ESI %p EBP %p EBX %p\n" + .ascii " ESP %p ES %p DS %p\n" + .ascii " EDX %p ECX %p EAX %p\n" /* fault frame: */ - .ascii " err %p EIP %p CS %p flg %p\n" + .ascii " vec %p err %p EIP %p CS %p flg %p\n" .ascii "Stack: %p %p %p %p %p %p %p %p\n" .ascii " %p %p %p %p %p %p %p %p\n" .asciz " %p %p %p %p %p %p %p %p\n" @@ -699,6 +733,7 @@ fault_msg: * segment size, and 32-bit linear address value: */ + .data .globl boot_gdt_descr .globl idt_descr diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 40f4eb3766d1..7a40f2447321 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -19,12 +19,15 @@ #include <asm/cache.h> #include <asm/processor-flags.h> #include <asm/percpu.h> +#include <asm/nops.h> #ifdef CONFIG_PARAVIRT #include <asm/asm-offsets.h> #include <asm/paravirt.h> +#define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg #else -#define GET_CR2_INTO_RCX movq %cr2, %rcx +#define GET_CR2_INTO(reg) movq %cr2, reg +#define INTERRUPT_RETURN iretq #endif /* we are not able to switch in one step to the final KERNEL ADDRESS SPACE @@ -270,36 +273,56 @@ bad_address: jmp bad_address .section ".init.text","ax" -#ifdef CONFIG_EARLY_PRINTK .globl early_idt_handlers early_idt_handlers: + # 104(%rsp) %rflags + # 96(%rsp) %cs + # 88(%rsp) %rip + # 80(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - movl $i, %esi + .if (EXCEPTION_ERRCODE_MASK >> i) & 1 + ASM_NOP2 + .else + pushq $0 # Dummy error code, to make stack frame uniform + .endif + pushq $i # 72(%rsp) Vector number jmp early_idt_handler i = i + 1 .endr -#endif ENTRY(early_idt_handler) -#ifdef CONFIG_EARLY_PRINTK + cld + cmpl $2,early_recursion_flag(%rip) jz 1f incl early_recursion_flag(%rip) - GET_CR2_INTO_RCX - movq %rcx,%r9 - xorl %r8d,%r8d # zero for error code - movl %esi,%ecx # get vector number - # Test %ecx against mask of vectors that push error code. - cmpl $31,%ecx - ja 0f - movl $1,%eax - salq %cl,%rax - testl $0x27d00,%eax - je 0f - popq %r8 # get error code -0: movq 0(%rsp),%rcx # get ip - movq 8(%rsp),%rdx # get cs + + pushq %rax # 64(%rsp) + pushq %rcx # 56(%rsp) + pushq %rdx # 48(%rsp) + pushq %rsi # 40(%rsp) + pushq %rdi # 32(%rsp) + pushq %r8 # 24(%rsp) + pushq %r9 # 16(%rsp) + pushq %r10 # 8(%rsp) + pushq %r11 # 0(%rsp) + + cmpl $__KERNEL_CS,96(%rsp) + jne 10f + + leaq 88(%rsp),%rdi # Pointer to %rip + call early_fixup_exception + andl %eax,%eax + jnz 20f # Found an exception entry + +10: +#ifdef CONFIG_EARLY_PRINTK + GET_CR2_INTO(%r9) # can clobber any volatile register if pv + movl 80(%rsp),%r8d # error code + movl 72(%rsp),%esi # vector number + movl 96(%rsp),%edx # %cs + movq 88(%rsp),%rcx # %rip xorl %eax,%eax leaq early_idt_msg(%rip),%rdi call early_printk @@ -308,17 +331,32 @@ ENTRY(early_idt_handler) call dump_stack #ifdef CONFIG_KALLSYMS leaq early_idt_ripmsg(%rip),%rdi - movq 0(%rsp),%rsi # get rip again + movq 40(%rsp),%rsi # %rip again call __print_symbol #endif #endif /* EARLY_PRINTK */ 1: hlt jmp 1b -#ifdef CONFIG_EARLY_PRINTK +20: # Exception table entry found + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rdi + popq %rsi + popq %rdx + popq %rcx + popq %rax + addq $16,%rsp # drop vector number and error code + decl early_recursion_flag(%rip) + INTERRUPT_RETURN + + .balign 4 early_recursion_flag: .long 0 +#ifdef CONFIG_EARLY_PRINTK early_idt_msg: .asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n" early_idt_ripmsg: diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 7734bcbb5a3a..f250431fb505 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -88,7 +88,7 @@ void kernel_fpu_begin(void) __thread_clear_has_fpu(me); /* We do 'stts()' in kernel_fpu_end() */ } else { - percpu_write(fpu_owner_task, NULL); + this_cpu_write(fpu_owner_task, NULL); clts(); } } @@ -235,6 +235,7 @@ int init_fpu(struct task_struct *tsk) if (tsk_used_math(tsk)) { if (HAVE_HWFP && tsk == current) unlazy_fpu(tsk); + tsk->thread.fpu.last_cpu = ~0; return 0; } diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c deleted file mode 100644 index 43e9ccf44947..000000000000 --- a/arch/x86/kernel/init_task.c +++ /dev/null @@ -1,42 +0,0 @@ -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/init_task.h> -#include <linux/fs.h> -#include <linux/mqueue.h> - -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/desc.h> - -static struct signal_struct init_signals = INIT_SIGNALS(init_signals); -static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); - -/* - * Initial thread structure. - * - * We need to make sure that this is THREAD_SIZE aligned due to the - * way process stacks are handled. This is done by having a special - * "init_task" linker map entry.. - */ -union thread_union init_thread_union __init_task_data = - { INIT_THREAD_INFO(init_task) }; - -/* - * Initial task structure. - * - * All other task structs will be allocated on slabs in fork.c - */ -struct task_struct init_task = INIT_TASK(init_task); -EXPORT_SYMBOL(init_task); - -/* - * per-CPU TSS segments. Threads are completely 'soft' on Linux, - * no more per-task TSS's. The TSS size is kept cacheline-aligned - * so they are allowed to end up in the .data..cacheline_aligned - * section. Since TSS's are completely CPU-local, we want them - * on exact cacheline boundaries, to eliminate cacheline ping-pong. - */ -DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; - diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 58b7f27cb3e9..344faf8d0d62 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -127,8 +127,8 @@ void __cpuinit irq_ctx_init(int cpu) return; irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), - THREAD_FLAGS, - THREAD_ORDER)); + THREADINFO_GFP, + THREAD_SIZE_ORDER)); memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); irqctx->tinfo.cpu = cpu; irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; @@ -137,8 +137,8 @@ void __cpuinit irq_ctx_init(int cpu) per_cpu(hardirq_ctx, cpu) = irqctx; irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), - THREAD_FLAGS, - THREAD_ORDER)); + THREADINFO_GFP, + THREAD_SIZE_ORDER)); memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); irqctx->tinfo.cpu = cpu; irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index e213fc8408d2..e2f751efb7b1 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -1037,9 +1037,9 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) "current sp %p does not match saved sp %p\n", stack_addr(regs), kcb->jprobe_saved_sp); printk(KERN_ERR "Saved registers for jprobe %p\n", jp); - show_registers(saved_regs); + show_regs(saved_regs); printk(KERN_ERR "Current registers\n"); - show_registers(regs); + show_regs(regs); BUG(); } *regs = kcb->jprobe_saved_regs; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index b8ba6e4a27e4..e554e5ad2fe8 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -79,7 +79,6 @@ struct kvm_task_sleep_node { u32 token; int cpu; bool halted; - struct mm_struct *mm; }; static struct kvm_task_sleep_head { @@ -126,9 +125,7 @@ void kvm_async_pf_task_wait(u32 token) n.token = token; n.cpu = smp_processor_id(); - n.mm = current->active_mm; n.halted = idle || preempt_count() > 1; - atomic_inc(&n.mm->mm_count); init_waitqueue_head(&n.wq); hlist_add_head(&n.link, &b->list); spin_unlock(&b->lock); @@ -161,9 +158,6 @@ EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); static void apf_task_wake_one(struct kvm_task_sleep_node *n) { hlist_del_init(&n->link); - if (!n->mm) - return; - mmdrop(n->mm); if (n->halted) smp_send_reschedule(n->cpu); else if (waitqueue_active(&n->wq)) @@ -207,7 +201,7 @@ again: * async PF was not yet handled. * Add dummy entry for the token. */ - n = kmalloc(sizeof(*n), GFP_ATOMIC); + n = kzalloc(sizeof(*n), GFP_ATOMIC); if (!n) { /* * Allocation failed! Busy wait while other cpu @@ -219,7 +213,6 @@ again: } n->token = token; n->cpu = smp_processor_id(); - n->mm = NULL; init_waitqueue_head(&n->wq); hlist_add_head(&n->link, &b->list); } else diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c deleted file mode 100644 index 7eb1e2b97827..000000000000 --- a/arch/x86/kernel/mca_32.c +++ /dev/null @@ -1,476 +0,0 @@ -/* - * Written by Martin Kolinek, February 1996 - * - * Changes: - * - * Chris Beauregard July 28th, 1996 - * - Fixed up integrated SCSI detection - * - * Chris Beauregard August 3rd, 1996 - * - Made mca_info local - * - Made integrated registers accessible through standard function calls - * - Added name field - * - More sanity checking - * - * Chris Beauregard August 9th, 1996 - * - Rewrote /proc/mca - * - * Chris Beauregard January 7th, 1997 - * - Added basic NMI-processing - * - Added more information to mca_info structure - * - * David Weinehall October 12th, 1998 - * - Made a lot of cleaning up in the source - * - Added use of save_flags / restore_flags - * - Added the 'driver_loaded' flag in MCA_adapter - * - Added an alternative implemention of ZP Gu's mca_find_unused_adapter - * - * David Weinehall March 24th, 1999 - * - Fixed the output of 'Driver Installed' in /proc/mca/pos - * - Made the Integrated Video & SCSI show up even if they have id 0000 - * - * Alexander Viro November 9th, 1999 - * - Switched to regular procfs methods - * - * Alfred Arnold & David Weinehall August 23rd, 2000 - * - Added support for Planar POS-registers - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/mca.h> -#include <linux/kprobes.h> -#include <linux/slab.h> -#include <asm/io.h> -#include <linux/proc_fs.h> -#include <linux/mman.h> -#include <linux/mm.h> -#include <linux/pagemap.h> -#include <linux/ioport.h> -#include <asm/uaccess.h> -#include <linux/init.h> - -static unsigned char which_scsi; - -int MCA_bus; -EXPORT_SYMBOL(MCA_bus); - -/* - * Motherboard register spinlock. Untested on SMP at the moment, but - * are there any MCA SMP boxes? - * - * Yes - Alan - */ -static DEFINE_SPINLOCK(mca_lock); - -/* Build the status info for the adapter */ - -static void mca_configure_adapter_status(struct mca_device *mca_dev) -{ - mca_dev->status = MCA_ADAPTER_NONE; - - mca_dev->pos_id = mca_dev->pos[0] - + (mca_dev->pos[1] << 8); - - if (!mca_dev->pos_id && mca_dev->slot < MCA_MAX_SLOT_NR) { - - /* - * id = 0x0000 usually indicates hardware failure, - * however, ZP Gu (zpg@castle.net> reports that his 9556 - * has 0x0000 as id and everything still works. There - * also seem to be an adapter with id = 0x0000; the - * NCR Parallel Bus Memory Card. Until this is confirmed, - * however, this code will stay. - */ - - mca_dev->status = MCA_ADAPTER_ERROR; - - return; - } else if (mca_dev->pos_id != 0xffff) { - - /* - * 0xffff usually indicates that there's no adapter, - * however, some integrated adapters may have 0xffff as - * their id and still be valid. Examples are on-board - * VGA of the 55sx, the integrated SCSI of the 56 & 57, - * and possibly also the 95 ULTIMEDIA. - */ - - mca_dev->status = MCA_ADAPTER_NORMAL; - } - - if ((mca_dev->pos_id == 0xffff || - mca_dev->pos_id == 0x0000) && mca_dev->slot >= MCA_MAX_SLOT_NR) { - int j; - - for (j = 2; j < 8; j++) { - if (mca_dev->pos[j] != 0xff) { - mca_dev->status = MCA_ADAPTER_NORMAL; - break; - } - } - } - - if (!(mca_dev->pos[2] & MCA_ENABLED)) { - - /* enabled bit is in POS 2 */ - - mca_dev->status = MCA_ADAPTER_DISABLED; - } -} /* mca_configure_adapter_status */ - -/*--------------------------------------------------------------------*/ - -static struct resource mca_standard_resources[] = { - { .start = 0x60, .end = 0x60, .name = "system control port B (MCA)" }, - { .start = 0x90, .end = 0x90, .name = "arbitration (MCA)" }, - { .start = 0x91, .end = 0x91, .name = "card Select Feedback (MCA)" }, - { .start = 0x92, .end = 0x92, .name = "system Control port A (MCA)" }, - { .start = 0x94, .end = 0x94, .name = "system board setup (MCA)" }, - { .start = 0x96, .end = 0x97, .name = "POS (MCA)" }, - { .start = 0x100, .end = 0x107, .name = "POS (MCA)" } -}; - -#define MCA_STANDARD_RESOURCES ARRAY_SIZE(mca_standard_resources) - -/* - * mca_read_and_store_pos - read the POS registers into a memory buffer - * @pos: a char pointer to 8 bytes, contains the POS register value on - * successful return - * - * Returns 1 if a card actually exists (i.e. the pos isn't - * all 0xff) or 0 otherwise - */ -static int mca_read_and_store_pos(unsigned char *pos) -{ - int j; - int found = 0; - - for (j = 0; j < 8; j++) { - pos[j] = inb_p(MCA_POS_REG(j)); - if (pos[j] != 0xff) { - /* 0xff all across means no device. 0x00 means - * something's broken, but a device is - * probably there. However, if you get 0x00 - * from a motherboard register it won't matter - * what we find. For the record, on the - * 57SLC, the integrated SCSI adapter has - * 0xffff for the adapter ID, but nonzero for - * other registers. */ - - found = 1; - } - } - return found; -} - -static unsigned char mca_pc_read_pos(struct mca_device *mca_dev, int reg) -{ - unsigned char byte; - unsigned long flags; - - if (reg < 0 || reg >= 8) - return 0; - - spin_lock_irqsave(&mca_lock, flags); - if (mca_dev->pos_register) { - /* Disable adapter setup, enable motherboard setup */ - - outb_p(0, MCA_ADAPTER_SETUP_REG); - outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG); - - byte = inb_p(MCA_POS_REG(reg)); - outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); - } else { - - /* Make sure motherboard setup is off */ - - outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); - - /* Read the appropriate register */ - - outb_p(0x8|(mca_dev->slot & 0xf), MCA_ADAPTER_SETUP_REG); - byte = inb_p(MCA_POS_REG(reg)); - outb_p(0, MCA_ADAPTER_SETUP_REG); - } - spin_unlock_irqrestore(&mca_lock, flags); - - mca_dev->pos[reg] = byte; - - return byte; -} - -static void mca_pc_write_pos(struct mca_device *mca_dev, int reg, - unsigned char byte) -{ - unsigned long flags; - - if (reg < 0 || reg >= 8) - return; - - spin_lock_irqsave(&mca_lock, flags); - - /* Make sure motherboard setup is off */ - - outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); - - /* Read in the appropriate register */ - - outb_p(0x8|(mca_dev->slot&0xf), MCA_ADAPTER_SETUP_REG); - outb_p(byte, MCA_POS_REG(reg)); - outb_p(0, MCA_ADAPTER_SETUP_REG); - - spin_unlock_irqrestore(&mca_lock, flags); - - /* Update the global register list, while we have the byte */ - - mca_dev->pos[reg] = byte; - -} - -/* for the primary MCA bus, we have identity transforms */ -static int mca_dummy_transform_irq(struct mca_device *mca_dev, int irq) -{ - return irq; -} - -static int mca_dummy_transform_ioport(struct mca_device *mca_dev, int port) -{ - return port; -} - -static void *mca_dummy_transform_memory(struct mca_device *mca_dev, void *mem) -{ - return mem; -} - - -static int __init mca_init(void) -{ - unsigned int i, j; - struct mca_device *mca_dev; - unsigned char pos[8]; - short mca_builtin_scsi_ports[] = {0xf7, 0xfd, 0x00}; - struct mca_bus *bus; - - /* - * WARNING: Be careful when making changes here. Putting an adapter - * and the motherboard simultaneously into setup mode may result in - * damage to chips (according to The Indispensable PC Hardware Book - * by Hans-Peter Messmer). Also, we disable system interrupts (so - * that we are not disturbed in the middle of this). - */ - - /* Make sure the MCA bus is present */ - - if (mca_system_init()) { - printk(KERN_ERR "MCA bus system initialisation failed\n"); - return -ENODEV; - } - - if (!MCA_bus) - return -ENODEV; - - printk(KERN_INFO "Micro Channel bus detected.\n"); - - /* All MCA systems have at least a primary bus */ - bus = mca_attach_bus(MCA_PRIMARY_BUS); - if (!bus) - goto out_nomem; - bus->default_dma_mask = 0xffffffffLL; - bus->f.mca_write_pos = mca_pc_write_pos; - bus->f.mca_read_pos = mca_pc_read_pos; - bus->f.mca_transform_irq = mca_dummy_transform_irq; - bus->f.mca_transform_ioport = mca_dummy_transform_ioport; - bus->f.mca_transform_memory = mca_dummy_transform_memory; - - /* get the motherboard device */ - mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL); - if (unlikely(!mca_dev)) - goto out_nomem; - - /* - * We do not expect many MCA interrupts during initialization, - * but let us be safe: - */ - spin_lock_irq(&mca_lock); - - /* Make sure adapter setup is off */ - - outb_p(0, MCA_ADAPTER_SETUP_REG); - - /* Read motherboard POS registers */ - - mca_dev->pos_register = 0x7f; - outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG); - mca_dev->name[0] = 0; - mca_read_and_store_pos(mca_dev->pos); - mca_configure_adapter_status(mca_dev); - /* fake POS and slot for a motherboard */ - mca_dev->pos_id = MCA_MOTHERBOARD_POS; - mca_dev->slot = MCA_MOTHERBOARD; - mca_register_device(MCA_PRIMARY_BUS, mca_dev); - - mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC); - if (unlikely(!mca_dev)) - goto out_unlock_nomem; - - /* Put motherboard into video setup mode, read integrated video - * POS registers, and turn motherboard setup off. - */ - - mca_dev->pos_register = 0xdf; - outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG); - mca_dev->name[0] = 0; - mca_read_and_store_pos(mca_dev->pos); - mca_configure_adapter_status(mca_dev); - /* fake POS and slot for the integrated video */ - mca_dev->pos_id = MCA_INTEGVIDEO_POS; - mca_dev->slot = MCA_INTEGVIDEO; - mca_register_device(MCA_PRIMARY_BUS, mca_dev); - - /* - * Put motherboard into scsi setup mode, read integrated scsi - * POS registers, and turn motherboard setup off. - * - * It seems there are two possible SCSI registers. Martin says that - * for the 56,57, 0xf7 is the one, but fails on the 76. - * Alfredo (apena@vnet.ibm.com) says - * 0xfd works on his machine. We'll try both of them. I figure it's - * a good bet that only one could be valid at a time. This could - * screw up though if one is used for something else on the other - * machine. - */ - - for (i = 0; (which_scsi = mca_builtin_scsi_ports[i]) != 0; i++) { - outb_p(which_scsi, MCA_MOTHERBOARD_SETUP_REG); - if (mca_read_and_store_pos(pos)) - break; - } - if (which_scsi) { - /* found a scsi card */ - mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC); - if (unlikely(!mca_dev)) - goto out_unlock_nomem; - - for (j = 0; j < 8; j++) - mca_dev->pos[j] = pos[j]; - - mca_configure_adapter_status(mca_dev); - /* fake POS and slot for integrated SCSI controller */ - mca_dev->pos_id = MCA_INTEGSCSI_POS; - mca_dev->slot = MCA_INTEGSCSI; - mca_dev->pos_register = which_scsi; - mca_register_device(MCA_PRIMARY_BUS, mca_dev); - } - - /* Turn off motherboard setup */ - - outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); - - /* - * Now loop over MCA slots: put each adapter into setup mode, and - * read its POS registers. Then put adapter setup off. - */ - - for (i = 0; i < MCA_MAX_SLOT_NR; i++) { - outb_p(0x8|(i&0xf), MCA_ADAPTER_SETUP_REG); - if (!mca_read_and_store_pos(pos)) - continue; - - mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC); - if (unlikely(!mca_dev)) - goto out_unlock_nomem; - - for (j = 0; j < 8; j++) - mca_dev->pos[j] = pos[j]; - - mca_dev->driver_loaded = 0; - mca_dev->slot = i; - mca_dev->pos_register = 0; - mca_configure_adapter_status(mca_dev); - mca_register_device(MCA_PRIMARY_BUS, mca_dev); - } - outb_p(0, MCA_ADAPTER_SETUP_REG); - - /* Enable interrupts and return memory start */ - spin_unlock_irq(&mca_lock); - - for (i = 0; i < MCA_STANDARD_RESOURCES; i++) - request_resource(&ioport_resource, mca_standard_resources + i); - - mca_do_proc_init(); - - return 0; - - out_unlock_nomem: - spin_unlock_irq(&mca_lock); - out_nomem: - printk(KERN_EMERG "Failed memory allocation in MCA setup!\n"); - return -ENOMEM; -} - -subsys_initcall(mca_init); - -/*--------------------------------------------------------------------*/ - -static __kprobes void -mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag) -{ - int slot = mca_dev->slot; - - if (slot == MCA_INTEGSCSI) { - printk(KERN_CRIT "NMI: caused by MCA integrated SCSI adapter (%s)\n", - mca_dev->name); - } else if (slot == MCA_INTEGVIDEO) { - printk(KERN_CRIT "NMI: caused by MCA integrated video adapter (%s)\n", - mca_dev->name); - } else if (slot == MCA_MOTHERBOARD) { - printk(KERN_CRIT "NMI: caused by motherboard (%s)\n", - mca_dev->name); - } - - /* More info available in POS 6 and 7? */ - - if (check_flag) { - unsigned char pos6, pos7; - - pos6 = mca_device_read_pos(mca_dev, 6); - pos7 = mca_device_read_pos(mca_dev, 7); - - printk(KERN_CRIT "NMI: POS 6 = 0x%x, POS 7 = 0x%x\n", pos6, pos7); - } - -} /* mca_handle_nmi_slot */ - -/*--------------------------------------------------------------------*/ - -static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data) -{ - struct mca_device *mca_dev = to_mca_device(dev); - unsigned char pos5; - - pos5 = mca_device_read_pos(mca_dev, 5); - - if (!(pos5 & 0x80)) { - /* - * Bit 7 of POS 5 is reset when this adapter has a hardware - * error. Bit 7 it reset if there's error information - * available in POS 6 and 7. - */ - mca_handle_nmi_device(mca_dev, !(pos5 & 0x40)); - return 1; - } - return 0; -} - -void __kprobes mca_handle_nmi(void) -{ - /* - * First try - scan the various adapters and see if a specific - * adapter was responsible for the error. - */ - bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback); -} diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 73465aab28f8..8a2ce8fd41c0 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -82,11 +82,6 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu); - if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { - pr_warning("CPU%d: family %d not supported\n", cpu, c->x86); - return -1; - } - csig->rev = c->microcode; pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev); @@ -380,6 +375,13 @@ static struct microcode_ops microcode_amd_ops = { struct microcode_ops * __init init_amd_microcode(void) { + struct cpuinfo_x86 *c = &cpu_data(0); + + if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { + pr_warning("AMD CPU family 0x%x not supported\n", c->x86); + return NULL; + } + patch = (void *)get_zeroed_page(GFP_KERNEL); if (!patch) return NULL; diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 87a0f8688301..fbdfc6917180 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -299,12 +299,11 @@ static ssize_t reload_store(struct device *dev, { unsigned long val; int cpu = dev->id; - int ret = 0; - char *end; + ssize_t ret = 0; - val = simple_strtoul(buf, &end, 0); - if (end == buf) - return -EINVAL; + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; if (val == 1) { get_online_cpus(); @@ -419,10 +418,8 @@ static int mc_device_add(struct device *dev, struct subsys_interface *sif) if (err) return err; - if (microcode_init_cpu(cpu) == UCODE_ERROR) { - sysfs_remove_group(&dev->kobj, &mc_attr_group); + if (microcode_init_cpu(cpu) == UCODE_ERROR) return -EINVAL; - } return err; } @@ -528,11 +525,11 @@ static int __init microcode_init(void) microcode_ops = init_intel_microcode(); else if (c->x86_vendor == X86_VENDOR_AMD) microcode_ops = init_amd_microcode(); - - if (!microcode_ops) { + else pr_err("no support for this CPU vendor\n"); + + if (!microcode_ops) return -ENODEV; - } microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0); diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 3ca42d0e43a2..0327e2b3c408 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -147,12 +147,6 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) memset(csig, 0, sizeof(*csig)); - if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || - cpu_has(c, X86_FEATURE_IA64)) { - pr_err("CPU%d not a capable Intel processor\n", cpu_num); - return -1; - } - csig->sig = cpuid_eax(0x00000001); if ((c->x86_model >= 5) || (c->x86 > 6)) { @@ -463,6 +457,14 @@ static struct microcode_ops microcode_intel_ops = { struct microcode_ops * __init init_intel_microcode(void) { + struct cpuinfo_x86 *c = &cpu_data(0); + + if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || + cpu_has(c, X86_FEATURE_IA64)) { + pr_err("Intel CPU family 0x%x not supported\n", c->x86); + return NULL; + } + return µcode_intel_ops; } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index ca470e4c92dc..b02d4dd6b8a3 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -97,7 +97,7 @@ static void __init MP_bus_info(struct mpc_bus *m) set_bit(m->busid, mp_bus_not_pci); if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { -#if defined(CONFIG_EISA) || defined(CONFIG_MCA) +#ifdef CONFIG_EISA mp_bus_id_to_type[m->busid] = MP_BUS_ISA; #endif } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { @@ -105,12 +105,10 @@ static void __init MP_bus_info(struct mpc_bus *m) x86_init.mpparse.mpc_oem_pci_bus(m); clear_bit(m->busid, mp_bus_not_pci); -#if defined(CONFIG_EISA) || defined(CONFIG_MCA) +#ifdef CONFIG_EISA mp_bus_id_to_type[m->busid] = MP_BUS_PCI; } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { mp_bus_id_to_type[m->busid] = MP_BUS_EISA; - } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) { - mp_bus_id_to_type[m->busid] = MP_BUS_MCA; #endif } else printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); @@ -368,9 +366,6 @@ static void __init construct_ioapic_table(int mpc_default_type) case 3: memcpy(bus.bustype, "EISA ", 6); break; - case 4: - case 7: - memcpy(bus.bustype, "MCA ", 6); } MP_bus_info(&bus); if (mpc_default_type > 4) { @@ -623,7 +618,7 @@ void __init default_find_smp_config(void) return; /* * If it is an SMP machine we should know now, unless the - * configuration is in an EISA/MCA bus machine with an + * configuration is in an EISA bus machine with an * extended bios data area. * * there is a real-mode segmented pointer pointing to the diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index eb1539eac393..90875279ef3d 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -19,8 +19,6 @@ #include <linux/slab.h> #include <linux/export.h> -#include <linux/mca.h> - #if defined(CONFIG_EDAC) #include <linux/edac.h> #endif @@ -31,14 +29,6 @@ #include <asm/nmi.h> #include <asm/x86_init.h> -#define NMI_MAX_NAMELEN 16 -struct nmiaction { - struct list_head list; - nmi_handler_t handler; - unsigned int flags; - char *name; -}; - struct nmi_desc { spinlock_t lock; struct list_head head; @@ -54,6 +44,14 @@ static struct nmi_desc nmi_desc[NMI_MAX] = .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock), .head = LIST_HEAD_INIT(nmi_desc[1].head), }, + { + .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock), + .head = LIST_HEAD_INIT(nmi_desc[2].head), + }, + { + .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock), + .head = LIST_HEAD_INIT(nmi_desc[3].head), + }, }; @@ -107,11 +105,14 @@ static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2 return handled; } -static int __setup_nmi(unsigned int type, struct nmiaction *action) +int __register_nmi_handler(unsigned int type, struct nmiaction *action) { struct nmi_desc *desc = nmi_to_desc(type); unsigned long flags; + if (!action->handler) + return -EINVAL; + spin_lock_irqsave(&desc->lock, flags); /* @@ -120,6 +121,8 @@ static int __setup_nmi(unsigned int type, struct nmiaction *action) * to manage expectations */ WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head)); + WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head)); + WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head)); /* * some handlers need to be executed first otherwise a fake @@ -133,8 +136,9 @@ static int __setup_nmi(unsigned int type, struct nmiaction *action) spin_unlock_irqrestore(&desc->lock, flags); return 0; } +EXPORT_SYMBOL(__register_nmi_handler); -static struct nmiaction *__free_nmi(unsigned int type, const char *name) +void unregister_nmi_handler(unsigned int type, const char *name) { struct nmi_desc *desc = nmi_to_desc(type); struct nmiaction *n; @@ -157,61 +161,16 @@ static struct nmiaction *__free_nmi(unsigned int type, const char *name) spin_unlock_irqrestore(&desc->lock, flags); synchronize_rcu(); - return (n); } - -int register_nmi_handler(unsigned int type, nmi_handler_t handler, - unsigned long nmiflags, const char *devname) -{ - struct nmiaction *action; - int retval = -ENOMEM; - - if (!handler) - return -EINVAL; - - action = kzalloc(sizeof(struct nmiaction), GFP_KERNEL); - if (!action) - goto fail_action; - - action->handler = handler; - action->flags = nmiflags; - action->name = kstrndup(devname, NMI_MAX_NAMELEN, GFP_KERNEL); - if (!action->name) - goto fail_action_name; - - retval = __setup_nmi(type, action); - - if (retval) - goto fail_setup_nmi; - - return retval; - -fail_setup_nmi: - kfree(action->name); -fail_action_name: - kfree(action); -fail_action: - - return retval; -} -EXPORT_SYMBOL_GPL(register_nmi_handler); - -void unregister_nmi_handler(unsigned int type, const char *name) -{ - struct nmiaction *a; - - a = __free_nmi(type, name); - if (a) { - kfree(a->name); - kfree(a); - } -} - EXPORT_SYMBOL_GPL(unregister_nmi_handler); static __kprobes void pci_serr_error(unsigned char reason, struct pt_regs *regs) { + /* check to see if anyone registered against these types of errors */ + if (nmi_handle(NMI_SERR, regs, false)) + return; + pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", reason, smp_processor_id()); @@ -241,10 +200,14 @@ io_check_error(unsigned char reason, struct pt_regs *regs) { unsigned long i; + /* check to see if anyone registered against these types of errors */ + if (nmi_handle(NMI_IO_CHECK, regs, false)) + return; + pr_emerg( "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n", reason, smp_processor_id()); - show_registers(regs); + show_regs(regs); if (panic_on_io_nmi) panic("NMI IOCK error: Not continuing"); @@ -282,16 +245,6 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) __this_cpu_add(nmi_stats.unknown, 1); -#ifdef CONFIG_MCA - /* - * Might actually be able to figure out what the guilty party - * is: - */ - if (MCA_bus) { - mca_handle_nmi(); - return; - } -#endif pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", reason, smp_processor_id()); diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index 2c39dcd510fa..e31bf8d5c4d2 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -13,6 +13,7 @@ #include <linux/cpumask.h> #include <linux/delay.h> #include <linux/init.h> +#include <linux/percpu.h> #include <asm/apic.h> #include <asm/nmi.h> @@ -117,15 +118,15 @@ static void __init dotest(void (*testcase_fn)(void), int expected) unexpected_testcase_failures++; if (nmi_fail == FAILURE) - printk("FAILED |"); + printk(KERN_CONT "FAILED |"); else if (nmi_fail == TIMEOUT) - printk("TIMEOUT|"); + printk(KERN_CONT "TIMEOUT|"); else - printk("ERROR |"); + printk(KERN_CONT "ERROR |"); dump_stack(); } else { testcase_successes++; - printk(" ok |"); + printk(KERN_CONT " ok |"); } testcase_total++; @@ -150,10 +151,10 @@ void __init nmi_selftest(void) print_testname("remote IPI"); dotest(remote_ipi, SUCCESS); - printk("\n"); + printk(KERN_CONT "\n"); print_testname("local IPI"); dotest(local_ipi, SUCCESS); - printk("\n"); + printk(KERN_CONT "\n"); cleanup_nmi_testsuite(); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index ab137605e694..9ce885996fd7 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -241,16 +241,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA static inline void enter_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); + BUG_ON(this_cpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); - percpu_write(paravirt_lazy_mode, mode); + this_cpu_write(paravirt_lazy_mode, mode); } static void leave_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(percpu_read(paravirt_lazy_mode) != mode); + BUG_ON(this_cpu_read(paravirt_lazy_mode) != mode); - percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); + this_cpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); } void paravirt_enter_lazy_mmu(void) @@ -267,7 +267,7 @@ void paravirt_start_context_switch(struct task_struct *prev) { BUG_ON(preemptible()); - if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { + if (this_cpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { arch_leave_lazy_mmu_mode(); set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); } @@ -289,7 +289,7 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) if (in_interrupt()) return PARAVIRT_LAZY_NONE; - return percpu_read(paravirt_lazy_mode); + return this_cpu_read(paravirt_lazy_mode); } void arch_flush_lazy_mmu_mode(void) diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index d0b2fb9ccbb1..b72838bae64a 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1480,8 +1480,9 @@ cleanup: static int __init calgary_parse_options(char *p) { unsigned int bridge; + unsigned long val; size_t len; - char* endp; + ssize_t ret; while (*p) { if (!strncmp(p, "64k", 3)) @@ -1512,10 +1513,11 @@ static int __init calgary_parse_options(char *p) ++p; if (*p == '\0') break; - bridge = simple_strtoul(p, &endp, 0); - if (p == endp) + ret = kstrtoul(p, 0, &val); + if (ret) break; + bridge = val; if (bridge < MAX_PHB_BUS_NUM) { printk(KERN_INFO "Calgary: disabling " "translation for PHB %#x\n", bridge); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 1d92a5ab6e8b..735279e54e59 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -27,6 +27,15 @@ #include <asm/debugreg.h> #include <asm/nmi.h> +/* + * per-CPU TSS segments. Threads are completely 'soft' on Linux, + * no more per-task TSS's. The TSS size is kept cacheline-aligned + * so they are allowed to end up in the .data..cacheline_aligned + * section. Since TSS's are completely CPU-local, we want them + * on exact cacheline boundaries, to eliminate cacheline ping-pong. + */ +DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; + #ifdef CONFIG_X86_64 static DEFINE_PER_CPU(unsigned char, is_idle); static ATOMIC_NOTIFIER_HEAD(idle_notifier); @@ -47,10 +56,16 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister); struct kmem_cache *task_xstate_cachep; EXPORT_SYMBOL_GPL(task_xstate_cachep); +/* + * this gets called so that we can store lazy state into memory and copy the + * current task into the new thread. + */ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { int ret; + unlazy_fpu(src); + *dst = *src; if (fpu_allocated(&src->thread.fpu)) { memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu)); @@ -67,10 +82,9 @@ void free_thread_xstate(struct task_struct *tsk) fpu_free(&tsk->thread.fpu); } -void free_thread_info(struct thread_info *ti) +void arch_release_task_struct(struct task_struct *tsk) { - free_thread_xstate(ti->task); - free_pages((unsigned long)ti, THREAD_ORDER); + free_thread_xstate(tsk); } void arch_task_cache_init(void) @@ -81,6 +95,16 @@ void arch_task_cache_init(void) SLAB_PANIC | SLAB_NOTRACK, NULL); } +static inline void drop_fpu(struct task_struct *tsk) +{ + /* + * Forget coprocessor state.. + */ + tsk->fpu_counter = 0; + clear_fpu(tsk); + clear_used_math(); +} + /* * Free current thread data structures etc.. */ @@ -103,12 +127,8 @@ void exit_thread(void) put_cpu(); kfree(bp); } -} -void show_regs(struct pt_regs *regs) -{ - show_registers(regs); - show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 0); + drop_fpu(me); } void show_regs_common(void) @@ -143,12 +163,7 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - /* - * Forget coprocessor state.. - */ - tsk->fpu_counter = 0; - clear_fpu(tsk); - clear_used_math(); + drop_fpu(tsk); } static void hard_disable_TSC(void) @@ -377,7 +392,7 @@ static inline void play_dead(void) #ifdef CONFIG_X86_64 void enter_idle(void) { - percpu_write(is_idle, 1); + this_cpu_write(is_idle, 1); atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); } @@ -516,26 +531,6 @@ void stop_this_cpu(void *dummy) } } -static void do_nothing(void *unused) -{ -} - -/* - * cpu_idle_wait - Used to ensure that all the CPUs discard old value of - * pm_idle and update to new pm_idle value. Required while changing pm_idle - * handler on SMP systems. - * - * Caller must have changed pm_idle to the new value before the call. Old - * pm_idle value will not be used by any CPU after the return of this function. - */ -void cpu_idle_wait(void) -{ - smp_mb(); - /* kick all the CPUs so that they exit out of pm_idle */ - smp_call_function(do_nothing, NULL, 1); -} -EXPORT_SYMBOL_GPL(cpu_idle_wait); - /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { @@ -594,9 +589,17 @@ int mwait_usable(const struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; + /* Use mwait if idle=mwait boot option is given */ if (boot_option_idle_override == IDLE_FORCE_MWAIT) return 1; + /* + * Any idle= boot option other than idle=mwait means that we must not + * use mwait. Eg: idle=halt or idle=poll or idle=nomwait + */ + if (boot_option_idle_override != IDLE_NO_OVERRIDE) + return 0; + if (c->cpuid_level < MWAIT_INFO) return 0; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index ae6847303e26..516fa186121b 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -126,15 +126,6 @@ void release_thread(struct task_struct *dead_task) release_vm86_irqs(dead_task); } -/* - * This gets called before we allocate a new thread and copy - * the current task into it. - */ -void prepare_to_copy(struct task_struct *tsk) -{ - unlazy_fpu(tsk); -} - int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) @@ -302,7 +293,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) switch_fpu_finish(next_p, fpu); - percpu_write(current_task, next_p); + this_cpu_write(current_task, next_p); return prev_p; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 733ca39f367e..61cdf7fdf099 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -145,15 +145,6 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls) return get_desc_base(&t->thread.tls_array[tls]); } -/* - * This gets called before we allocate a new thread and copy - * the current task into it. - */ -void prepare_to_copy(struct task_struct *tsk) -{ - unlazy_fpu(tsk); -} - int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) @@ -237,7 +228,7 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, current->thread.usersp = new_sp; regs->ip = new_ip; regs->sp = new_sp; - percpu_write(old_rsp, new_sp); + this_cpu_write(old_rsp, new_sp); regs->cs = _cs; regs->ss = _ss; regs->flags = X86_EFLAGS_IF; @@ -359,11 +350,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch the PDA and FPU contexts. */ - prev->usersp = percpu_read(old_rsp); - percpu_write(old_rsp, next->usersp); - percpu_write(current_task, next_p); + prev->usersp = this_cpu_read(old_rsp); + this_cpu_write(old_rsp, next->usersp); + this_cpu_write(current_task, next_p); - percpu_write(kernel_stack, + this_cpu_write(kernel_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE - KERNEL_STACK_OFFSET); @@ -423,6 +414,7 @@ void set_personality_ia32(bool x32) current_thread_info()->status |= TS_COMPAT; } } +EXPORT_SYMBOL_GPL(set_personality_ia32); unsigned long get_wchan(struct task_struct *p) { diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 685845cf16e0..13b1990c7c58 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1480,7 +1480,11 @@ long syscall_trace_enter(struct pt_regs *regs) regs->flags |= X86_EFLAGS_TF; /* do the secure computing check first */ - secure_computing(regs->orig_ax); + if (secure_computing(regs->orig_ax)) { + /* seccomp failures shouldn't expose any additional code. */ + ret = -1L; + goto out; + } if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) ret = -1L; @@ -1505,6 +1509,7 @@ long syscall_trace_enter(struct pt_regs *regs) regs->dx, regs->r10); #endif +out: return ret ?: regs->orig_ax; } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index d840e69a853c..77215c23fba1 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -39,7 +39,8 @@ static int reboot_mode; enum reboot_type reboot_type = BOOT_ACPI; int reboot_force; -/* This variable is used privately to keep track of whether or not +/* + * This variable is used privately to keep track of whether or not * reboot_type is still set to its default value (i.e., reboot= hasn't * been set on the command line). This is needed so that we can * suppress DMI scanning for reboot quirks. Without it, it's @@ -51,7 +52,8 @@ static int reboot_default = 1; static int reboot_cpu = -1; #endif -/* This is set if we need to go through the 'emergency' path. +/* + * This is set if we need to go through the 'emergency' path. * When machine_emergency_restart() is called, we may be on * an inconsistent state and won't be able to do a clean cleanup */ @@ -60,22 +62,24 @@ static int reboot_emergency; /* This is set by the PCI code if either type 1 or type 2 PCI is detected */ bool port_cf9_safe = false; -/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci] - warm Don't set the cold reboot flag - cold Set the cold reboot flag - bios Reboot by jumping through the BIOS (only for X86_32) - smp Reboot by executing reset on BSP or other CPU (only for X86_32) - triple Force a triple fault (init) - kbd Use the keyboard controller. cold reset (default) - acpi Use the RESET_REG in the FADT - efi Use efi reset_system runtime service - pci Use the so-called "PCI reset register", CF9 - force Avoid anything that could hang. +/* + * reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci] + * warm Don't set the cold reboot flag + * cold Set the cold reboot flag + * bios Reboot by jumping through the BIOS (only for X86_32) + * smp Reboot by executing reset on BSP or other CPU (only for X86_32) + * triple Force a triple fault (init) + * kbd Use the keyboard controller. cold reset (default) + * acpi Use the RESET_REG in the FADT + * efi Use efi reset_system runtime service + * pci Use the so-called "PCI reset register", CF9 + * force Avoid anything that could hang. */ static int __init reboot_setup(char *str) { for (;;) { - /* Having anything passed on the command line via + /* + * Having anything passed on the command line via * reboot= will cause us to disable DMI checking * below. */ @@ -98,9 +102,11 @@ static int __init reboot_setup(char *str) if (isdigit(*(str+2))) reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0'); } - /* we will leave sorting out the final value - when we are ready to reboot, since we might not - have detected BSP APIC ID or smp_num_cpu */ + /* + * We will leave sorting out the final value + * when we are ready to reboot, since we might not + * have detected BSP APIC ID or smp_num_cpu + */ break; #endif /* CONFIG_SMP */ @@ -150,6 +156,82 @@ static int __init set_bios_reboot(const struct dmi_system_id *d) return 0; } +extern const unsigned char machine_real_restart_asm[]; +extern const u64 machine_real_restart_gdt[3]; + +void machine_real_restart(unsigned int type) +{ + void *restart_va; + unsigned long restart_pa; + void (*restart_lowmem)(unsigned int); + u64 *lowmem_gdt; + + local_irq_disable(); + + /* + * Write zero to CMOS register number 0x0f, which the BIOS POST + * routine will recognize as telling it to do a proper reboot. (Well + * that's what this book in front of me says -- it may only apply to + * the Phoenix BIOS though, it's not clear). At the same time, + * disable NMIs by setting the top bit in the CMOS address register, + * as we're about to do peculiar things to the CPU. I'm not sure if + * `outb_p' is needed instead of just `outb'. Use it to be on the + * safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.) + */ + spin_lock(&rtc_lock); + CMOS_WRITE(0x00, 0x8f); + spin_unlock(&rtc_lock); + + /* + * Switch back to the initial page table. + */ + load_cr3(initial_page_table); + + /* + * Write 0x1234 to absolute memory location 0x472. The BIOS reads + * this on booting to tell it to "Bypass memory test (also warm + * boot)". This seems like a fairly standard thing that gets set by + * REBOOT.COM programs, and the previous reset routine did this + * too. */ + *((unsigned short *)0x472) = reboot_mode; + + /* Patch the GDT in the low memory trampoline */ + lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt); + + restart_va = TRAMPOLINE_SYM(machine_real_restart_asm); + restart_pa = virt_to_phys(restart_va); + restart_lowmem = (void (*)(unsigned int))restart_pa; + + /* GDT[0]: GDT self-pointer */ + lowmem_gdt[0] = + (u64)(sizeof(machine_real_restart_gdt) - 1) + + ((u64)virt_to_phys(lowmem_gdt) << 16); + /* GDT[1]: 64K real mode code segment */ + lowmem_gdt[1] = + GDT_ENTRY(0x009b, restart_pa, 0xffff); + + /* Jump to the identity-mapped low memory code */ + restart_lowmem(type); +} +#ifdef CONFIG_APM_MODULE +EXPORT_SYMBOL(machine_real_restart); +#endif + +#endif /* CONFIG_X86_32 */ + +/* + * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot + */ +static int __init set_pci_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_CF9) { + reboot_type = BOOT_CF9; + printk(KERN_INFO "%s series board detected. " + "Selecting PCI-method for reboots.\n", d->ident); + } + return 0; +} + static int __init set_kbd_reboot(const struct dmi_system_id *d) { if (reboot_type != BOOT_KBD) { @@ -159,7 +241,12 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d) return 0; } +/* + * This is a single dmi_table handling all reboot quirks. Note that + * REBOOT_BIOS is only available for 32bit + */ static struct dmi_system_id __initdata reboot_dmi_table[] = { +#ifdef CONFIG_X86_32 { /* Handle problems with rebooting on Dell E520's */ .callback = set_bios_reboot, .ident = "Dell E520", @@ -184,7 +271,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"), }, }, - { /* Handle problems with rebooting on Dell Optiplex 745's SFF*/ + { /* Handle problems with rebooting on Dell Optiplex 745's SFF */ .callback = set_bios_reboot, .ident = "Dell OptiPlex 745", .matches = { @@ -192,7 +279,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), }, }, - { /* Handle problems with rebooting on Dell Optiplex 745's DFF*/ + { /* Handle problems with rebooting on Dell Optiplex 745's DFF */ .callback = set_bios_reboot, .ident = "Dell OptiPlex 745", .matches = { @@ -201,7 +288,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "0MM599"), }, }, - { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */ + { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */ .callback = set_bios_reboot, .ident = "Dell OptiPlex 745", .matches = { @@ -210,7 +297,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "0KW626"), }, }, - { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */ + { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */ .callback = set_bios_reboot, .ident = "Dell OptiPlex 330", .matches = { @@ -219,7 +306,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "0KP561"), }, }, - { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */ + { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */ .callback = set_bios_reboot, .ident = "Dell OptiPlex 360", .matches = { @@ -228,7 +315,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "0T656F"), }, }, - { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G*/ + { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */ .callback = set_bios_reboot, .ident = "Dell OptiPlex 760", .matches = { @@ -301,7 +388,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), }, }, - { /* Handle problems with rebooting on ASUS P4S800 */ + { /* Handle problems with rebooting on ASUS P4S800 */ .callback = set_bios_reboot, .ident = "ASUS P4S800", .matches = { @@ -309,7 +396,9 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "P4S800"), }, }, - { /* Handle reboot issue on Acer Aspire one */ +#endif /* CONFIG_X86_32 */ + + { /* Handle reboot issue on Acer Aspire one */ .callback = set_kbd_reboot, .ident = "Acer Aspire One A110", .matches = { @@ -317,96 +406,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"), }, }, - { } -}; - -static int __init reboot_init(void) -{ - /* Only do the DMI check if reboot_type hasn't been overridden - * on the command line - */ - if (reboot_default) { - dmi_check_system(reboot_dmi_table); - } - return 0; -} -core_initcall(reboot_init); - -extern const unsigned char machine_real_restart_asm[]; -extern const u64 machine_real_restart_gdt[3]; - -void machine_real_restart(unsigned int type) -{ - void *restart_va; - unsigned long restart_pa; - void (*restart_lowmem)(unsigned int); - u64 *lowmem_gdt; - - local_irq_disable(); - - /* Write zero to CMOS register number 0x0f, which the BIOS POST - routine will recognize as telling it to do a proper reboot. (Well - that's what this book in front of me says -- it may only apply to - the Phoenix BIOS though, it's not clear). At the same time, - disable NMIs by setting the top bit in the CMOS address register, - as we're about to do peculiar things to the CPU. I'm not sure if - `outb_p' is needed instead of just `outb'. Use it to be on the - safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.) - */ - spin_lock(&rtc_lock); - CMOS_WRITE(0x00, 0x8f); - spin_unlock(&rtc_lock); - - /* - * Switch back to the initial page table. - */ - load_cr3(initial_page_table); - - /* Write 0x1234 to absolute memory location 0x472. The BIOS reads - this on booting to tell it to "Bypass memory test (also warm - boot)". This seems like a fairly standard thing that gets set by - REBOOT.COM programs, and the previous reset routine did this - too. */ - *((unsigned short *)0x472) = reboot_mode; - - /* Patch the GDT in the low memory trampoline */ - lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt); - - restart_va = TRAMPOLINE_SYM(machine_real_restart_asm); - restart_pa = virt_to_phys(restart_va); - restart_lowmem = (void (*)(unsigned int))restart_pa; - - /* GDT[0]: GDT self-pointer */ - lowmem_gdt[0] = - (u64)(sizeof(machine_real_restart_gdt) - 1) + - ((u64)virt_to_phys(lowmem_gdt) << 16); - /* GDT[1]: 64K real mode code segment */ - lowmem_gdt[1] = - GDT_ENTRY(0x009b, restart_pa, 0xffff); - - /* Jump to the identity-mapped low memory code */ - restart_lowmem(type); -} -#ifdef CONFIG_APM_MODULE -EXPORT_SYMBOL(machine_real_restart); -#endif - -#endif /* CONFIG_X86_32 */ - -/* - * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot - */ -static int __init set_pci_reboot(const struct dmi_system_id *d) -{ - if (reboot_type != BOOT_CF9) { - reboot_type = BOOT_CF9; - printk(KERN_INFO "%s series board detected. " - "Selecting PCI-method for reboots.\n", d->ident); - } - return 0; -} - -static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { { /* Handle problems with rebooting on Apple MacBook5 */ .callback = set_pci_reboot, .ident = "Apple MacBook5", @@ -474,17 +473,17 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { { } }; -static int __init pci_reboot_init(void) +static int __init reboot_init(void) { - /* Only do the DMI check if reboot_type hasn't been overridden + /* + * Only do the DMI check if reboot_type hasn't been overridden * on the command line */ - if (reboot_default) { - dmi_check_system(pci_reboot_dmi_table); - } + if (reboot_default) + dmi_check_system(reboot_dmi_table); return 0; } -core_initcall(pci_reboot_init); +core_initcall(reboot_init); static inline void kb_wait(void) { @@ -502,14 +501,14 @@ static void vmxoff_nmi(int cpu, struct pt_regs *regs) cpu_emergency_vmxoff(); } -/* Use NMIs as IPIs to tell all CPUs to disable virtualization - */ +/* Use NMIs as IPIs to tell all CPUs to disable virtualization */ static void emergency_vmx_disable_all(void) { /* Just make sure we won't change CPUs while doing this */ local_irq_disable(); - /* We need to disable VMX on all CPUs before rebooting, otherwise + /* + * We need to disable VMX on all CPUs before rebooting, otherwise * we risk hanging up the machine, because the CPU ignore INIT * signals when VMX is enabled. * @@ -528,8 +527,7 @@ static void emergency_vmx_disable_all(void) * is still enabling VMX. */ if (cpu_has_vmx() && cpu_vmx_enabled()) { - /* Disable VMX on this CPU. - */ + /* Disable VMX on this CPU. */ cpu_vmxoff(); /* Halt and disable VMX on the other CPUs */ @@ -574,12 +572,12 @@ static void native_machine_emergency_restart(void) /* Could also try the reset bit in the Hammer NB */ switch (reboot_type) { case BOOT_KBD: - mach_reboot_fixups(); /* for board specific fixups */ + mach_reboot_fixups(); /* For board specific fixups */ for (i = 0; i < 10; i++) { kb_wait(); udelay(50); - outb(0xfe, 0x64); /* pulse reset low */ + outb(0xfe, 0x64); /* Pulse reset low */ udelay(50); } if (attempt == 0 && orig_reboot_type == BOOT_ACPI) { @@ -621,7 +619,7 @@ static void native_machine_emergency_restart(void) case BOOT_CF9: port_cf9_safe = true; - /* fall through */ + /* Fall through */ case BOOT_CF9_COND: if (port_cf9_safe) { @@ -659,7 +657,8 @@ void native_machine_shutdown(void) /* Make certain I only run on the appropriate processor */ set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id)); - /* O.K Now that I'm on the appropriate processor, + /* + * O.K Now that I'm on the appropriate processor, * stop all of the others. */ stop_other_cpus(); @@ -697,12 +696,11 @@ static void native_machine_restart(char *__unused) static void native_machine_halt(void) { - /* stop other cpus and apics */ + /* Stop other cpus and apics */ machine_shutdown(); tboot_shutdown(TB_SHUTDOWN_HALT); - /* stop this cpu */ stop_this_cpu(NULL); } @@ -713,7 +711,7 @@ static void native_machine_power_off(void) machine_shutdown(); pm_power_off(); } - /* a fallback in case there is no PM info available */ + /* A fallback in case there is no PM info available */ tboot_shutdown(TB_SHUTDOWN_HALT); } @@ -775,7 +773,8 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) cpu = raw_smp_processor_id(); - /* Don't do anything if this handler is invoked on crashing cpu. + /* + * Don't do anything if this handler is invoked on crashing cpu. * Otherwise, system will completely hang. Crashing cpu can get * an NMI if system was initially booted with nmi_watchdog parameter. */ @@ -799,7 +798,8 @@ static void smp_send_nmi_allbutself(void) apic->send_IPI_allbutself(NMI_VECTOR); } -/* Halt all other CPUs, calling the specified function on each of them +/* + * Halt all other CPUs, calling the specified function on each of them * * This function can be used to halt all other CPUs on crash * or emergency reboot time. The function passed as parameter @@ -810,7 +810,7 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) unsigned long msecs; local_irq_disable(); - /* Make a note of crashing cpu. Will be used in NMI callback.*/ + /* Make a note of crashing cpu. Will be used in NMI callback. */ crashing_cpu = safe_smp_processor_id(); shootdown_callback = callback; @@ -819,8 +819,9 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) /* Would it be better to replace the trap vector here? */ if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback, NMI_FLAG_FIRST, "crash")) - return; /* return what? */ - /* Ensure the new callback function is set before sending + return; /* Return what? */ + /* + * Ensure the new callback function is set before sending * out the NMI */ wmb(); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 1a2901562059..366c688d619e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -34,7 +34,6 @@ #include <linux/memblock.h> #include <linux/seq_file.h> #include <linux/console.h> -#include <linux/mca.h> #include <linux/root_dev.h> #include <linux/highmem.h> #include <linux/module.h> @@ -179,12 +178,6 @@ struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1}; /* common cpu data for all cpus */ struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1}; EXPORT_SYMBOL(boot_cpu_data); -static void set_mca_bus(int x) -{ -#ifdef CONFIG_MCA - MCA_bus = x; -#endif -} unsigned int def_to_bigsmp; @@ -393,10 +386,9 @@ static void __init reserve_initrd(void) initrd_start = 0; if (ramdisk_size >= (end_of_lowmem>>1)) { - memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); - printk(KERN_ERR "initrd too large to handle, " - "disabling initrd\n"); - return; + panic("initrd too large to handle, " + "disabling initrd (%lld needed, %lld available)\n", + ramdisk_size, end_of_lowmem>>1); } printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image, @@ -717,7 +709,6 @@ void __init setup_arch(char **cmdline_p) apm_info.bios = boot_params.apm_bios_info; ist_info = boot_params.ist_info; if (boot_params.sys_desc_table.length != 0) { - set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2); machine_id = boot_params.sys_desc_table.table[0]; machine_submodel_id = boot_params.sys_desc_table.table[1]; BIOS_revision = boot_params.sys_desc_table.table[2]; @@ -1012,7 +1003,8 @@ void __init setup_arch(char **cmdline_p) init_cpu_to_node(); init_apic_mappings(); - ioapic_and_gsi_init(); + if (x86_io_apic_ops.init) + x86_io_apic_ops.init(); kvm_guest_init(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 71f4727da373..5a98aa272184 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -185,10 +185,22 @@ void __init setup_per_cpu_areas(void) #endif rc = -EINVAL; if (pcpu_chosen_fc != PCPU_FC_PAGE) { - const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE; const size_t dyn_size = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE; + size_t atom_size; + /* + * On 64bit, use PMD_SIZE for atom_size so that embedded + * percpu areas are aligned to PMD. This, in the future, + * can also allow using PMD mappings in vmalloc area. Use + * PAGE_SIZE on 32bit as vmalloc space is highly contended + * and large vmalloc area allocs can easily fail. + */ +#ifdef CONFIG_X86_64 + atom_size = PMD_SIZE; +#else + atom_size = PAGE_SIZE; +#endif rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, atom_size, pcpu_cpu_distance, diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 041af2fd088d..965dfda0fd5e 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -479,18 +479,8 @@ asmlinkage int sys_sigsuspend(int history0, int history1, old_sigset_t mask) { sigset_t blocked; - - current->saved_sigmask = current->blocked; - - mask &= _BLOCKABLE; siginitset(&blocked, mask); - set_current_blocked(&blocked); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - - set_restore_sigmask(); - return -ERESTARTNOHAND; + return sigsuspend(&blocked); } asmlinkage int diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 66c74f481cab..48d2b7ded422 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -109,6 +109,9 @@ * about nothing of note with C stepping upwards. */ +static atomic_t stopping_cpu = ATOMIC_INIT(-1); +static bool smp_no_nmi_ipi = false; + /* * this function sends a 'reschedule' IPI to another CPU. * it goes straight through and wastes no time serializing @@ -149,8 +152,6 @@ void native_send_call_func_ipi(const struct cpumask *mask) free_cpumask_var(allbutself); } -static atomic_t stopping_cpu = ATOMIC_INIT(-1); - static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) { /* We are registered on stopping cpu too, avoid spurious NMI */ @@ -162,7 +163,19 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) return NMI_HANDLED; } -static void native_nmi_stop_other_cpus(int wait) +/* + * this function calls the 'stop' function on all other CPUs in the system. + */ + +asmlinkage void smp_reboot_interrupt(void) +{ + ack_APIC_irq(); + irq_enter(); + stop_this_cpu(NULL); + irq_exit(); +} + +static void native_stop_other_cpus(int wait) { unsigned long flags; unsigned long timeout; @@ -174,20 +187,25 @@ static void native_nmi_stop_other_cpus(int wait) * Use an own vector here because smp_call_function * does lots of things not suitable in a panic situation. */ + + /* + * We start by using the REBOOT_VECTOR irq. + * The irq is treated as a sync point to allow critical + * regions of code on other cpus to release their spin locks + * and re-enable irqs. Jumping straight to an NMI might + * accidentally cause deadlocks with further shutdown/panic + * code. By syncing, we give the cpus up to one second to + * finish their work before we force them off with the NMI. + */ if (num_online_cpus() > 1) { /* did someone beat us here? */ if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1) return; - if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, - NMI_FLAG_FIRST, "smp_stop")) - /* Note: we ignore failures here */ - return; - - /* sync above data before sending NMI */ + /* sync above data before sending IRQ */ wmb(); - apic->send_IPI_allbutself(NMI_VECTOR); + apic->send_IPI_allbutself(REBOOT_VECTOR); /* * Don't wait longer than a second if the caller @@ -197,63 +215,37 @@ static void native_nmi_stop_other_cpus(int wait) while (num_online_cpus() > 1 && (wait || timeout--)) udelay(1); } + + /* if the REBOOT_VECTOR didn't work, try with the NMI */ + if ((num_online_cpus() > 1) && (!smp_no_nmi_ipi)) { + if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, + NMI_FLAG_FIRST, "smp_stop")) + /* Note: we ignore failures here */ + /* Hope the REBOOT_IRQ is good enough */ + goto finish; - local_irq_save(flags); - disable_local_APIC(); - local_irq_restore(flags); -} - -/* - * this function calls the 'stop' function on all other CPUs in the system. - */ - -asmlinkage void smp_reboot_interrupt(void) -{ - ack_APIC_irq(); - irq_enter(); - stop_this_cpu(NULL); - irq_exit(); -} - -static void native_irq_stop_other_cpus(int wait) -{ - unsigned long flags; - unsigned long timeout; + /* sync above data before sending IRQ */ + wmb(); - if (reboot_force) - return; + pr_emerg("Shutting down cpus with NMI\n"); - /* - * Use an own vector here because smp_call_function - * does lots of things not suitable in a panic situation. - * On most systems we could also use an NMI here, - * but there are a few systems around where NMI - * is problematic so stay with an non NMI for now - * (this implies we cannot stop CPUs spinning with irq off - * currently) - */ - if (num_online_cpus() > 1) { - apic->send_IPI_allbutself(REBOOT_VECTOR); + apic->send_IPI_allbutself(NMI_VECTOR); /* - * Don't wait longer than a second if the caller + * Don't wait longer than a 10 ms if the caller * didn't ask us to wait. */ - timeout = USEC_PER_SEC; + timeout = USEC_PER_MSEC * 10; while (num_online_cpus() > 1 && (wait || timeout--)) udelay(1); } +finish: local_irq_save(flags); disable_local_APIC(); local_irq_restore(flags); } -static void native_smp_disable_nmi_ipi(void) -{ - smp_ops.stop_other_cpus = native_irq_stop_other_cpus; -} - /* * Reschedule call back. */ @@ -287,8 +279,8 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) static int __init nonmi_ipi_setup(char *str) { - native_smp_disable_nmi_ipi(); - return 1; + smp_no_nmi_ipi = true; + return 1; } __setup("nonmi_ipi", nonmi_ipi_setup); @@ -298,7 +290,7 @@ struct smp_ops smp_ops = { .smp_prepare_cpus = native_smp_prepare_cpus, .smp_cpus_done = native_smp_cpus_done, - .stop_other_cpus = native_nmi_stop_other_cpus, + .stop_other_cpus = native_stop_other_cpus, .smp_send_reschedule = native_smp_send_reschedule, .cpu_up = native_cpu_up, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 6e1e406038c2..433529e29be4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -76,20 +76,8 @@ /* State of each CPU */ DEFINE_PER_CPU(int, cpu_state) = { 0 }; -/* Store all idle threads, this can be reused instead of creating -* a new thread. Also avoids complicated thread destroy functionality -* for idle threads. -*/ #ifdef CONFIG_HOTPLUG_CPU /* - * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is - * removed after init for !CONFIG_HOTPLUG_CPU. - */ -static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); -#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) -#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p)) - -/* * We need this for trampoline_base protection from concurrent accesses when * off- and onlining cores wildly. */ @@ -97,20 +85,16 @@ static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex); void cpu_hotplug_driver_lock(void) { - mutex_lock(&x86_cpu_hotplug_driver_mutex); + mutex_lock(&x86_cpu_hotplug_driver_mutex); } void cpu_hotplug_driver_unlock(void) { - mutex_unlock(&x86_cpu_hotplug_driver_mutex); + mutex_unlock(&x86_cpu_hotplug_driver_mutex); } ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; } ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; } -#else -static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; -#define get_idle_for_cpu(x) (idle_thread_array[(x)]) -#define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p)) #endif /* Number of siblings per CPU package */ @@ -315,59 +299,90 @@ void __cpuinit smp_store_cpu_info(int id) identify_secondary_cpu(c); } -static void __cpuinit link_thread_siblings(int cpu1, int cpu2) +static bool __cpuinit +topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name) { - cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); - cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1)); - cpumask_set_cpu(cpu1, cpu_core_mask(cpu2)); - cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); - cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2)); - cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1)); + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2), + "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! " + "[node: %d != %d]. Ignoring dependency.\n", + cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2)); } +#define link_mask(_m, c1, c2) \ +do { \ + cpumask_set_cpu((c1), cpu_##_m##_mask(c2)); \ + cpumask_set_cpu((c2), cpu_##_m##_mask(c1)); \ +} while (0) + +static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + if (cpu_has(c, X86_FEATURE_TOPOEXT)) { + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + if (c->phys_proc_id == o->phys_proc_id && + per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) && + c->compute_unit_id == o->compute_unit_id) + return topology_sane(c, o, "smt"); + + } else if (c->phys_proc_id == o->phys_proc_id && + c->cpu_core_id == o->cpu_core_id) { + return topology_sane(c, o, "smt"); + } + + return false; +} + +static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID && + per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) + return topology_sane(c, o, "llc"); + + return false; +} + +static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + if (c->phys_proc_id == o->phys_proc_id) + return topology_sane(c, o, "mc"); + + return false; +} void __cpuinit set_cpu_sibling_map(int cpu) { - int i; + bool has_mc = boot_cpu_data.x86_max_cores > 1; + bool has_smt = smp_num_siblings > 1; struct cpuinfo_x86 *c = &cpu_data(cpu); + struct cpuinfo_x86 *o; + int i; cpumask_set_cpu(cpu, cpu_sibling_setup_mask); - if (smp_num_siblings > 1) { - for_each_cpu(i, cpu_sibling_setup_mask) { - struct cpuinfo_x86 *o = &cpu_data(i); - - if (cpu_has(c, X86_FEATURE_TOPOEXT)) { - if (c->phys_proc_id == o->phys_proc_id && - per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) && - c->compute_unit_id == o->compute_unit_id) - link_thread_siblings(cpu, i); - } else if (c->phys_proc_id == o->phys_proc_id && - c->cpu_core_id == o->cpu_core_id) { - link_thread_siblings(cpu, i); - } - } - } else { + if (!has_smt && !has_mc) { cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); - } - - cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); - - if (__this_cpu_read(cpu_info.x86_max_cores) == 1) { - cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu)); + cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); + cpumask_set_cpu(cpu, cpu_core_mask(cpu)); c->booted_cores = 1; return; } for_each_cpu(i, cpu_sibling_setup_mask) { - if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && - per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { - cpumask_set_cpu(i, cpu_llc_shared_mask(cpu)); - cpumask_set_cpu(cpu, cpu_llc_shared_mask(i)); - } - if (c->phys_proc_id == cpu_data(i).phys_proc_id) { - cpumask_set_cpu(i, cpu_core_mask(cpu)); - cpumask_set_cpu(cpu, cpu_core_mask(i)); + o = &cpu_data(i); + + if ((i == cpu) || (has_smt && match_smt(c, o))) + link_mask(sibling, cpu, i); + + if ((i == cpu) || (has_mc && match_llc(c, o))) + link_mask(llc_shared, cpu, i); + + if ((i == cpu) || (has_mc && match_mc(c, o))) { + link_mask(core, cpu, i); + /* * Does this new cpu bringup a new core? */ @@ -398,8 +413,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu) * For perf, we return last level cache shared map. * And for power savings, we return cpu_core_map */ - if ((sched_mc_power_savings || sched_smt_power_savings) && - !(cpu_has(c, X86_FEATURE_AMD_DCM))) + if (!(cpu_has(c, X86_FEATURE_AMD_DCM))) return cpu_core_mask(cpu); else return cpu_llc_shared_mask(cpu); @@ -618,22 +632,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) return (send_status | accept_status); } -struct create_idle { - struct work_struct work; - struct task_struct *idle; - struct completion done; - int cpu; -}; - -static void __cpuinit do_fork_idle(struct work_struct *work) -{ - struct create_idle *c_idle = - container_of(work, struct create_idle, work); - - c_idle->idle = fork_idle(c_idle->cpu); - complete(&c_idle->done); -} - /* reduce the number of lines printed when booting a large cpu count system */ static void __cpuinit announce_cpu(int cpu, int apicid) { @@ -660,58 +658,31 @@ static void __cpuinit announce_cpu(int cpu, int apicid) * Returns zero if CPU booted OK, else error code from * ->wakeup_secondary_cpu. */ -static int __cpuinit do_boot_cpu(int apicid, int cpu) +static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) { unsigned long boot_error = 0; unsigned long start_ip; int timeout; - struct create_idle c_idle = { - .cpu = cpu, - .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), - }; - - INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle); alternatives_smp_switch(1); - c_idle.idle = get_idle_for_cpu(cpu); - - /* - * We can't use kernel_thread since we must avoid to - * reschedule the child. - */ - if (c_idle.idle) { - c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *) - (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1); - init_idle(c_idle.idle, cpu); - goto do_rest; - } - - schedule_work(&c_idle.work); - wait_for_completion(&c_idle.done); + idle->thread.sp = (unsigned long) (((struct pt_regs *) + (THREAD_SIZE + task_stack_page(idle))) - 1); + per_cpu(current_task, cpu) = idle; - if (IS_ERR(c_idle.idle)) { - printk("failed fork for CPU %d\n", cpu); - destroy_work_on_stack(&c_idle.work); - return PTR_ERR(c_idle.idle); - } - - set_idle_for_cpu(cpu, c_idle.idle); -do_rest: - per_cpu(current_task, cpu) = c_idle.idle; #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ irq_ctx_init(cpu); #else - clear_tsk_thread_flag(c_idle.idle, TIF_FORK); + clear_tsk_thread_flag(idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); per_cpu(kernel_stack, cpu) = - (unsigned long)task_stack_page(c_idle.idle) - + (unsigned long)task_stack_page(idle) - KERNEL_STACK_OFFSET + THREAD_SIZE; #endif early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); initial_code = (unsigned long)start_secondary; - stack_start = c_idle.idle->thread.sp; + stack_start = idle->thread.sp; /* start_ip had better be page-aligned! */ start_ip = trampoline_address(); @@ -813,12 +784,10 @@ do_rest: */ smpboot_restore_warm_reset_vector(); } - - destroy_work_on_stack(&c_idle.work); return boot_error; } -int __cpuinit native_cpu_up(unsigned int cpu) +int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle) { int apicid = apic->cpu_present_to_apicid(cpu); unsigned long flags; @@ -851,7 +820,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; - err = do_boot_cpu(apicid, cpu); + err = do_boot_cpu(apicid, cpu, tidle); if (err) { pr_debug("do_boot_cpu failed %d\n", err); return -EIO; diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c index c29e235792af..b79133abda48 100644 --- a/arch/x86/kernel/test_rodata.c +++ b/arch/x86/kernel/test_rodata.c @@ -12,6 +12,7 @@ #include <linux/module.h> #include <asm/cacheflush.h> #include <asm/sections.h> +#include <asm/asm.h> int rodata_test(void) { @@ -42,14 +43,7 @@ int rodata_test(void) ".section .fixup,\"ax\"\n" "2: jmp 1b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 16\n" -#ifdef CONFIG_X86_32 - " .long 0b,2b\n" -#else - " .quad 0b,2b\n" -#endif - ".previous" + _ASM_EXTABLE(0b,2b) : [rslt] "=r" (result) : [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL) ); diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index c6eba2b42673..24d3c91e9812 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -14,7 +14,6 @@ #include <linux/i8253.h> #include <linux/time.h> #include <linux/export.h> -#include <linux/mca.h> #include <asm/vsyscall.h> #include <asm/x86_init.h> @@ -58,11 +57,6 @@ EXPORT_SYMBOL(profile_pc); static irqreturn_t timer_interrupt(int irq, void *dev_id) { global_clock_event->event_handler(global_clock_event); - - /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ - if (MCA_bus) - outb_p(inb_p(0x61)| 0x80, 0x61); - return IRQ_HANDLED; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 92d5756d85fc..ff08457a025d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -37,10 +37,6 @@ #include <linux/eisa.h> #endif -#ifdef CONFIG_MCA -#include <linux/mca.h> -#endif - #if defined(CONFIG_EDAC) #include <linux/edac.h> #endif diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index a1d804bcd483..8eeb55a551b4 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -15,6 +15,7 @@ #include <linux/init.h> #include <linux/pci_ids.h> #include <linux/pci_regs.h> +#include <linux/smp.h> #include <asm/apic.h> #include <asm/pci-direct.h> @@ -22,6 +23,8 @@ #include <asm/paravirt.h> #include <asm/setup.h> +#define TOPOLOGY_REGISTER_OFFSET 0x10 + #if defined CONFIG_PCI && defined CONFIG_PARAVIRT /* * Interrupt control on vSMPowered systems: @@ -149,12 +152,49 @@ int is_vsmp_box(void) return 0; } #endif + +static void __init vsmp_cap_cpus(void) +{ +#if !defined(CONFIG_X86_VSMP) && defined(CONFIG_SMP) + void __iomem *address; + unsigned int cfg, topology, node_shift, maxcpus; + + /* + * CONFIG_X86_VSMP is not configured, so limit the number CPUs to the + * ones present in the first board, unless explicitly overridden by + * setup_max_cpus + */ + if (setup_max_cpus != NR_CPUS) + return; + + /* Read the vSMP Foundation topology register */ + cfg = read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0); + address = early_ioremap(cfg + TOPOLOGY_REGISTER_OFFSET, 4); + if (WARN_ON(!address)) + return; + + topology = readl(address); + node_shift = (topology >> 16) & 0x7; + if (!node_shift) + /* The value 0 should be decoded as 8 */ + node_shift = 8; + maxcpus = (topology & ((1 << node_shift) - 1)) + 1; + + pr_info("vSMP CTL: Capping CPUs to %d (CONFIG_X86_VSMP is unset)\n", + maxcpus); + setup_max_cpus = maxcpus; + early_iounmap(address, 4); +#endif +} + void __init vsmp_init(void) { detect_vsmp_box(); if (!is_vsmp_box()) return; + vsmp_cap_cpus(); + set_vsmp_pv_ops(); return; } diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index e9f265fd79ae..35c5e543f550 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -18,6 +18,7 @@ #include <asm/e820.h> #include <asm/time.h> #include <asm/irq.h> +#include <asm/io_apic.h> #include <asm/pat.h> #include <asm/tsc.h> #include <asm/iommu.h> @@ -93,7 +94,6 @@ struct x86_init_ops x86_init __initdata = { struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { .early_percpu_clock_init = x86_init_noop, .setup_percpu_clockev = setup_secondary_APIC_clock, - .fixup_cpu_id = x86_default_fixup_cpu_id, }; static void default_nmi_init(void) { }; @@ -120,3 +120,10 @@ struct x86_msi_ops x86_msi = { .teardown_msi_irqs = default_teardown_msi_irqs, .restore_msi_irqs = default_restore_msi_irqs, }; + +struct x86_io_apic_ops x86_io_apic_ops = { + .init = native_io_apic_init_mappings, + .read = native_io_apic_read, + .write = native_io_apic_write, + .modify = native_io_apic_modify, +}; diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index e62728e30b01..bd18149b2b0f 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -48,8 +48,6 @@ void __sanitize_i387_state(struct task_struct *tsk) if (!fx) return; - BUG_ON(__thread_has_fpu(tsk)); - xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; /* |