Diffstat (limited to 'arch/x86/kernel/apic')
-rw-r--r--  arch/x86/kernel/apic/Makefile          |    8
-rw-r--r--  arch/x86/kernel/apic/apic.c            | 1412
-rw-r--r--  arch/x86/kernel/apic/apic_common.c     |   35
-rw-r--r--  arch/x86/kernel/apic/apic_flat_64.c    |  265
-rw-r--r--  arch/x86/kernel/apic/apic_noop.c       |  116
-rw-r--r--  arch/x86/kernel/apic/apic_numachip.c   |  108
-rw-r--r--  arch/x86/kernel/apic/bigsmp_32.c       |  216
-rw-r--r--  arch/x86/kernel/apic/hw_nmi.c          |    9
-rw-r--r--  arch/x86/kernel/apic/init.c            |  110
-rw-r--r--  arch/x86/kernel/apic/io_apic.c         | 1560
-rw-r--r--  arch/x86/kernel/apic/ipi.c             |  347
-rw-r--r--  arch/x86/kernel/apic/local.h           |   68
-rw-r--r--  arch/x86/kernel/apic/msi.c             |  522
-rw-r--r--  arch/x86/kernel/apic/probe_32.c        |  179
-rw-r--r--  arch/x86/kernel/apic/probe_64.c        |   45
-rw-r--r--  arch/x86/kernel/apic/vector.c          |  556
-rw-r--r--  arch/x86/kernel/apic/x2apic.h          |    9
-rw-r--r--  arch/x86/kernel/apic/x2apic_cluster.c  |  214
-rw-r--r--  arch/x86/kernel/apic/x2apic_phys.c     |  100
-rw-r--r--  arch/x86/kernel/apic/x2apic_savic.c    |  428
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c     | 1577
21 files changed, 3996 insertions(+), 3888 deletions(-)
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index a6fcaf16cdbf..581db89477f9 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -4,10 +4,10 @@
#
# Leads to non-deterministic coverage that is not a function of syscall inputs.
-# In particualr, smp_apic_timer_interrupt() is called in random places.
+# In particular, smp_apic_timer_interrupt() is called in random places.
KCOV_INSTRUMENT := n
-obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_common.o apic_noop.o ipi.o vector.o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_common.o apic_noop.o ipi.o vector.o init.o
obj-y += hw_nmi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
@@ -18,13 +18,11 @@ ifeq ($(CONFIG_X86_64),y)
# APIC probe will depend on the listing order here
obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o
obj-$(CONFIG_X86_UV) += x2apic_uv_x.o
+obj-$(CONFIG_AMD_SECURE_AVIC) += x2apic_savic.o
obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
obj-y += apic_flat_64.o
endif
-# APIC probe will depend on the listing order here
-obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
-
# For 32bit, probe_32 need to be listed last
obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b7bcdd781651..d93f87f29d03 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Local APIC handling, local APIC timers
*
@@ -18,6 +19,7 @@
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/acpi_pmtmr.h>
+#include <linux/bitmap.h>
#include <linux/clockchips.h>
#include <linux/interrupt.h>
#include <linux/memblock.h>
@@ -34,18 +36,23 @@
#include <linux/dmi.h>
#include <linux/smp.h>
#include <linux/mm.h>
+#include <linux/kvm_types.h>
+
+#include <xen/xen.h>
#include <asm/trace/irq_vectors.h>
#include <asm/irq_remapping.h>
+#include <asm/pc-conf-reg.h>
#include <asm/perf_event.h>
#include <asm/x86_init.h>
-#include <asm/pgalloc.h>
#include <linux/atomic.h>
+#include <asm/barrier.h>
#include <asm/mpspec.h>
#include <asm/i8259.h>
#include <asm/proto.h>
#include <asm/traps.h>
#include <asm/apic.h>
+#include <asm/acpi.h>
#include <asm/io_apic.h>
#include <asm/desc.h>
#include <asm/hpet.h>
@@ -53,67 +60,44 @@
#include <asm/time.h>
#include <asm/smp.h>
#include <asm/mce.h>
+#include <asm/msr.h>
#include <asm/tsc.h>
#include <asm/hypervisor.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include <asm/irq_regs.h>
+#include <asm/cpu.h>
-unsigned int num_processors;
-
-unsigned disabled_cpus;
+#include "local.h"
/* Processor that is doing the boot up */
-unsigned int boot_cpu_physical_apicid = -1U;
+u32 boot_cpu_physical_apicid __ro_after_init = BAD_APICID;
EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid);
-u8 boot_cpu_apic_version;
-
-/*
- * The highest APIC ID seen during enumeration.
- */
-static unsigned int max_physical_apicid;
-
-/*
- * Bitmask of physically existing CPUs:
- */
-physid_mask_t phys_cpu_present_map;
-
-/*
- * Processor to be disabled specified by kernel parameter
- * disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to
- * avoid undefined behaviour caused by sending INIT from AP to BSP.
- */
-static unsigned int disabled_cpu_apicid __read_mostly = BAD_APICID;
+u8 boot_cpu_apic_version __ro_after_init;
/*
* This variable controls which CPUs receive external NMIs. By default,
* external NMIs are delivered only to the BSP.
*/
-static int apic_extnmi = APIC_EXTNMI_BSP;
+static int apic_extnmi __ro_after_init = APIC_EXTNMI_BSP;
/*
- * Map cpu index to physical APIC ID
+ * Hypervisor supports 15 bits of APIC ID in MSI Extended Destination ID
*/
-DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID);
-DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid, BAD_APICID);
-DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, U32_MAX);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid);
+static bool virt_ext_dest_id __ro_after_init;
-#ifdef CONFIG_X86_32
+/* For parallel bootup. */
+unsigned long apic_mmio_base __ro_after_init;
-/*
- * On x86_32, the mapping between cpu and logical apicid may vary
- * depending on apic in use. The following early percpu variable is
- * used for the mapping. This is where the behaviors of x86_64 and 32
- * actually diverge. Let's keep it ugly for now.
- */
-DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID);
+static inline bool apic_accessible(void)
+{
+ return x2apic_mode || apic_mmio_base;
+}
+#ifdef CONFIG_X86_32
/* Local APIC was disabled by the BIOS and enabled by the kernel */
-static int enabled_via_apicbase;
+static int enabled_via_apicbase __ro_after_init;
/*
* Handle interrupt mode configuration register (IMCR).
@@ -125,18 +109,14 @@ static int enabled_via_apicbase;
*/
static inline void imcr_pic_to_apic(void)
{
- /* select IMCR register */
- outb(0x70, 0x22);
/* NMI and 8259 INTR go through APIC */
- outb(0x01, 0x23);
+ pc_conf_set(PC_CONF_MPS_IMCR, 0x01);
}
static inline void imcr_apic_to_pic(void)
{
- /* select IMCR register */
- outb(0x70, 0x22);
/* NMI and 8259 INTR go directly to BSP */
- outb(0x00, 0x23);
+ pc_conf_set(PC_CONF_MPS_IMCR, 0x00);
}
#endif
@@ -166,40 +146,39 @@ static __init int setup_apicpmtimer(char *s)
{
apic_calibrate_pmtmr = 1;
notsc_setup(NULL);
- return 0;
+ return 1;
}
__setup("apicpmtimer", setup_apicpmtimer);
#endif
-unsigned long mp_lapic_addr;
-int disable_apic;
+static unsigned long mp_lapic_addr __ro_after_init;
+bool apic_is_disabled __ro_after_init;
/* Disable local APIC timer from the kernel commandline or via dmi quirk */
static int disable_apic_timer __initdata;
/* Local APIC timer works in C2 */
-int local_apic_timer_c2_ok;
+int local_apic_timer_c2_ok __ro_after_init;
EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
/*
* Debug level, exported for io_apic.c
*/
-unsigned int apic_verbosity;
+int apic_verbosity __ro_after_init;
-int pic_mode;
+int pic_mode __ro_after_init;
/* Have we found an MP table */
-int smp_found_config;
+int smp_found_config __ro_after_init;
static struct resource lapic_resource = {
.name = "Local APIC",
.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
};
-unsigned int lapic_timer_frequency = 0;
+/* Measured in ticks per HZ. */
+unsigned int lapic_timer_period = 0;
static void apic_pm_activate(void);
-static unsigned long apic_phys;
-
/*
* Get the LAPIC version
*/
@@ -239,31 +218,7 @@ static int modern_apic(void)
*/
static void __init apic_disable(void)
{
- pr_info("APIC: switched to apic NOOP\n");
- apic = &apic_noop;
-}
-
-void native_apic_wait_icr_idle(void)
-{
- while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
- cpu_relax();
-}
-
-u32 native_safe_apic_wait_icr_idle(void)
-{
- u32 send_status;
- int timeout;
-
- timeout = 0;
- do {
- send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
- if (!send_status)
- break;
- inc_irq_stat(icr_read_retry_count);
- udelay(100);
- } while (timeout++ < 1000);
-
- return send_status;
+ apic_install_driver(&apic_noop);
}
void native_apic_icr_write(u32 low, u32 id)
@@ -271,7 +226,7 @@ void native_apic_icr_write(u32 low, u32 id)
unsigned long flags;
local_irq_save(flags);
- apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
+ apic_write(APIC_ICR2, SET_XAPIC_DEST_FIELD(id));
apic_write(APIC_ICR, low);
local_irq_restore(flags);
}
@@ -286,16 +241,6 @@ u64 native_apic_icr_read(void)
return icr1 | ((u64)icr2 << 32);
}
-#ifdef CONFIG_X86_32
-/**
- * get_physical_broadcast - Get number of physical broadcast IDs
- */
-int get_physical_broadcast(void)
-{
- return modern_apic() ? 0xff : 0xf;
-}
-#endif
-
/**
* lapic_get_maxlvt - get the maximum number of local vector table entries
*/
@@ -316,6 +261,9 @@ int lapic_get_maxlvt(void)
#define APIC_DIVISOR 16
#define TSC_DIVISOR 8
+/* i82489DX specific */
+#define I82489DX_BASE_DIVIDER (((0x2) << 18))
+
/*
* This function sets up the local APIC timer, with a timeout of
* 'clocks' APIC bus clock. During calibration we actually call
@@ -336,8 +284,14 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
else if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
lvtt_value |= APIC_LVT_TIMER_TSCDEADLINE;
+ /*
+ * The i82489DX APIC uses bit 18 and 19 for the base divider. This
+ * overlaps with bit 18 on integrated APICs, but is not documented
+ * in the SDM. No problem though. i82489DX equipped systems do not
+ * have TSC deadline timer.
+ */
if (!lapic_is_integrated())
- lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
+ lvtt_value |= I82489DX_BASE_DIVIDER;
if (!irqen)
lvtt_value |= APIC_LVT_MASKED;
@@ -351,8 +305,6 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
* According to Intel, MFENCE can do the serialization here.
*/
asm volatile("mfence" : : : "memory");
-
- printk_once(KERN_DEBUG "TSC deadline timer enabled\n");
return;
}
@@ -410,10 +362,9 @@ static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
if (vector && !eilvt_entry_is_changeable(vector, new))
/* may not change if vectors are different */
return rsvd;
- rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new);
- } while (rsvd != new);
+ } while (!atomic_try_cmpxchg(&eilvt_offsets[offset], &rsvd, new));
- rsvd &= ~APIC_EILVT_MASKED;
+ rsvd = new & ~APIC_EILVT_MASKED;
if (rsvd && rsvd != vector)
pr_info("LVT offset %d assigned for vector 0x%02x\n",
offset, rsvd);
@@ -473,8 +424,11 @@ static int lapic_next_deadline(unsigned long delta,
{
u64 tsc;
+ /* This MSR is special and need a special fence: */
+ weak_wrmsr_fence();
+
tsc = rdtsc();
- wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
+ wrmsrq(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
return 0;
}
@@ -489,7 +443,19 @@ static int lapic_timer_shutdown(struct clock_event_device *evt)
v = apic_read(APIC_LVTT);
v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
apic_write(APIC_LVTT, v);
- apic_write(APIC_TMICT, 0);
+
+ /*
+ * Setting APIC_LVT_MASKED (above) should be enough to tell
+ * the hardware that this timer will never fire. But AMD
+ * erratum 411 and some Intel CPU behavior circa 2024 say
+ * otherwise. Time for belt and suspenders programming: mask
+ * the timer _and_ zero the counter registers:
+ */
+ if (v & APIC_LVT_TIMER_TSCDEADLINE)
+ wrmsrq(MSR_IA32_TSC_DEADLINE, 0);
+ else
+ apic_write(APIC_TMICT, 0);
+
return 0;
}
@@ -500,7 +466,7 @@ lapic_timer_set_periodic_oneshot(struct clock_event_device *evt, bool oneshot)
if (evt->features & CLOCK_EVT_FEAT_DUMMY)
return 0;
- __setup_APIC_LVTT(lapic_timer_frequency, oneshot, 1);
+ __setup_APIC_LVTT(lapic_timer_period, oneshot, 1);
return 0;
}
@@ -520,7 +486,7 @@ static int lapic_timer_set_oneshot(struct clock_event_device *evt)
static void lapic_timer_broadcast(const struct cpumask *mask)
{
#ifdef CONFIG_SMP
- apic->send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
+ __apic_send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
#endif
}
@@ -545,97 +511,60 @@ static struct clock_event_device lapic_clockevent = {
};
static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
-#define DEADLINE_MODEL_MATCH_FUNC(model, func) \
- { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&func }
-
-#define DEADLINE_MODEL_MATCH_REV(model, rev) \
- { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)rev }
-
-static u32 hsx_deadline_rev(void)
-{
- switch (boot_cpu_data.x86_stepping) {
- case 0x02: return 0x3a; /* EP */
- case 0x04: return 0x0f; /* EX */
- }
-
- return ~0U;
-}
-
-static u32 bdx_deadline_rev(void)
-{
- switch (boot_cpu_data.x86_stepping) {
- case 0x02: return 0x00000011;
- case 0x03: return 0x0700000e;
- case 0x04: return 0x0f00000c;
- case 0x05: return 0x0e000003;
- }
-
- return ~0U;
-}
-
-static u32 skx_deadline_rev(void)
-{
- switch (boot_cpu_data.x86_stepping) {
- case 0x03: return 0x01000136;
- case 0x04: return 0x02000014;
- }
+static const struct x86_cpu_id deadline_match[] __initconst = {
+ X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 0x2, 0x2, 0x3a), /* EP */
+ X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 0x4, 0x4, 0x0f), /* EX */
- if (boot_cpu_data.x86_stepping > 4)
- return 0;
+ X86_MATCH_VFM(INTEL_BROADWELL_X, 0x0b000020),
- return ~0U;
-}
+ X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x2, 0x2, 0x00000011),
+ X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x3, 0x3, 0x0700000e),
+ X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x4, 0x4, 0x0f00000c),
+ X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x5, 0x5, 0x0e000003),
-static const struct x86_cpu_id deadline_match[] = {
- DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev),
- DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020),
- DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D, bdx_deadline_rev),
- DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_SKYLAKE_X, skx_deadline_rev),
+ X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x3, 0x3, 0x01000136),
+ X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x4, 0x4, 0x02000014),
+ X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x5, 0xf, 0),
- DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_CORE, 0x22),
- DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_ULT, 0x20),
- DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_GT3E, 0x17),
+ X86_MATCH_VFM(INTEL_HASWELL, 0x22),
+ X86_MATCH_VFM(INTEL_HASWELL_L, 0x20),
+ X86_MATCH_VFM(INTEL_HASWELL_G, 0x17),
- DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_CORE, 0x25),
- DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_GT3E, 0x17),
+ X86_MATCH_VFM(INTEL_BROADWELL, 0x25),
+ X86_MATCH_VFM(INTEL_BROADWELL_G, 0x17),
- DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_MOBILE, 0xb2),
- DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_DESKTOP, 0xb2),
+ X86_MATCH_VFM(INTEL_SKYLAKE_L, 0xb2),
+ X86_MATCH_VFM(INTEL_SKYLAKE, 0xb2),
- DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_MOBILE, 0x52),
- DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_DESKTOP, 0x52),
+ X86_MATCH_VFM(INTEL_KABYLAKE_L, 0x52),
+ X86_MATCH_VFM(INTEL_KABYLAKE, 0x52),
{},
};
-static void apic_check_deadline_errata(void)
+static __init bool apic_validate_deadline_timer(void)
{
const struct x86_cpu_id *m;
u32 rev;
- if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) ||
- boot_cpu_has(X86_FEATURE_HYPERVISOR))
- return;
+ if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ return false;
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return true;
m = x86_match_cpu(deadline_match);
if (!m)
- return;
+ return true;
- /*
- * Function pointers will have the MSB set due to address layout,
- * immediate revisions will not.
- */
- if ((long)m->driver_data < 0)
- rev = ((u32 (*)(void))(m->driver_data))();
- else
- rev = (u32)m->driver_data;
+ rev = (u32)m->driver_data;
if (boot_cpu_data.microcode >= rev)
- return;
+ return true;
setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
pr_err(FW_BUG "TSC_DEADLINE disabled due to Errata; "
"please update microcode to version: 0x%x (or later)\n", rev);
+ return false;
}
/*
@@ -648,7 +577,7 @@ static void setup_APIC_timer(void)
if (this_cpu_has(X86_FEATURE_ARAT)) {
lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
- /* Make LAPIC timer preferrable over percpu HPET */
+ /* Make LAPIC timer preferable over percpu HPET */
lapic_clockevent.rating = 150;
}
@@ -665,6 +594,8 @@ static void setup_APIC_timer(void)
0xF, ~0UL);
} else
clockevents_register_device(levt);
+
+ apic_update_vector(smp_processor_id(), LOCAL_TIMER_VECTOR, true);
}
/*
@@ -695,7 +626,7 @@ void lapic_update_tsc_freq(void)
* In this functions we calibrate APIC bus clocks to the external timer.
*
* We want to do the calibration only once since we want to have local timer
- * irqs syncron. CPUs connected by the same APIC bus have the very same bus
+ * irqs synchronous. CPUs connected by the same APIC bus have the very same bus
* frequency.
*
* This was previously done by reading the PIT/HPET and waiting for a wrap
@@ -717,17 +648,17 @@ void lapic_update_tsc_freq(void)
static __initdata int lapic_cal_loops = -1;
static __initdata long lapic_cal_t1, lapic_cal_t2;
static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2;
-static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2;
+static __initdata u32 lapic_cal_pm1, lapic_cal_pm2;
static __initdata unsigned long lapic_cal_j1, lapic_cal_j2;
/*
- * Temporary interrupt handler.
+ * Temporary interrupt handler and polled calibration function.
*/
static void __init lapic_cal_handler(struct clock_event_device *dev)
{
unsigned long long tsc = 0;
long tapic = apic_read(APIC_TMCCT);
- unsigned long pm = acpi_pm_read_early();
+ u32 pm = acpi_pm_read_early();
if (boot_cpu_has(X86_FEATURE_TSC))
tsc = rdtsc();
@@ -752,7 +683,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
}
static int __init
-calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
+calibrate_by_pmtimer(u32 deltapm, long *delta, long *deltatsc)
{
const long pm_100ms = PMTMR_TICKS_PER_SEC / 10;
const long pm_thresh = pm_100ms / 100;
@@ -763,7 +694,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
return -1;
#endif
- apic_printk(APIC_VERBOSE, "... PM-Timer delta = %ld\n", deltapm);
+ apic_pr_verbose("... PM-Timer delta = %u\n", deltapm);
/* Check, if the PM timer is available */
if (!deltapm)
@@ -773,14 +704,14 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
if (deltapm > (pm_100ms - pm_thresh) &&
deltapm < (pm_100ms + pm_thresh)) {
- apic_printk(APIC_VERBOSE, "... PM-Timer result ok\n");
+ apic_pr_verbose("... PM-Timer result ok\n");
return 0;
}
res = (((u64)deltapm) * mult) >> 22;
do_div(res, 1000000);
- pr_warning("APIC calibration not consistent "
- "with PM-Timer: %ldms instead of 100ms\n",(long)res);
+ pr_warn("APIC calibration not consistent with PM-Timer: %ldms instead of 100ms\n",
+ (long)res);
/* Correct the lapic counter value */
res = (((u64)(*delta)) * pm_100ms);
@@ -793,75 +724,164 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
if (boot_cpu_has(X86_FEATURE_TSC)) {
res = (((u64)(*deltatsc)) * pm_100ms);
do_div(res, deltapm);
- apic_printk(APIC_VERBOSE, "TSC delta adjusted to "
- "PM-Timer: %lu (%ld)\n",
- (unsigned long)res, *deltatsc);
+ apic_pr_verbose("TSC delta adjusted to PM-Timer: %lu (%ld)\n",
+ (unsigned long)res, *deltatsc);
*deltatsc = (long)res;
}
return 0;
}
+static int __init lapic_init_clockevent(void)
+{
+ if (!lapic_timer_period)
+ return -1;
+
+ /* Calculate the scaled math multiplication factor */
+ lapic_clockevent.mult = div_sc(lapic_timer_period/APIC_DIVISOR,
+ TICK_NSEC, lapic_clockevent.shift);
+ lapic_clockevent.max_delta_ns =
+ clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent);
+ lapic_clockevent.max_delta_ticks = 0x7FFFFFFF;
+ lapic_clockevent.min_delta_ns =
+ clockevent_delta2ns(0xF, &lapic_clockevent);
+ lapic_clockevent.min_delta_ticks = 0xF;
+
+ return 0;
+}
+
+bool __init apic_needs_pit(void)
+{
+ /*
+ * If the frequencies are not known, PIT is required for both TSC
+ * and apic timer calibration.
+ */
+ if (!tsc_khz || !cpu_khz)
+ return true;
+
+ /* Is there an APIC at all or is it disabled? */
+ if (!boot_cpu_has(X86_FEATURE_APIC) || apic_is_disabled)
+ return true;
+
+ /*
+ * If interrupt delivery mode is legacy PIC or virtual wire without
+ * configuration, the local APIC timer won't be set up. Make sure
+ * that the PIT is initialized.
+ */
+ if (apic_intr_mode == APIC_PIC ||
+ apic_intr_mode == APIC_VIRTUAL_WIRE_NO_CONFIG)
+ return true;
+
+ /* Virt guests may lack ARAT, but still have DEADLINE */
+ if (!boot_cpu_has(X86_FEATURE_ARAT))
+ return true;
+
+ /* Deadline timer is based on TSC so no further PIT action required */
+ if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ return false;
+
+ /* APIC timer disabled? */
+ if (disable_apic_timer)
+ return true;
+ /*
+ * The APIC timer frequency is known already, no PIT calibration
+ * required. If unknown, let the PIT be initialized.
+ */
+ return lapic_timer_period == 0;
+}
+
static int __init calibrate_APIC_clock(void)
{
struct clock_event_device *levt = this_cpu_ptr(&lapic_events);
- void (*real_handler)(struct clock_event_device *dev);
+ u64 tsc_perj = 0, tsc_start = 0;
+ long delta_tsc_khz, bus_khz;
+ unsigned long jif_start;
unsigned long deltaj;
long delta, deltatsc;
int pm_referenced = 0;
- /**
- * check if lapic timer has already been calibrated by platform
- * specific routine, such as tsc calibration code. if so, we just fill
+ if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ return 0;
+
+ /*
+ * Check if lapic timer has already been calibrated by platform
+ * specific routine, such as tsc calibration code. If so just fill
* in the clockevent structure and return.
*/
-
- if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) {
- return 0;
- } else if (lapic_timer_frequency) {
- apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n",
- lapic_timer_frequency);
- lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR,
- TICK_NSEC, lapic_clockevent.shift);
- lapic_clockevent.max_delta_ns =
- clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
- lapic_clockevent.max_delta_ticks = 0x7FFFFF;
- lapic_clockevent.min_delta_ns =
- clockevent_delta2ns(0xF, &lapic_clockevent);
- lapic_clockevent.min_delta_ticks = 0xF;
+ if (!lapic_init_clockevent()) {
+ apic_pr_verbose("lapic timer already calibrated %d\n", lapic_timer_period);
+ /*
+ * Direct calibration methods must have an always running
+ * local APIC timer, no need for broadcast timer.
+ */
lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
return 0;
}
- apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
- "calibrating APIC timer ...\n");
+ apic_pr_verbose("Using local APIC timer interrupts. Calibrating APIC timer ...\n");
+ /*
+ * There are platforms w/o global clockevent devices. Instead of
+ * making the calibration conditional on that, use a polling based
+ * approach everywhere.
+ */
local_irq_disable();
- /* Replace the global interrupt handler */
- real_handler = global_clock_event->event_handler;
- global_clock_event->event_handler = lapic_cal_handler;
-
/*
* Setup the APIC counter to maximum. There is no way the lapic
* can underflow in the 100ms detection time frame
*/
__setup_APIC_LVTT(0xffffffff, 0, 0);
- /* Let the interrupts run */
+ /*
+ * Methods to terminate the calibration loop:
+ * 1) Global clockevent if available (jiffies)
+ * 2) TSC if available and frequency is known
+ */
+ jif_start = READ_ONCE(jiffies);
+
+ if (tsc_khz) {
+ tsc_start = rdtsc();
+ tsc_perj = div_u64((u64)tsc_khz * 1000, HZ);
+ }
+
+ /*
+ * Enable interrupts so the tick can fire, if a global
+ * clockevent device is available
+ */
local_irq_enable();
- while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
- cpu_relax();
+ while (lapic_cal_loops <= LAPIC_CAL_LOOPS) {
+ /* Wait for a tick to elapse */
+ while (1) {
+ if (tsc_khz) {
+ u64 tsc_now = rdtsc();
+ if ((tsc_now - tsc_start) >= tsc_perj) {
+ tsc_start += tsc_perj;
+ break;
+ }
+ } else {
+ unsigned long jif_now = READ_ONCE(jiffies);
- local_irq_disable();
+ if (time_after(jif_now, jif_start)) {
+ jif_start = jif_now;
+ break;
+ }
+ }
+ cpu_relax();
+ }
- /* Restore the real event handler */
- global_clock_event->event_handler = real_handler;
+ /* Invoke the calibration routine */
+ local_irq_disable();
+ lapic_cal_handler(NULL);
+ local_irq_enable();
+ }
+
+ local_irq_disable();
/* Build delta t1-t2 as apic timer counts down */
delta = lapic_cal_t1 - lapic_cal_t2;
- apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
+ apic_pr_verbose("... lapic delta = %ld\n", delta);
deltatsc = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
@@ -869,52 +889,42 @@ static int __init calibrate_APIC_clock(void)
pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1,
&delta, &deltatsc);
- /* Calculate the scaled math multiplication factor */
- lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
- lapic_clockevent.shift);
- lapic_clockevent.max_delta_ns =
- clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent);
- lapic_clockevent.max_delta_ticks = 0x7FFFFFFF;
- lapic_clockevent.min_delta_ns =
- clockevent_delta2ns(0xF, &lapic_clockevent);
- lapic_clockevent.min_delta_ticks = 0xF;
-
- lapic_timer_frequency = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
+ lapic_timer_period = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
+ lapic_init_clockevent();
- apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
- apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult);
- apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
- lapic_timer_frequency);
+ apic_pr_verbose("..... delta %ld\n", delta);
+ apic_pr_verbose("..... mult: %u\n", lapic_clockevent.mult);
+ apic_pr_verbose("..... calibration result: %u\n", lapic_timer_period);
if (boot_cpu_has(X86_FEATURE_TSC)) {
- apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
- "%ld.%04ld MHz.\n",
- (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ),
- (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ));
+ delta_tsc_khz = (deltatsc * HZ) / (1000 * LAPIC_CAL_LOOPS);
+
+ apic_pr_verbose("..... CPU clock speed is %ld.%03ld MHz.\n",
+ delta_tsc_khz / 1000, delta_tsc_khz % 1000);
}
- apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
- "%u.%04u MHz.\n",
- lapic_timer_frequency / (1000000 / HZ),
- lapic_timer_frequency % (1000000 / HZ));
+ bus_khz = (long)lapic_timer_period * HZ / 1000;
+ apic_pr_verbose("..... host bus clock speed is %ld.%03ld MHz.\n",
+ bus_khz / 1000, bus_khz % 1000);
/*
* Do a sanity check on the APIC calibration result
*/
- if (lapic_timer_frequency < (1000000 / HZ)) {
+ if (lapic_timer_period < (1000000 / HZ)) {
local_irq_enable();
- pr_warning("APIC frequency too slow, disabling apic timer\n");
+ pr_warn("APIC frequency too slow, disabling apic timer\n");
return -1;
}
levt->features &= ~CLOCK_EVT_FEAT_DUMMY;
/*
- * PM timer calibration failed or not turned on
- * so lets try APIC timer based calibration
+ * PM timer calibration failed or not turned on so lets try APIC
+ * timer based calibration, if a global clockevent device is
+ * available.
*/
- if (!pm_referenced) {
- apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
+ if (!pm_referenced && global_clock_event) {
+ apic_pr_verbose("... verify APIC timer\n");
/*
* Setup the apic timer manually
@@ -935,18 +945,18 @@ static int __init calibrate_APIC_clock(void)
/* Jiffies delta */
deltaj = lapic_cal_j2 - lapic_cal_j1;
- apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj);
+ apic_pr_verbose("... jiffies delta = %lu\n", deltaj);
/* Check, if the jiffies result is consistent */
if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
- apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
+ apic_pr_verbose("... jiffies result ok\n");
else
levt->features |= CLOCK_EVT_FEAT_DUMMY;
}
local_irq_enable();
if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
- pr_warning("APIC timer disabled due to verification failure\n");
+ pr_warn("APIC timer disabled due to verification failure\n");
return -1;
}
@@ -1020,8 +1030,8 @@ static void local_apic_timer_interrupt(void)
* spurious.
*/
if (!evt->event_handler) {
- pr_warning("Spurious LAPIC timer interrupt on cpu %d\n",
- smp_processor_id());
+ pr_warn("Spurious LAPIC timer interrupt on cpu %d\n",
+ smp_processor_id());
/* Switch it off */
lapic_timer_shutdown(evt);
return;
@@ -1043,32 +1053,18 @@ static void local_apic_timer_interrupt(void)
* [ if a single-CPU system runs an SMP kernel then we call the local
* interrupt as well. Thus we cannot inline the local irq ... ]
*/
-__visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_apic_timer_interrupt)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- /*
- * NOTE! We'd better ACK the irq immediately,
- * because timer handling can be slow.
- *
- * update_process_times() expects us to have done irq_enter().
- * Besides, if we don't timer interrupts ignore the global
- * interrupt lock, which is the WrongThing (tm) to do.
- */
- entering_ack_irq();
+ apic_eoi();
trace_local_timer_entry(LOCAL_TIMER_VECTOR);
local_apic_timer_interrupt();
trace_local_timer_exit(LOCAL_TIMER_VECTOR);
- exiting_irq();
set_irq_regs(old_regs);
}
-int setup_profiling_timer(unsigned int multiplier)
-{
- return -EINVAL;
-}
-
/*
* Local APIC start and shutdown
*/
@@ -1085,8 +1081,7 @@ void clear_local_APIC(void)
int maxlvt;
u32 v;
- /* APIC hasn't been mapped yet */
- if (!x2apic_mode && !apic_phys)
+ if (!apic_accessible())
return;
maxlvt = lapic_get_maxlvt();
@@ -1149,25 +1144,40 @@ void clear_local_APIC(void)
}
/**
- * disable_local_APIC - clear and disable the local APIC
+ * apic_soft_disable - Clears and software disables the local APIC on hotplug
+ *
+ * Contrary to disable_local_APIC() this does not touch the enable bit in
+ * MSR_IA32_APICBASE. Clearing that bit on systems based on the 3 wire APIC
+ * bus would require a hardware reset as the APIC would lose track of bus
+ * arbitration. On systems with FSB delivery APICBASE could be disabled,
+ * but it has to be guaranteed that no interrupt is sent to the APIC while
+ * in that state and it's not clear from the SDM whether it still responds
+ * to INIT/SIPI messages. Stay on the safe side and use software disable.
*/
-void disable_local_APIC(void)
+void apic_soft_disable(void)
{
- unsigned int value;
-
- /* APIC hasn't been mapped yet */
- if (!x2apic_mode && !apic_phys)
- return;
+ u32 value;
clear_local_APIC();
- /*
- * Disable APIC (implies clearing of registers
- * for 82489DX!).
- */
+ /* Soft disable APIC (implies clearing of registers for 82489DX!). */
value = apic_read(APIC_SPIV);
value &= ~APIC_SPIV_APIC_ENABLED;
apic_write(APIC_SPIV, value);
+}
+
+/**
+ * disable_local_APIC - clear and disable the local APIC
+ */
+void disable_local_APIC(void)
+{
+ if (!apic_accessible())
+ return;
+
+ if (apic->teardown)
+ apic->teardown();
+
+ apic_soft_disable();
#ifdef CONFIG_X86_32
/*
@@ -1227,17 +1237,16 @@ void __init sync_Arb_IDs(void)
*/
apic_wait_icr_idle();
- apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
- apic_write(APIC_ICR, APIC_DEST_ALLINC |
- APIC_INT_LEVELTRIG | APIC_DM_INIT);
+ apic_pr_debug("Synchronizing Arb IDs.\n");
+ apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT);
}
-enum apic_intr_mode_id apic_intr_mode;
+enum apic_intr_mode_id apic_intr_mode __ro_after_init;
-static int __init apic_intr_mode_select(void)
+static int __init __apic_intr_mode_select(void)
{
/* Check kernel option */
- if (disable_apic) {
+ if (apic_is_disabled) {
pr_info("APIC disabled via kernel command line\n");
return APIC_PIC;
}
@@ -1246,7 +1255,7 @@ static int __init apic_intr_mode_select(void)
#ifdef CONFIG_X86_64
/* On 64-bit, the APIC must be integrated, Check local APIC only */
if (!boot_cpu_has(X86_FEATURE_APIC)) {
- disable_apic = 1;
+ apic_is_disabled = true;
pr_info("APIC disabled by BIOS\n");
return APIC_PIC;
}
@@ -1255,16 +1264,15 @@ static int __init apic_intr_mode_select(void)
/* Neither 82489DX nor integrated APIC ? */
if (!boot_cpu_has(X86_FEATURE_APIC) && !smp_found_config) {
- disable_apic = 1;
+ apic_is_disabled = true;
return APIC_PIC;
}
/* If the BIOS pretends there is an integrated APIC ? */
if (!boot_cpu_has(X86_FEATURE_APIC) &&
APIC_INTEGRATED(boot_cpu_apic_version)) {
- disable_apic = 1;
- pr_err(FW_BUG "Local APIC %d not detected, force emulation\n",
- boot_cpu_physical_apicid);
+ apic_is_disabled = true;
+ pr_err(FW_BUG "Local APIC not detected, force emulation\n");
return APIC_PIC;
}
#endif
@@ -1285,17 +1293,17 @@ static int __init apic_intr_mode_select(void)
pr_info("APIC: SMP mode deactivated\n");
return APIC_SYMMETRIC_IO_NO_ROUTING;
}
-
- if (read_apic_id() != boot_cpu_physical_apicid) {
- panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
- read_apic_id(), boot_cpu_physical_apicid);
- /* Or can we switch back to PIC here? */
- }
#endif
return APIC_SYMMETRIC_IO;
}
+/* Select the interrupt delivery mode for the BSP */
+void __init apic_intr_mode_select(void)
+{
+ apic_intr_mode = __apic_intr_mode_select();
+}
+
/*
* An initial setup of the virtual wire mode.
*/
@@ -1345,35 +1353,37 @@ void __init init_bsp_APIC(void)
apic_write(APIC_LVT1, value);
}
+static void __init apic_bsp_setup(bool upmode);
+
/* Init the interrupt delivery mode for the BSP */
void __init apic_intr_mode_init(void)
{
bool upmode = IS_ENABLED(CONFIG_UP_LATE_INIT);
- apic_intr_mode = apic_intr_mode_select();
-
switch (apic_intr_mode) {
case APIC_PIC:
pr_info("APIC: Keep in PIC mode(8259)\n");
return;
case APIC_VIRTUAL_WIRE:
pr_info("APIC: Switch to virtual wire mode setup\n");
- default_setup_apic_routing();
break;
case APIC_VIRTUAL_WIRE_NO_CONFIG:
pr_info("APIC: Switch to virtual wire mode setup with no configuration\n");
upmode = true;
- default_setup_apic_routing();
break;
case APIC_SYMMETRIC_IO:
pr_info("APIC: Switch to symmetric I/O mode setup\n");
- default_setup_apic_routing();
break;
case APIC_SYMMETRIC_IO_NO_ROUTING:
pr_info("APIC: Switch to symmetric I/O mode setup in no SMP routine\n");
break;
}
+ x86_64_probe_apic();
+
+ if (x86_platform.apic_post_init)
+ x86_platform.apic_post_init();
+
apic_bsp_setup(upmode);
}
@@ -1412,59 +1422,76 @@ static void lapic_setup_esr(void)
if (maxlvt > 3)
apic_write(APIC_ESR, 0);
value = apic_read(APIC_ESR);
- if (value != oldvalue)
- apic_printk(APIC_VERBOSE, "ESR value before enabling "
- "vector: 0x%08x after: 0x%08x\n",
- oldvalue, value);
+ if (value != oldvalue) {
+ apic_pr_verbose("ESR value before enabling vector: 0x%08x after: 0x%08x\n",
+ oldvalue, value);
+ }
}
-static void apic_pending_intr_clear(void)
+#define APIC_IR_REGS APIC_ISR_NR
+#define APIC_IR_BITS (APIC_IR_REGS * 32)
+#define APIC_IR_MAPSIZE (APIC_IR_BITS / BITS_PER_LONG)
+
+union apic_ir {
+ unsigned long map[APIC_IR_MAPSIZE];
+ u32 regs[APIC_IR_REGS];
+};
+
+static bool apic_check_and_eoi_isr(union apic_ir *isr)
{
- long long max_loops = cpu_khz ? cpu_khz : 1000000;
- unsigned long long tsc = 0, ntsc;
- unsigned int queued;
- unsigned long value;
- int i, j, acked = 0;
+ int i, bit;
+
+ /* Read the ISRs */
+ for (i = 0; i < APIC_IR_REGS; i++)
+ isr->regs[i] = apic_read(APIC_ISR + i * 0x10);
+
+ /* If the ISR map empty, nothing to do here. */
+ if (bitmap_empty(isr->map, APIC_IR_BITS))
+ return true;
- if (boot_cpu_has(X86_FEATURE_TSC))
- tsc = rdtsc();
/*
- * After a crash, we no longer service the interrupts and a pending
- * interrupt from previous kernel might still have ISR bit set.
- *
- * Most probably by now CPU has serviced that pending interrupt and
- * it might not have done the ack_APIC_irq() because it thought,
- * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it
- * does not clear the ISR bit and cpu thinks it has already serivced
- * the interrupt. Hence a vector might get locked. It was noticed
- * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
+ * There can be multiple ISR bits set when a high priority
+ * interrupt preempted a lower priority one. Issue an EOI for each
+ * set bit. The priority traversal order does not matter as there
+ * can't be new ISR bits raised at this point. What matters is that
+ * an EOI is issued for each ISR bit.
*/
- do {
- queued = 0;
- for (i = APIC_ISR_NR - 1; i >= 0; i--)
- queued |= apic_read(APIC_IRR + i*0x10);
-
- for (i = APIC_ISR_NR - 1; i >= 0; i--) {
- value = apic_read(APIC_ISR + i*0x10);
- for_each_set_bit(j, &value, 32) {
- ack_APIC_irq();
- acked++;
- }
- }
- if (acked > 256) {
- pr_err("LAPIC pending interrupts after %d EOI\n", acked);
- break;
- }
- if (queued) {
- if (boot_cpu_has(X86_FEATURE_TSC) && cpu_khz) {
- ntsc = rdtsc();
- max_loops = (cpu_khz << 10) - (ntsc - tsc);
- } else {
- max_loops--;
- }
- }
- } while (queued && max_loops > 0);
- WARN_ON(max_loops <= 0);
+ for_each_set_bit(bit, isr->map, APIC_IR_BITS)
+ apic_eoi();
+
+ /* Reread the ISRs, they should be empty now */
+ for (i = 0; i < APIC_IR_REGS; i++)
+ isr->regs[i] = apic_read(APIC_ISR + i * 0x10);
+
+ return bitmap_empty(isr->map, APIC_IR_BITS);
+}
+
+/*
+ * If a CPU services an interrupt and crashes before issuing EOI to the
+ * local APIC, the corresponding ISR bit is still set when the crashing CPU
+ * jumps into a crash kernel. Read the ISR and issue an EOI for each set
+ * bit to acknowledge it as otherwise these slots would be locked forever
+ * waiting for an EOI.
+ *
+ * If there are pending bits in the IRR, then they won't be converted into
+ * ISR bits as the CPU has interrupts disabled. They will be delivered once
+ * the CPU enables interrupts and there is nothing which can prevent that.
+ *
+ * In the worst case this results in spurious interrupt warnings.
+ */
+static void apic_clear_isr(void)
+{
+ union apic_ir ir;
+ unsigned int i;
+
+ if (!apic_check_and_eoi_isr(&ir))
+ pr_warn("APIC: Stale ISR: %256pb\n", ir.map);
+
+ for (i = 0; i < APIC_IR_REGS; i++)
+ ir.regs[i] = apic_read(APIC_IRR + i * 0x10);
+
+ if (!bitmap_empty(ir.map, APIC_IR_BITS))
+ pr_warn("APIC: Stale IRR: %256pb\n", ir.map);
}
/**
@@ -1477,16 +1504,23 @@ static void setup_local_APIC(void)
{
int cpu = smp_processor_id();
unsigned int value;
-#ifdef CONFIG_X86_32
- int logical_apicid, ldr_apicid;
-#endif
-
- if (disable_apic) {
+ if (apic_is_disabled) {
disable_ioapic_support();
return;
}
+ if (apic->setup)
+ apic->setup();
+
+ /*
+ * If this comes from kexec/kcrash the APIC might be enabled in
+ * SPIV. Soft disable it before doing further initialization.
+ */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_SPIV_APIC_ENABLED;
+ apic_write(APIC_SPIV, value);
+
#ifdef CONFIG_X86_32
/* Pound the ESR really hard over the head with a big hammer - mbligh */
if (lapic_is_integrated() && apic->disable_esr) {
@@ -1496,43 +1530,28 @@ static void setup_local_APIC(void)
apic_write(APIC_ESR, 0);
}
#endif
- perf_events_lapic_init();
-
- /*
- * Double-check whether this APIC is really registered.
- * This is meaningless in clustered apic mode, so we skip it.
- */
- BUG_ON(!apic->apic_id_registered());
-
/*
* Intel recommends to set DFR, LDR and TPR before enabling
* an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
+ * document number 292116).
+ *
+ * Except for APICs which operate in physical destination mode.
*/
- apic->init_apic_ldr();
+ if (apic->init_apic_ldr)
+ apic->init_apic_ldr();
-#ifdef CONFIG_X86_32
/*
- * APIC LDR is initialized. If logical_apicid mapping was
- * initialized during get_smp_config(), make sure it matches the
- * actual value.
- */
- logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
- ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
- WARN_ON(logical_apicid != BAD_APICID && logical_apicid != ldr_apicid);
- /* always use the value from LDR */
- early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid;
-#endif
-
- /*
- * Set Task Priority to 'accept all'. We never change this
- * later on.
+ * Set Task Priority to 'accept all except vectors 0-31'. An APIC
+ * vector in the 16-31 range could be delivered if TPR == 0, but we
+ * would think it's an exception and terrible things will happen. We
+ * never change this later on.
*/
value = apic_read(APIC_TASKPRI);
value &= ~APIC_TPRI_MASK;
+ value |= 0x10;
apic_write(APIC_TASKPRI, value);
- apic_pending_intr_clear();
+ apic_clear_isr();
/*
* Now that we are all set up, enable the APIC
@@ -1560,7 +1579,7 @@ static void setup_local_APIC(void)
*/
/*
* Actually disabling the focus CPU check just makes the hang less
- * frequent as it makes the interrupt distributon model be more
+ * frequent as it makes the interrupt distribution model be more
* like LRU than MRU (the short-term load is more even across CPUs).
*/
@@ -1578,6 +1597,8 @@ static void setup_local_APIC(void)
value |= SPURIOUS_APIC_VECTOR;
apic_write(APIC_SPIV, value);
+ perf_events_lapic_init();
+
/*
* Set up LVT0, LVT1:
*
@@ -1589,12 +1610,12 @@ static void setup_local_APIC(void)
* TODO: set up through-local-APIC from through-I/O-APIC? --macro
*/
value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
- if (!cpu && (pic_mode || !value || skip_ioapic_setup)) {
+ if (!cpu && (pic_mode || !value || ioapic_is_disabled)) {
value = APIC_DM_EXTINT;
- apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", cpu);
+ apic_pr_verbose("Enabled ExtINT on CPU#%d\n", cpu);
} else {
value = APIC_DM_EXTINT | APIC_LVT_MASKED;
- apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", cpu);
+ apic_pr_verbose("Masked ExtINT on CPU#%d\n", cpu);
}
apic_write(APIC_LVT0, value);
@@ -1646,16 +1667,49 @@ void apic_ap_setup(void)
end_local_APIC_setup();
}
+static __init void apic_read_boot_cpu_id(bool x2apic)
+{
+ /*
+ * This can be invoked from check_x2apic() before the APIC has been
+ * selected. But that code knows for sure that the BIOS enabled
+ * X2APIC.
+ */
+ if (x2apic) {
+ boot_cpu_physical_apicid = native_apic_msr_read(APIC_ID);
+ boot_cpu_apic_version = GET_APIC_VERSION(native_apic_msr_read(APIC_LVR));
+ } else {
+ boot_cpu_physical_apicid = read_apic_id();
+ boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
+ }
+ topology_register_boot_apic(boot_cpu_physical_apicid);
+}
+
#ifdef CONFIG_X86_X2APIC
int x2apic_mode;
+EXPORT_SYMBOL_GPL(x2apic_mode);
enum {
X2APIC_OFF,
- X2APIC_ON,
X2APIC_DISABLED,
+ /* All states below here have X2APIC enabled */
+ X2APIC_ON,
+ X2APIC_ON_LOCKED
};
static int x2apic_state;
+static bool x2apic_hw_locked(void)
+{
+ u64 x86_arch_cap_msr;
+ u64 msr;
+
+ x86_arch_cap_msr = x86_read_arch_cap_msr();
+ if (x86_arch_cap_msr & ARCH_CAP_XAPIC_DISABLE) {
+ rdmsrq(MSR_IA32_XAPIC_DISABLE_STATUS, msr);
+ return (msr & LEGACY_XAPIC_DISABLED);
+ }
+ return false;
+}
+
static void __x2apic_disable(void)
{
u64 msr;
@@ -1663,12 +1717,12 @@ static void __x2apic_disable(void)
if (!boot_cpu_has(X86_FEATURE_APIC))
return;
- rdmsrl(MSR_IA32_APICBASE, msr);
+ rdmsrq(MSR_IA32_APICBASE, msr);
if (!(msr & X2APIC_ENABLE))
return;
/* Disable xapic and x2apic first and then reenable xapic mode */
- wrmsrl(MSR_IA32_APICBASE, msr & ~(X2APIC_ENABLE | XAPIC_ENABLE));
- wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE);
+ wrmsrq(MSR_IA32_APICBASE, msr & ~(X2APIC_ENABLE | XAPIC_ENABLE));
+ wrmsrq(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE);
printk_once(KERN_INFO "x2apic disabled\n");
}
@@ -1676,24 +1730,28 @@ static void __x2apic_enable(void)
{
u64 msr;
- rdmsrl(MSR_IA32_APICBASE, msr);
+ rdmsrq(MSR_IA32_APICBASE, msr);
if (msr & X2APIC_ENABLE)
return;
- wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE);
+ wrmsrq(MSR_IA32_APICBASE, msr | X2APIC_ENABLE);
printk_once(KERN_INFO "x2apic enabled\n");
}
static int __init setup_nox2apic(char *str)
{
if (x2apic_enabled()) {
- int apicid = native_apic_msr_read(APIC_ID);
+ u32 apicid = native_apic_msr_read(APIC_ID);
if (apicid >= 255) {
- pr_warning("Apicid: %08x, cannot enforce nox2apic\n",
- apicid);
+ pr_warn("Apicid: %08x, cannot enforce nox2apic\n",
+ apicid);
return 0;
}
- pr_warning("x2apic already enabled.\n");
+ if (x2apic_hw_locked()) {
+ pr_warn("APIC locked in x2apic mode, can't disable\n");
+ return 0;
+ }
+ pr_warn("x2apic already enabled.\n");
__x2apic_disable();
}
setup_clear_cpu_cap(X86_FEATURE_X2APIC);
@@ -1707,32 +1765,53 @@ early_param("nox2apic", setup_nox2apic);
void x2apic_setup(void)
{
/*
- * If x2apic is not in ON state, disable it if already enabled
+ * Try to make the AP's APIC state match that of the BSP, but if the
+ * BSP is unlocked and the AP is locked then there is a state mismatch.
+ * Warn about the mismatch in case a GP fault occurs due to a locked AP
+ * trying to be turned off.
+ */
+ if (x2apic_state != X2APIC_ON_LOCKED && x2apic_hw_locked())
+ pr_warn("x2apic lock mismatch between BSP and AP.\n");
+ /*
+ * If x2apic is not in ON or LOCKED state, disable it if already enabled
* from BIOS.
*/
- if (x2apic_state != X2APIC_ON) {
+ if (x2apic_state < X2APIC_ON) {
__x2apic_disable();
return;
}
__x2apic_enable();
}
+static __init void apic_set_fixmap(bool read_apic);
+
static __init void x2apic_disable(void)
{
- u32 x2apic_id, state = x2apic_state;
-
- x2apic_mode = 0;
- x2apic_state = X2APIC_DISABLED;
+ u32 x2apic_id;
- if (state != X2APIC_ON)
+ if (x2apic_state < X2APIC_ON)
return;
x2apic_id = read_apic_id();
if (x2apic_id >= 255)
panic("Cannot disable x2apic, id: %08x\n", x2apic_id);
+ if (x2apic_hw_locked()) {
+ pr_warn("Cannot disable locked x2apic, id: %08x\n", x2apic_id);
+ return;
+ }
+
__x2apic_disable();
- register_lapic_address(mp_lapic_addr);
+
+ x2apic_mode = 0;
+ x2apic_state = X2APIC_DISABLED;
+
+ /*
+ * Don't reread the APIC ID as it was already done from
+ * check_x2apic() and the APIC driver still is a x2APIC variant,
+ * which fails to do the read after x2APIC was disabled.
+ */
+ apic_set_fixmap(false);
}
static __init void x2apic_enable(void)
@@ -1751,20 +1830,34 @@ static __init void try_to_enable_x2apic(int remap_mode)
return;
if (remap_mode != IRQ_REMAP_X2APIC_MODE) {
- /* IR is required if there is APIC ID > 255 even when running
- * under KVM
+ u32 apic_limit = 255;
+
+ /*
+ * Using X2APIC without IR is not architecturally supported
+ * on bare metal but may be supported in guests.
*/
- if (max_physical_apicid > 255 ||
- !x86_init.hyper.x2apic_available()) {
+ if (!x86_init.hyper.x2apic_available()) {
pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n");
x2apic_disable();
return;
}
/*
- * without IR all CPUs can be addressed by IOAPIC/MSI
- * only in physical mode
+ * If the hypervisor supports extended destination ID in
+ * MSI, that increases the maximum APIC ID that can be
+ * used for non-remapped IRQ domains.
*/
+ if (x86_init.hyper.msi_ext_dest_id()) {
+ virt_ext_dest_id = 1;
+ apic_limit = 32767;
+ }
+
+ /*
+ * Without IR, all CPUs can be addressed by IOAPIC/MSI only
+ * in physical mode, and CPUs with an APIC ID that cannot
+ * be addressed must not be brought online.
+ */
+ x2apic_set_max_apicid(apic_limit);
x2apic_phys = 1;
}
x2apic_enable();
@@ -1775,22 +1868,29 @@ void __init check_x2apic(void)
if (x2apic_enabled()) {
pr_info("x2apic: enabled by BIOS, switching to x2apic ops\n");
x2apic_mode = 1;
- x2apic_state = X2APIC_ON;
+ if (x2apic_hw_locked())
+ x2apic_state = X2APIC_ON_LOCKED;
+ else
+ x2apic_state = X2APIC_ON;
+ apic_read_boot_cpu_id(true);
} else if (!boot_cpu_has(X86_FEATURE_X2APIC)) {
x2apic_state = X2APIC_DISABLED;
}
}
#else /* CONFIG_X86_X2APIC */
-static int __init validate_x2apic(void)
+void __init check_x2apic(void)
{
if (!apic_is_x2apic_enabled())
- return 0;
+ return;
/*
- * Checkme: Can we simply turn off x2apic here instead of panic?
+ * Checkme: Can we simply turn off x2APIC here instead of disabling the APIC?
*/
- panic("BIOS has enabled x2apic but kernel doesn't support x2apic, please disable x2apic in BIOS.\n");
+ pr_err("Kernel does not support x2APIC, please recompile with CONFIG_X86_X2APIC.\n");
+ pr_err("Disabling APIC, expect reduced performance and functionality.\n");
+
+ apic_is_disabled = true;
+ setup_clear_cpu_cap(X86_FEATURE_APIC);
}
-early_initcall(validate_x2apic);
static inline void try_to_enable_x2apic(int remap_mode) { }
static inline void __x2apic_enable(void) { }
@@ -1801,7 +1901,7 @@ void __init enable_IR_x2apic(void)
unsigned long flags;
int ret, ir_stat;
- if (skip_ioapic_setup) {
+ if (ioapic_is_disabled) {
pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n");
return;
}
@@ -1839,19 +1939,19 @@ void __init enable_IR_x2apic(void)
* On AMD64 we trust the BIOS - if it says no APIC it is likely
* not correctly set up (usually the APIC timer won't work etc.)
*/
-static int __init detect_init_APIC(void)
+static bool __init detect_init_APIC(void)
{
if (!boot_cpu_has(X86_FEATURE_APIC)) {
pr_info("No local APIC present\n");
- return -1;
+ return false;
}
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
- return 0;
+ register_lapic_address(APIC_DEFAULT_PHYS_BASE);
+ return true;
}
#else
-static int __init apic_verify(void)
+static bool __init apic_verify(unsigned long addr)
{
u32 features, h, l;
@@ -1861,29 +1961,29 @@ static int __init apic_verify(void)
*/
features = cpuid_edx(1);
if (!(features & (1 << X86_FEATURE_APIC))) {
- pr_warning("Could not enable APIC!\n");
- return -1;
+ pr_warn("Could not enable APIC!\n");
+ return false;
}
set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
/* The BIOS may have set up the APIC at some other address */
if (boot_cpu_data.x86 >= 6) {
rdmsr(MSR_IA32_APICBASE, l, h);
if (l & MSR_IA32_APICBASE_ENABLE)
- mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
+ addr = l & MSR_IA32_APICBASE_BASE;
}
+ register_lapic_address(addr);
pr_info("Found and enabled local APIC!\n");
- return 0;
+ return true;
}
-int __init apic_force_enable(unsigned long addr)
+bool __init apic_force_enable(unsigned long addr)
{
u32 h, l;
- if (disable_apic)
- return -1;
+ if (apic_is_disabled)
+ return false;
/*
* Some BIOSes disable the local APIC in the APIC_BASE
@@ -1900,17 +2000,17 @@ int __init apic_force_enable(unsigned long addr)
enabled_via_apicbase = 1;
}
}
- return apic_verify();
+ return apic_verify(addr);
}
/*
* Detect and initialize APIC
*/
-static int __init detect_init_APIC(void)
+static bool __init detect_init_APIC(void)
{
/* Disabled by kernel option? */
- if (disable_apic)
- return -1;
+ if (apic_is_disabled)
+ return false;
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
@@ -1921,8 +2021,8 @@ static int __init detect_init_APIC(void)
case X86_VENDOR_HYGON:
break;
case X86_VENDOR_INTEL:
- if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 ||
- (boot_cpu_data.x86 == 5 && boot_cpu_has(X86_FEATURE_APIC)))
+ if ((boot_cpu_data.x86 == 5 && boot_cpu_has(X86_FEATURE_APIC)) ||
+ boot_cpu_data.x86_vfm >= INTEL_PENTIUM_PRO)
break;
goto no_apic;
default:
@@ -1937,22 +2037,22 @@ static int __init detect_init_APIC(void)
if (!force_enable_local_apic) {
pr_info("Local APIC disabled by BIOS -- "
"you can enable it with \"lapic\"\n");
- return -1;
+ return false;
}
- if (apic_force_enable(APIC_DEFAULT_PHYS_BASE))
- return -1;
+ if (!apic_force_enable(APIC_DEFAULT_PHYS_BASE))
+ return false;
} else {
- if (apic_verify())
- return -1;
+ if (!apic_verify(APIC_DEFAULT_PHYS_BASE))
+ return false;
}
apic_pm_activate();
- return 0;
+ return true;
no_apic:
pr_info("No local APIC present or hardware disabled\n");
- return -1;
+ return false;
}
#endif
@@ -1961,62 +2061,37 @@ no_apic:
*/
void __init init_apic_mappings(void)
{
- unsigned int new_apicid;
+ if (apic_validate_deadline_timer())
+ pr_info("TSC deadline timer available\n");
- apic_check_deadline_errata();
-
- if (x2apic_mode) {
- boot_cpu_physical_apicid = read_apic_id();
+ if (x2apic_mode)
return;
- }
-
- /* If no local APIC can be found return early */
- if (!smp_found_config && detect_init_APIC()) {
- /* lets NOP'ify apic operations */
- pr_info("APIC: disable apic facility\n");
- apic_disable();
- } else {
- apic_phys = mp_lapic_addr;
- /*
- * If the system has ACPI MADT tables or MP info, the LAPIC
- * address is already registered.
- */
- if (!acpi_lapic && !smp_found_config)
- register_lapic_address(apic_phys);
+ if (!smp_found_config) {
+ if (!detect_init_APIC()) {
+ pr_info("APIC: disable apic facility\n");
+ apic_disable();
+ }
}
+}
- /*
- * Fetch the APIC ID of the BSP in case we have a
- * default configuration (or the MP table is broken).
- */
- new_apicid = read_apic_id();
- if (boot_cpu_physical_apicid != new_apicid) {
- boot_cpu_physical_apicid = new_apicid;
- /*
- * yeah -- we lie about apic_version
- * in case if apic was disabled via boot option
- * but it's not a problem for SMP compiled kernel
- * since apic_intr_mode_select is prepared for such
- * a case and disable smp mode
- */
- boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
- }
+static __init void apic_set_fixmap(bool read_apic)
+{
+ set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
+ apic_mmio_base = APIC_BASE;
+ apic_pr_verbose("Mapped APIC to %16lx (%16lx)\n", apic_mmio_base, mp_lapic_addr);
+ if (read_apic)
+ apic_read_boot_cpu_id(false);
}
void __init register_lapic_address(unsigned long address)
{
+ /* This should only happen once */
+ WARN_ON_ONCE(mp_lapic_addr);
mp_lapic_addr = address;
- if (!x2apic_mode) {
- set_fixmap_nocache(FIX_APIC_BASE, address);
- apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
- APIC_BASE, address);
- }
- if (boot_cpu_physical_apicid == -1U) {
- boot_cpu_physical_apicid = read_apic_id();
- boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
- }
+ if (!x2apic_mode)
+ apic_set_fixmap(true);
}
/*
@@ -2024,39 +2099,67 @@ void __init register_lapic_address(unsigned long address)
*/
/*
- * This interrupt should _never_ happen with our APIC/SMP architecture
+ * Common handling code for spurious_interrupt and spurious_vector entry
+ * points below. No point in allowing the compiler to inline it twice.
*/
-__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs)
+static noinline void handle_spurious_interrupt(u8 vector)
{
- u8 vector = ~regs->orig_ax;
u32 v;
- entering_irq();
trace_spurious_apic_entry(vector);
+ inc_irq_stat(irq_spurious_count);
+
/*
- * Check if this really is a spurious interrupt and ACK it
- * if it is a vectored one. Just in case...
- * Spurious interrupts should not be ACKed.
+ * If this is a spurious interrupt then do not acknowledge
*/
- v = apic_read(APIC_ISR + ((vector & ~0x1f) >> 1));
- if (v & (1 << (vector & 0x1f)))
- ack_APIC_irq();
+ if (vector == SPURIOUS_APIC_VECTOR) {
+ /* See SDM vol 3 */
+ pr_info("Spurious APIC interrupt (vector 0xFF) on CPU#%d, should never happen.\n",
+ smp_processor_id());
+ goto out;
+ }
- inc_irq_stat(irq_spurious_count);
+ /*
+ * If it is a vectored one, verify it's set in the ISR. If set,
+ * acknowledge it.
+ */
+ v = apic_read(APIC_ISR + ((vector & ~0x1f) >> 1));
+ if (v & (1 << (vector & 0x1f))) {
+ pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Acked\n",
+ vector, smp_processor_id());
+ apic_eoi();
+ } else {
+ pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Not pending!\n",
+ vector, smp_processor_id());
+ }
+out:
+ trace_spurious_apic_exit(vector);
+}
- /* see sw-dev-man vol 3, chapter 7.4.13.5 */
- pr_info("spurious APIC interrupt through vector %02x on CPU#%d, "
- "should never happen.\n", vector, smp_processor_id());
+/**
+ * spurious_interrupt - Catch all for interrupts raised on unused vectors
+ * @regs: Pointer to pt_regs on stack
+ * @vector: The vector number
+ *
+ * This is invoked from ASM entry code to catch all interrupts which
+ * trigger on an entry which is routed to the common_spurious idtentry
+ * point.
+ */
+DEFINE_IDTENTRY_IRQ(spurious_interrupt)
+{
+ handle_spurious_interrupt(vector);
+}
- trace_spurious_apic_exit(vector);
- exiting_irq();
+DEFINE_IDTENTRY_SYSVEC(sysvec_spurious_apic_interrupt)
+{
+ handle_spurious_interrupt(SPURIOUS_APIC_VECTOR);
}
/*
* This interrupt should never happen with our APIC/SMP architecture
*/
-__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_error_interrupt)
{
static const char * const error_interrupt_reason[] = {
"Send CS error", /* APIC Error Bit 0 */
@@ -2070,31 +2173,28 @@ __visible void __irq_entry smp_error_interrupt(struct pt_regs *regs)
};
u32 v, i = 0;
- entering_irq();
trace_error_apic_entry(ERROR_APIC_VECTOR);
/* First tickle the hardware, only then report what went on. -- REW */
if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
v = apic_read(APIC_ESR);
- ack_APIC_irq();
+ apic_eoi();
atomic_inc(&irq_err_count);
- apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x",
- smp_processor_id(), v);
+ apic_pr_debug("APIC error on CPU%d: %02x", smp_processor_id(), v);
v &= 0xff;
while (v) {
if (v & 0x1)
- apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
+ apic_pr_debug_cont(" : %s", error_interrupt_reason[i]);
i++;
v >>= 1;
}
- apic_printk(APIC_DEBUG, KERN_CONT "\n");
+ apic_pr_debug_cont("\n");
trace_error_apic_exit(ERROR_APIC_VECTOR);
- exiting_irq();
}
/**
@@ -2112,8 +2212,7 @@ static void __init connect_bsp_APIC(void)
* PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's
* local APIC to INT and NMI lines.
*/
- apic_printk(APIC_VERBOSE, "leaving PIC mode, "
- "enabling APIC mode.\n");
+ apic_pr_verbose("Leaving PIC mode, enabling APIC mode.\n");
imcr_pic_to_apic();
}
#endif
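
For reference, imcr_pic_to_apic() boils down to two port writes through the PC configuration registers. A sketch, assuming the pc_conf_set()/PC_CONF_MPS_IMCR helpers from asm/pc-conf-reg.h (imcr_route_through_apic() is a hypothetical name):

    #include <asm/pc-conf-reg.h>

    static void imcr_route_through_apic(void)
    {
    	/* Select the IMCR via index port 0x22, write via data port 0x23 */
    	pc_conf_set(PC_CONF_MPS_IMCR, 0x01);	/* 0x00 restores PIC mode */
    }
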
@@ -2138,8 +2237,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
* IPIs, won't work beyond this point! The only exception are
* INIT IPIs.
*/
- apic_printk(APIC_VERBOSE, "disabling APIC mode, "
- "entering PIC mode.\n");
+ apic_pr_verbose("Disabling APIC mode, entering PIC mode.\n");
imcr_apic_to_pic();
return;
}
@@ -2184,236 +2282,56 @@ void disconnect_bsp_APIC(int virt_wire_setup)
apic_write(APIC_LVT1, value);
}
-/*
- * The number of allocated logical CPU IDs. Since logical CPU IDs are allocated
- * contiguously, it equals to current allocated max logical CPU ID plus 1.
- * All allocated CPU IDs should be in the [0, nr_logical_cpuids) range,
- * so the maximum of nr_logical_cpuids is nr_cpu_ids.
- *
- * NOTE: Reserve 0 for BSP.
- */
-static int nr_logical_cpuids = 1;
-
-/*
- * Used to store mapping between logical CPU IDs and APIC IDs.
- */
-static int cpuid_to_apicid[] = {
- [0 ... NR_CPUS - 1] = -1,
-};
-
-#ifdef CONFIG_SMP
-/**
- * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread
- * @id: APIC ID to check
- */
-bool apic_id_is_primary_thread(unsigned int apicid)
+void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg,
+ bool dmar)
{
- u32 mask;
-
- if (smp_num_siblings == 1)
- return true;
- /* Isolate the SMT bit(s) in the APICID and check for 0 */
- mask = (1U << (fls(smp_num_siblings) - 1)) - 1;
- return !(apicid & mask);
-}
-#endif
-
-/*
- * Should use this API to allocate logical CPU IDs to keep nr_logical_cpuids
- * and cpuid_to_apicid[] synchronized.
- */
-static int allocate_logical_cpuid(int apicid)
-{
- int i;
-
- /*
- * cpuid <-> apicid mapping is persistent, so when a cpu is up,
- * check if the kernel has allocated a cpuid for it.
- */
- for (i = 0; i < nr_logical_cpuids; i++) {
- if (cpuid_to_apicid[i] == apicid)
- return i;
- }
-
- /* Allocate a new cpuid. */
- if (nr_logical_cpuids >= nr_cpu_ids) {
- WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %u reached. "
- "Processor %d/0x%x and the rest are ignored.\n",
- nr_cpu_ids, nr_logical_cpuids, apicid);
- return -EINVAL;
- }
-
- cpuid_to_apicid[nr_logical_cpuids] = apicid;
- return nr_logical_cpuids++;
-}
-
-int generic_processor_info(int apicid, int version)
-{
- int cpu, max = nr_cpu_ids;
- bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,
- phys_cpu_present_map);
-
- /*
- * boot_cpu_physical_apicid is designed to have the apicid
- * returned by read_apic_id(), i.e, the apicid of the
- * currently booting-up processor. However, on some platforms,
- * it is temporarily modified by the apicid reported as BSP
- * through MP table. Concretely:
- *
- * - arch/x86/kernel/mpparse.c: MP_processor_info()
- * - arch/x86/mm/amdtopology.c: amd_numa_init()
- *
- * This function is executed with the modified
- * boot_cpu_physical_apicid. So, disabled_cpu_apicid kernel
- * parameter doesn't work to disable APs on kdump 2nd kernel.
- *
- * Since fixing handling of boot_cpu_physical_apicid requires
- * another discussion and tests on each platform, we leave it
- * for now and here we use read_apic_id() directly in this
- * function, generic_processor_info().
- */
- if (disabled_cpu_apicid != BAD_APICID &&
- disabled_cpu_apicid != read_apic_id() &&
- disabled_cpu_apicid == apicid) {
- int thiscpu = num_processors + disabled_cpus;
-
- pr_warning("APIC: Disabling requested cpu."
- " Processor %d/0x%x ignored.\n",
- thiscpu, apicid);
-
- disabled_cpus++;
- return -ENODEV;
- }
-
- /*
- * If boot cpu has not been detected yet, then only allow upto
- * nr_cpu_ids - 1 processors and keep one slot free for boot cpu
- */
- if (!boot_cpu_detected && num_processors >= nr_cpu_ids - 1 &&
- apicid != boot_cpu_physical_apicid) {
- int thiscpu = max + disabled_cpus - 1;
-
- pr_warning(
- "APIC: NR_CPUS/possible_cpus limit of %i almost"
- " reached. Keeping one slot for boot cpu."
- " Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
-
- disabled_cpus++;
- return -ENODEV;
- }
-
- if (num_processors >= nr_cpu_ids) {
- int thiscpu = max + disabled_cpus;
-
- pr_warning("APIC: NR_CPUS/possible_cpus limit of %i "
- "reached. Processor %d/0x%x ignored.\n",
- max, thiscpu, apicid);
-
- disabled_cpus++;
- return -EINVAL;
- }
+ memset(msg, 0, sizeof(*msg));
- if (apicid == boot_cpu_physical_apicid) {
- /*
- * x86_bios_cpu_apicid is required to have processors listed
- * in same order as logical cpu numbers. Hence the first
- * entry is BSP, and so on.
- * boot_cpu_init() already hold bit 0 in cpu_present_mask
- * for BSP.
- */
- cpu = 0;
+ msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW;
+ msg->arch_addr_lo.dest_mode_logical = apic->dest_mode_logical;
+ msg->arch_addr_lo.destid_0_7 = cfg->dest_apicid & 0xFF;
- /* Logical cpuid 0 is reserved for BSP. */
- cpuid_to_apicid[0] = apicid;
- } else {
- cpu = allocate_logical_cpuid(apicid);
- if (cpu < 0) {
- disabled_cpus++;
- return -EINVAL;
- }
- }
+ msg->arch_data.delivery_mode = APIC_DELIVERY_MODE_FIXED;
+ msg->arch_data.vector = cfg->vector;
+ msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH;
/*
- * Validate version
+ * Only the IOMMU itself can use the trick of putting destination
+ * APIC ID into the high bits of the address. Anything else would
+ * just be writing to memory if it tried that, and needs IR to
+ * address APICs which can't be addressed in the normal 32-bit
+ * address range at 0xFFExxxxx. That is typically just 8 bits, but
+ * some hypervisors allow the extended destination ID field in bits
+ * 5-11 to be used, giving support for 15 bits of APIC IDs in total.
*/
- if (version == 0x0) {
- pr_warning("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n",
- cpu, apicid);
- version = 0x10;
- }
-
- if (version != boot_cpu_apic_version) {
- pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n",
- boot_cpu_apic_version, cpu, version);
- }
-
- if (apicid > max_physical_apicid)
- max_physical_apicid = apicid;
-
-#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
- early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
- early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
-#endif
-#ifdef CONFIG_X86_32
- early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
- apic->x86_32_early_logical_apicid(cpu);
-#endif
- set_cpu_possible(cpu, true);
- physid_set(apicid, phys_cpu_present_map);
- set_cpu_present(cpu, true);
- num_processors++;
-
- return cpu;
+ if (dmar)
+ msg->arch_addr_hi.destid_8_31 = cfg->dest_apicid >> 8;
+ else if (virt_ext_dest_id && cfg->dest_apicid < 0x8000)
+ msg->arch_addr_lo.virt_destid_8_14 = cfg->dest_apicid >> 8;
+ else
+ WARN_ON_ONCE(cfg->dest_apicid > 0xFF);
}
-int hard_smp_processor_id(void)
+u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid)
{
- return read_apic_id();
-}
+ u32 dest = msg->arch_addr_lo.destid_0_7;
-/*
- * Override the generic EOI implementation with an optimized version.
- * Only called during early boot when only one CPU is active and with
- * interrupts disabled, so we know this does not race with actual APIC driver
- * use.
- */
-void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v))
-{
- struct apic **drv;
-
- for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
- /* Should happen once for each apic */
- WARN_ON((*drv)->eoi_write == eoi_write);
- (*drv)->native_eoi_write = (*drv)->eoi_write;
- (*drv)->eoi_write = eoi_write;
- }
+ if (extid)
+ dest |= msg->arch_addr_hi.destid_8_31 << 8;
+ return dest;
}
+EXPORT_SYMBOL_FOR_KVM(x86_msi_msg_get_destid);
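
A round-trip sketch for the pair above (hypothetical demo code, not part of the patch): on the dmar path an APIC ID wider than 8 bits is split across the low and high address words, and x86_msi_msg_get_destid() with extid=true reassembles it.

    static void __maybe_unused msi_destid_roundtrip_demo(struct irq_cfg *cfg)
    {
    	struct msi_msg msg;

    	cfg->dest_apicid = 0x1234;			/* needs more than 8 bits */
    	__irq_msi_compose_msg(cfg, &msg, true);		/* IOMMU (dmar) path */

    	/* destid_0_7 == 0x34, destid_8_31 == 0x12 */
    	WARN_ON(x86_msi_msg_get_destid(&msg, true) != 0x1234);
    }
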
static void __init apic_bsp_up_setup(void)
{
-#ifdef CONFIG_X86_64
- apic_write(APIC_ID, apic->set_apic_id(boot_cpu_physical_apicid));
-#else
- /*
- * Hack: In case of kdump, after a crash, kernel might be booting
- * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
- * might be zero if read from MP tables. Get it from LAPIC.
- */
-# ifdef CONFIG_CRASH_DUMP
- boot_cpu_physical_apicid = read_apic_id();
-# endif
-#endif
- physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
+ reset_phys_cpu_present_map(boot_cpu_physical_apicid);
}
/**
* apic_bsp_setup - Setup function for local apic and io-apic
* @upmode: Force UP mode (for APIC_init_uniprocessor)
- *
- * Returns:
- * apic_id of BSP APIC
*/
-void __init apic_bsp_setup(bool upmode)
+static void __init apic_bsp_setup(bool upmode)
{
connect_bsp_APIC();
if (upmode)
@@ -2424,6 +2342,7 @@ void __init apic_bsp_setup(bool upmode)
end_local_APIC_setup();
irq_remap_enable_fault_handling();
setup_IO_APIC();
+ lapic_update_legacy_vectors();
}
#ifdef CONFIG_UP_LATE_INIT
@@ -2450,7 +2369,7 @@ static struct {
*/
int active;
/* r/w apic fields */
- unsigned int apic_id;
+ u32 apic_id;
unsigned int apic_taskpri;
unsigned int apic_ldr;
unsigned int apic_dfr;
@@ -2466,7 +2385,7 @@ static struct {
unsigned int apic_cmci;
} apic_pm_state;
-static int lapic_suspend(void)
+static int lapic_suspend(void *data)
{
unsigned long flags;
int maxlvt;
@@ -2499,6 +2418,13 @@ static int lapic_suspend(void)
#endif
local_irq_save(flags);
+
+ /*
+ * Mask IOAPIC before disabling the local APIC to prevent stale IRR
+ * entries on some implementations.
+ */
+ mask_ioapic_entries();
+
disable_local_APIC();
irq_remapping_disable();
@@ -2507,7 +2433,7 @@ static int lapic_suspend(void)
return 0;
}
-static void lapic_resume(void)
+static void lapic_resume(void *data)
{
unsigned int l, h;
unsigned long flags;
@@ -2582,11 +2508,15 @@ static void lapic_resume(void)
* are needed on every CPU up until machine_halt/restart/poweroff.
*/
-static struct syscore_ops lapic_syscore_ops = {
+static const struct syscore_ops lapic_syscore_ops = {
.resume = lapic_resume,
.suspend = lapic_suspend,
};
+static struct syscore lapic_syscore = {
+ .ops = &lapic_syscore_ops,
+};
+
static void apic_pm_activate(void)
{
apic_pm_state.active = 1;
@@ -2596,7 +2526,7 @@ static int __init init_lapic_sysfs(void)
{
/* XXX: remove suspend/resume procs if !apic_pm_state.active? */
if (boot_cpu_has(X86_FEATURE_APIC))
- register_syscore_ops(&lapic_syscore_ops);
+ register_syscore(&lapic_syscore);
return 0;
}
@@ -2663,19 +2593,12 @@ int apic_is_clustered_box(void)
/*
* APIC command line parameters
*/
-static int __init setup_disableapic(char *arg)
+static int __init setup_nolapic(char *arg)
{
- disable_apic = 1;
+ apic_is_disabled = true;
setup_clear_cpu_cap(X86_FEATURE_APIC);
return 0;
}
-early_param("disableapic", setup_disableapic);
-
-/* same as disableapic, for compatibility */
-static int __init setup_nolapic(char *arg)
-{
- return setup_disableapic(arg);
-}
early_param("nolapic", setup_nolapic);
static int __init parse_lapic_timer_c2_ok(char *arg)
@@ -2702,11 +2625,11 @@ early_param("nolapic_timer", parse_nolapic_timer);
static int __init apic_set_verbosity(char *arg)
{
if (!arg) {
-#ifdef CONFIG_X86_64
- skip_ioapic_setup = 0;
+ if (IS_ENABLED(CONFIG_X86_32))
+ return -EINVAL;
+
+ ioapic_is_disabled = false;
return 0;
-#endif
- return -EINVAL;
}
if (strcmp("debug", arg) == 0)
@@ -2715,7 +2638,7 @@ static int __init apic_set_verbosity(char *arg)
apic_verbosity = APIC_VERBOSE;
#ifdef CONFIG_X86_64
else {
-		pr_warning("APIC Verbosity level %s not recognised"
-			" use apic=verbose or apic=debug\n", arg);
+		pr_warn("APIC Verbosity level %s not recognised, use apic=verbose or apic=debug\n", arg);
return -EINVAL;
}
@@ -2727,11 +2650,11 @@ early_param("apic", apic_set_verbosity);
static int __init lapic_insert_resource(void)
{
- if (!apic_phys)
+ if (!apic_mmio_base)
return -1;
/* Put local APIC into the resource map. */
- lapic_resource.start = apic_phys;
+ lapic_resource.start = apic_mmio_base;
lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
insert_resource(&iomem_resource, &lapic_resource);
@@ -2744,15 +2667,6 @@ static int __init lapic_insert_resource(void)
*/
late_initcall(lapic_insert_resource);
-static int __init apic_set_disabled_cpu_apicid(char *arg)
-{
- if (!arg || !get_option(&arg, &disabled_cpu_apicid))
- return -EINVAL;
-
- return 0;
-}
-early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid);
-
static int __init apic_set_extnmi(char *arg)
{
if (!arg)
diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c
index 02b4839478b1..2ed3b5c88c7f 100644
--- a/arch/x86/kernel/apic/apic_common.c
+++ b/arch/x86/kernel/apic/apic_common.c
@@ -4,8 +4,11 @@
* SPDX-License-Identifier: GPL-2.0
*/
#include <linux/irq.h>
+#include <linux/kvm_types.h>
#include <asm/apic.h>
+#include "local.h"
+
u32 apic_default_calc_apicid(unsigned int cpu)
{
return per_cpu(x86_cpu_to_apicid, cpu);
@@ -16,31 +19,25 @@ u32 apic_flat_calc_apicid(unsigned int cpu)
return 1U << cpu;
}
-bool default_check_apicid_used(physid_mask_t *map, int apicid)
-{
- return physid_isset(apicid, *map);
-}
-
-void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
-{
- *retmap = *phys_map;
-}
-
-int default_cpu_present_to_apicid(int mps_cpu)
+u32 default_cpu_present_to_apicid(int mps_cpu)
{
if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
- return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
+ return (int)per_cpu(x86_cpu_to_apicid, mps_cpu);
else
return BAD_APICID;
}
-EXPORT_SYMBOL_GPL(default_cpu_present_to_apicid);
+EXPORT_SYMBOL_FOR_KVM(default_cpu_present_to_apicid);
-int default_check_phys_apicid_present(int phys_apicid)
+/*
+ * Set up the logical destination ID when the APIC operates in logical
+ * destination mode.
+ */
+void default_init_apic_ldr(void)
{
- return physid_isset(phys_apicid, phys_cpu_present_map);
-}
+ unsigned long val;
-int default_apic_id_valid(u32 apicid)
-{
- return (apicid < 255);
+ apic_write(APIC_DFR, APIC_DFR_FLAT);
+ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+ val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
+ apic_write(APIC_LDR, val);
}
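
Worked example for the logical ID computed above, assuming flat logical mode (which only covers 8 CPUs): SET_APIC_LOGICAL_ID() shifts the one-hot CPU bit into bits 24-31 of the LDR, so CPU 3 ends up with 0x08000000. A sketch (flat_logical_id_for() is a hypothetical helper):

    static u32 flat_logical_id_for(unsigned int cpu)
    {
    	/* cpu == 3: 1UL << 3 == 0x08, shifted to bits 24-31 == 0x08000000 */
    	return SET_APIC_LOGICAL_ID(1UL << cpu);
    }
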
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 0005c284a5c5..e0308d8c4e6c 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -1,6 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2004 James Cleverdon, IBM.
- * Subject to the GNU Public License, v.2
*
* Flat APIC subarch code.
*
@@ -8,240 +8,25 @@
* Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
* James Cleverdon.
*/
-#include <linux/acpi.h>
-#include <linux/errno.h>
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/hardirq.h>
#include <linux/export.h>
-#include <asm/smp.h>
-#include <asm/ipi.h>
#include <asm/apic.h>
-#include <asm/apic_flat_64.h>
-#include <asm/jailhouse_para.h>
-static struct apic apic_physflat;
-static struct apic apic_flat;
+#include "local.h"
-struct apic *apic __ro_after_init = &apic_flat;
-EXPORT_SYMBOL_GPL(apic);
-
-static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- return 1;
-}
-
-/*
- * Set up the logical destination ID.
- *
- * Intel recommends to set DFR, LDR and TPR before enabling
- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
- */
-void flat_init_apic_ldr(void)
-{
- unsigned long val;
- unsigned long num, id;
-
- num = smp_processor_id();
- id = 1UL << num;
- apic_write(APIC_DFR, APIC_DFR_FLAT);
- val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
- val |= SET_APIC_LOGICAL_ID(id);
- apic_write(APIC_LDR, val);
-}
-
-static void _flat_send_IPI_mask(unsigned long mask, int vector)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
- local_irq_restore(flags);
-}
-
-static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)
-{
- unsigned long mask = cpumask_bits(cpumask)[0];
-
- _flat_send_IPI_mask(mask, vector);
-}
-
-static void
-flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
-{
- unsigned long mask = cpumask_bits(cpumask)[0];
- int cpu = smp_processor_id();
-
- if (cpu < BITS_PER_LONG)
- clear_bit(cpu, &mask);
-
- _flat_send_IPI_mask(mask, vector);
-}
-
-static void flat_send_IPI_allbutself(int vector)
-{
- int cpu = smp_processor_id();
-
- if (IS_ENABLED(CONFIG_HOTPLUG_CPU) || vector == NMI_VECTOR) {
- if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) {
- unsigned long mask = cpumask_bits(cpu_online_mask)[0];
-
- if (cpu < BITS_PER_LONG)
- clear_bit(cpu, &mask);
-
- _flat_send_IPI_mask(mask, vector);
- }
- } else if (num_online_cpus() > 1) {
- __default_send_IPI_shortcut(APIC_DEST_ALLBUT,
- vector, apic->dest_logical);
- }
-}
-
-static void flat_send_IPI_all(int vector)
-{
- if (vector == NMI_VECTOR) {
- flat_send_IPI_mask(cpu_online_mask, vector);
- } else {
- __default_send_IPI_shortcut(APIC_DEST_ALLINC,
- vector, apic->dest_logical);
- }
-}
-
-static unsigned int flat_get_apic_id(unsigned long x)
+static u32 physflat_get_apic_id(u32 x)
{
return (x >> 24) & 0xFF;
}
-static u32 set_apic_id(unsigned int id)
-{
- return (id & 0xFF) << 24;
-}
-
-static unsigned int read_xapic_id(void)
-{
- return flat_get_apic_id(apic_read(APIC_ID));
-}
-
-static int flat_apic_id_registered(void)
-{
- return physid_isset(read_xapic_id(), phys_cpu_present_map);
-}
-
-static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
-{
- return initial_apic_id >> index_msb;
-}
-
-static int flat_probe(void)
+static int physflat_probe(void)
{
return 1;
}
-static struct apic apic_flat __ro_after_init = {
- .name = "flat",
- .probe = flat_probe,
- .acpi_madt_oem_check = flat_acpi_madt_oem_check,
- .apic_id_valid = default_apic_id_valid,
- .apic_id_registered = flat_apic_id_registered,
-
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 1, /* logical */
-
- .disable_esr = 0,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = NULL,
-
- .init_apic_ldr = flat_init_apic_ldr,
-
- .ioapic_phys_id_map = NULL,
- .setup_apic_routing = NULL,
- .cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
- .phys_pkg_id = flat_phys_pkg_id,
-
- .get_apic_id = flat_get_apic_id,
- .set_apic_id = set_apic_id,
-
- .calc_dest_apicid = apic_flat_calc_apicid,
-
- .send_IPI = default_send_IPI_single,
- .send_IPI_mask = flat_send_IPI_mask,
- .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
- .send_IPI_allbutself = flat_send_IPI_allbutself,
- .send_IPI_all = flat_send_IPI_all,
- .send_IPI_self = apic_send_IPI_self,
-
- .inquire_remote_apic = default_inquire_remote_apic,
-
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .eoi_write = native_apic_mem_write,
- .icr_read = native_apic_icr_read,
- .icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
-};
-
-/*
- * Physflat mode is used when there are more than 8 CPUs on a system.
- * We cannot use logical delivery in this case because the mask
- * overflows, so use physical mode.
- */
static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
-#ifdef CONFIG_ACPI
- /*
- * Quirk: some x86_64 machines can only use physical APIC mode
- * regardless of how many processors are present (x86_64 ES7000
- * is an example).
- */
- if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
- (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
- printk(KERN_DEBUG "system APIC only can use physical flat");
- return 1;
- }
-
- if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) {
- printk(KERN_DEBUG "IBM Summit detected, will use apic physical");
- return 1;
- }
-#endif
-
- return 0;
-}
-
-static void physflat_init_apic_ldr(void)
-{
- /*
- * LDR and DFR are not involved in physflat mode, rather:
- * "In physical destination mode, the destination processor is
- * specified by its local APIC ID [...]." (Intel SDM, 10.6.2.1)
- */
-}
-
-static void physflat_send_IPI_allbutself(int vector)
-{
- default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
-}
-
-static void physflat_send_IPI_all(int vector)
-{
- default_send_IPI_mask_sequence_phys(cpu_online_mask, vector);
-}
-
-static int physflat_probe(void)
-{
- if (apic == &apic_physflat || num_possible_cpus() > 8 ||
- jailhouse_paravirt())
- return 1;
-
- return 0;
+ return 1;
}
static struct apic apic_physflat __ro_after_init = {
@@ -249,49 +34,35 @@ static struct apic apic_physflat __ro_after_init = {
.name = "physical flat",
.probe = physflat_probe,
.acpi_madt_oem_check = physflat_acpi_madt_oem_check,
- .apic_id_valid = default_apic_id_valid,
- .apic_id_registered = flat_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 0, /* physical */
+ .dest_mode_logical = false,
.disable_esr = 0,
- .dest_logical = 0,
- .check_apicid_used = NULL,
- .init_apic_ldr = physflat_init_apic_ldr,
-
- .ioapic_phys_id_map = NULL,
- .setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
- .phys_pkg_id = flat_phys_pkg_id,
- .get_apic_id = flat_get_apic_id,
- .set_apic_id = set_apic_id,
+ .max_apic_id = 0xFE,
+ .get_apic_id = physflat_get_apic_id,
.calc_dest_apicid = apic_default_calc_apicid,
.send_IPI = default_send_IPI_single_phys,
.send_IPI_mask = default_send_IPI_mask_sequence_phys,
.send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_phys,
- .send_IPI_allbutself = physflat_send_IPI_allbutself,
- .send_IPI_all = physflat_send_IPI_all,
- .send_IPI_self = apic_send_IPI_self,
-
- .inquire_remote_apic = default_inquire_remote_apic,
+ .send_IPI_allbutself = default_send_IPI_allbutself,
+ .send_IPI_all = default_send_IPI_all,
+ .send_IPI_self = default_send_IPI_self,
+ .nmi_to_offline_cpu = true,
.read = native_apic_mem_read,
.write = native_apic_mem_write,
- .eoi_write = native_apic_mem_write,
+ .eoi = native_apic_mem_eoi,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+ .wait_icr_idle = apic_mem_wait_icr_idle,
+ .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout,
};
+apic_driver(apic_physflat);
-/*
- * We need to check for physflat first, so this order is important.
- */
-apic_drivers(apic_physflat, apic_flat);
+struct apic *apic __ro_after_init = &apic_physflat;
+EXPORT_SYMBOL_GPL(apic);
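
With apic_flat gone, apic_physflat registers through apic_driver() alone. Paraphrasing its definition in asm/apic.h (exact expansion may differ): the macro plants a pointer in the .apicdrivers section, which the probe code walks in link order; that is why the Makefile warns that "APIC probe will depend on the listing order here".

    #define apic_driver(sym)					\
    	static const struct apic *__apicdrivers_##sym __used	\
    	__aligned(sizeof(struct apic *))			\
    	__section(".apicdrivers") = { &sym }
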
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 5078b5ce63a7..58abb941c45b 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -8,130 +8,58 @@
 * Though, if the APIC is disabled for some reason, we try not to
 * uglify the caller's code and still allow calling (some) APIC
 * routines like self-IPI, etc...
+ *
+ * FIXME: Remove this gunk. The above argument which was intentionally left
+ * in place is silly to begin with because none of the callbacks except for
+ * APIC::read/write() have a WARN_ON_ONCE() in them. Sigh...
*/
-
-#include <linux/threads.h>
#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/errno.h>
-#include <asm/fixmap.h>
-#include <asm/mpspec.h>
-#include <asm/apicdef.h>
-#include <asm/apic.h>
-#include <asm/setup.h>
+#include <linux/thread_info.h>
-#include <linux/smp.h>
-#include <asm/ipi.h>
+#include <asm/apic.h>
-#include <linux/interrupt.h>
-#include <asm/acpi.h>
-#include <asm/e820/api.h>
+#include "local.h"
-static void noop_init_apic_ldr(void) { }
static void noop_send_IPI(int cpu, int vector) { }
static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { }
static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { }
static void noop_send_IPI_allbutself(int vector) { }
static void noop_send_IPI_all(int vector) { }
static void noop_send_IPI_self(int vector) { }
-static void noop_apic_wait_icr_idle(void) { }
static void noop_apic_icr_write(u32 low, u32 id) { }
-static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip)
+static int noop_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip,
+ unsigned int cpu)
{
return -1;
}
-static u32 noop_safe_apic_wait_icr_idle(void)
-{
- return 0;
-}
-
-static u64 noop_apic_icr_read(void)
-{
- return 0;
-}
-
-static int noop_phys_pkg_id(int cpuid_apic, int index_msb)
-{
- return 0;
-}
-
-static unsigned int noop_get_apic_id(unsigned long x)
-{
- return 0;
-}
-
-static int noop_probe(void)
-{
- /*
- * NOOP apic should not ever be
- * enabled via probe routine
- */
- return 0;
-}
-
-static int noop_apic_id_registered(void)
-{
- /*
- * if we would be really "pedantic"
- * we should pass read_apic_id() here
- * but since NOOP suppose APIC ID = 0
- * lets save a few cycles
- */
- return physid_isset(0, phys_cpu_present_map);
-}
+static u64 noop_apic_icr_read(void) { return 0; }
+static u32 noop_get_apic_id(u32 apicid) { return 0; }
+static void noop_apic_eoi(void) { }
static u32 noop_apic_read(u32 reg)
{
- WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic);
+ WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !apic_is_disabled);
return 0;
}
-static void noop_apic_write(u32 reg, u32 v)
-{
- WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic);
-}
-
-#ifdef CONFIG_X86_32
-static int noop_x86_32_early_logical_apicid(int cpu)
+static void noop_apic_write(u32 reg, u32 val)
{
- return BAD_APICID;
+ WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !apic_is_disabled);
}
-#endif
struct apic apic_noop __ro_after_init = {
.name = "noop",
- .probe = noop_probe,
- .acpi_madt_oem_check = NULL,
-
- .apic_id_valid = default_apic_id_valid,
- .apic_id_registered = noop_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- /* logical delivery broadcast to all CPUs: */
- .irq_dest_mode = 1,
+ .dest_mode_logical = true,
.disable_esr = 0,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = default_check_apicid_used,
-
- .init_apic_ldr = noop_init_apic_ldr,
-
- .ioapic_phys_id_map = default_ioapic_phys_id_map,
- .setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = physid_set_mask_of_physid,
-
- .check_phys_apicid_present = default_check_phys_apicid_present,
-
- .phys_pkg_id = noop_phys_pkg_id,
+ .max_apic_id = 0xFE,
.get_apic_id = noop_get_apic_id,
- .set_apic_id = NULL,
.calc_dest_apicid = apic_flat_calc_apicid,
@@ -144,17 +72,9 @@ struct apic apic_noop __ro_after_init = {
.wakeup_secondary_cpu = noop_wakeup_secondary_cpu,
- .inquire_remote_apic = NULL,
-
.read = noop_apic_read,
.write = noop_apic_write,
- .eoi_write = noop_apic_write,
+ .eoi = noop_apic_eoi,
.icr_read = noop_apic_icr_read,
.icr_write = noop_apic_icr_write,
- .wait_icr_idle = noop_apic_wait_icr_idle,
- .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle,
-
-#ifdef CONFIG_X86_32
- .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
-#endif
};
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 78778b54f904..5c5be2d58242 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -10,68 +10,43 @@
* Send feedback to <support@numascale.com>
*
*/
-
+#include <linux/types.h>
#include <linux/init.h>
+#include <linux/pgtable.h>
+#include <asm/msr.h>
#include <asm/numachip/numachip.h>
#include <asm/numachip/numachip_csr.h>
-#include <asm/ipi.h>
-#include <asm/apic_flat_64.h>
-#include <asm/pgtable.h>
-#include <asm/pci_x86.h>
+
+#include "local.h"
u8 numachip_system __read_mostly;
static const struct apic apic_numachip1;
static const struct apic apic_numachip2;
static void (*numachip_apic_icr_write)(int apicid, unsigned int val) __read_mostly;
-static unsigned int numachip1_get_apic_id(unsigned long x)
+static u32 numachip1_get_apic_id(u32 x)
{
unsigned long value;
unsigned int id = (x >> 24) & 0xff;
if (static_cpu_has(X86_FEATURE_NODEID_MSR)) {
- rdmsrl(MSR_FAM10H_NODE_ID, value);
+ rdmsrq(MSR_FAM10H_NODE_ID, value);
id |= (value << 2) & 0xff00;
}
return id;
}
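
Worked example for the extended ID above (hypothetical demo, not in the patch): bits 0-7 come from the xAPIC ID byte, bits 8-15 from the NODE_ID MSR value shifted left by 2.

    static u32 __maybe_unused numachip1_id_demo(void)
    {
    	u32 x = 0x12 << 24;	/* raw APIC_ID register image */
    	u64 node = 0x40;	/* sample MSR_FAM10H_NODE_ID value */

    	/* ((0x40 << 2) & 0xff00) == 0x100, so the result is 0x112 */
    	return ((x >> 24) & 0xff) | ((node << 2) & 0xff00);
    }
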
-static u32 numachip1_set_apic_id(unsigned int id)
-{
- return (id & 0xff) << 24;
-}
-
-static unsigned int numachip2_get_apic_id(unsigned long x)
+static u32 numachip2_get_apic_id(u32 x)
{
u64 mcfg;
- rdmsrl(MSR_FAM10H_MMIO_CONF_BASE, mcfg);
+ rdmsrq(MSR_FAM10H_MMIO_CONF_BASE, mcfg);
return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24);
}
-static u32 numachip2_set_apic_id(unsigned int id)
-{
- return id << 24;
-}
-
-static int numachip_apic_id_valid(u32 apicid)
-{
- /* Trust what bootloader passes in MADT */
- return 1;
-}
-
-static int numachip_apic_id_registered(void)
-{
- return 1;
-}
-
-static int numachip_phys_pkg_id(int initial_apic_id, int index_msb)
-{
- return initial_apic_id >> index_msb;
-}
-
static void numachip1_apic_icr_write(int apicid, unsigned int val)
{
write_lcsr(CSR_G3_EXT_IRQ_GEN, (apicid << 16) | val);
@@ -82,7 +57,7 @@ static void numachip2_apic_icr_write(int apicid, unsigned int val)
numachip2_write32_lcsr(NUMACHIP2_APIC_ICR, (apicid << 12) | val);
}
-static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+static int numachip_wakeup_secondary(u32 phys_apicid, unsigned long start_rip, unsigned int cpu)
{
numachip_apic_icr_write(phys_apicid, APIC_DM_INIT);
numachip_apic_icr_write(phys_apicid, APIC_DM_STARTUP |
@@ -172,15 +147,15 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
u64 val;
u32 nodes = 1;
- this_cpu_write(cpu_llc_id, node);
+ c->topo.llc_id = node;
/* Account for nodes per socket in multi-core-module processors */
- if (static_cpu_has(X86_FEATURE_NODEID_MSR)) {
- rdmsrl(MSR_FAM10H_NODE_ID, val);
+ if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) {
+ rdmsrq(MSR_FAM10H_NODE_ID, val);
nodes = ((val >> 3) & 7) + 1;
}
- c->phys_proc_id = node / nodes;
+ c->topo.pkg_id = node / nodes;
}
static int __init numachip_system_init(void)
@@ -228,42 +203,19 @@ static int numachip2_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
return 1;
}
-/* APIC IPIs are queued */
-static void numachip_apic_wait_icr_idle(void)
-{
-}
-
-/* APIC NMI IPIs are queued */
-static u32 numachip_safe_apic_wait_icr_idle(void)
-{
- return 0;
-}
-
static const struct apic apic_numachip1 __refconst = {
.name = "NumaConnect system",
.probe = numachip1_probe,
.acpi_madt_oem_check = numachip1_acpi_madt_oem_check,
- .apic_id_valid = numachip_apic_id_valid,
- .apic_id_registered = numachip_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 0, /* physical */
+ .dest_mode_logical = false,
.disable_esr = 0,
- .dest_logical = 0,
- .check_apicid_used = NULL,
-
- .init_apic_ldr = flat_init_apic_ldr,
- .ioapic_phys_id_map = NULL,
- .setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
- .phys_pkg_id = numachip_phys_pkg_id,
+ .max_apic_id = UINT_MAX,
.get_apic_id = numachip1_get_apic_id,
- .set_apic_id = numachip1_set_apic_id,
.calc_dest_apicid = apic_default_calc_apicid,
@@ -275,15 +227,12 @@ static const struct apic apic_numachip1 __refconst = {
.send_IPI_self = numachip_send_IPI_self,
.wakeup_secondary_cpu = numachip_wakeup_secondary,
- .inquire_remote_apic = NULL, /* REMRD not supported */
.read = native_apic_mem_read,
.write = native_apic_mem_write,
- .eoi_write = native_apic_mem_write,
+ .eoi = native_apic_mem_eoi,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
- .wait_icr_idle = numachip_apic_wait_icr_idle,
- .safe_wait_icr_idle = numachip_safe_apic_wait_icr_idle,
};
apic_driver(apic_numachip1);
@@ -292,27 +241,15 @@ static const struct apic apic_numachip2 __refconst = {
.name = "NumaConnect2 system",
.probe = numachip2_probe,
.acpi_madt_oem_check = numachip2_acpi_madt_oem_check,
- .apic_id_valid = numachip_apic_id_valid,
- .apic_id_registered = numachip_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 0, /* physical */
+ .dest_mode_logical = false,
.disable_esr = 0,
- .dest_logical = 0,
- .check_apicid_used = NULL,
-
- .init_apic_ldr = flat_init_apic_ldr,
- .ioapic_phys_id_map = NULL,
- .setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
- .phys_pkg_id = numachip_phys_pkg_id,
+ .max_apic_id = UINT_MAX,
.get_apic_id = numachip2_get_apic_id,
- .set_apic_id = numachip2_set_apic_id,
.calc_dest_apicid = apic_default_calc_apicid,
@@ -324,15 +261,12 @@ static const struct apic apic_numachip2 __refconst = {
.send_IPI_self = numachip_send_IPI_self,
.wakeup_secondary_cpu = numachip_wakeup_secondary,
- .inquire_remote_apic = NULL, /* REMRD not supported */
.read = native_apic_mem_read,
.write = native_apic_mem_write,
- .eoi_write = native_apic_mem_write,
+ .eoi = native_apic_mem_eoi,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
- .wait_icr_idle = numachip_apic_wait_icr_idle,
- .safe_wait_icr_idle = numachip_safe_apic_wait_icr_idle,
};
apic_driver(apic_numachip2);
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
deleted file mode 100644
index afee386ff711..000000000000
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ /dev/null
@@ -1,216 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * APIC driver for "bigsmp" xAPIC machines with more than 8 virtual CPUs.
- *
- * Drives the local APIC in "clustered mode".
- */
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/dmi.h>
-#include <linux/smp.h>
-
-#include <asm/apicdef.h>
-#include <asm/fixmap.h>
-#include <asm/mpspec.h>
-#include <asm/apic.h>
-#include <asm/ipi.h>
-
-static unsigned bigsmp_get_apic_id(unsigned long x)
-{
- return (x >> 24) & 0xFF;
-}
-
-static int bigsmp_apic_id_registered(void)
-{
- return 1;
-}
-
-static bool bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
-{
- return false;
-}
-
-static int bigsmp_early_logical_apicid(int cpu)
-{
- /* on bigsmp, logical apicid is the same as physical */
- return early_per_cpu(x86_cpu_to_apicid, cpu);
-}
-
-static inline unsigned long calculate_ldr(int cpu)
-{
- unsigned long val, id;
-
- val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
- id = per_cpu(x86_bios_cpu_apicid, cpu);
- val |= SET_APIC_LOGICAL_ID(id);
-
- return val;
-}
-
-/*
- * Set up the logical destination ID.
- *
- * Intel recommends to set DFR, LDR and TPR before enabling
- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
- */
-static void bigsmp_init_apic_ldr(void)
-{
- unsigned long val;
- int cpu = smp_processor_id();
-
- apic_write(APIC_DFR, APIC_DFR_FLAT);
- val = calculate_ldr(cpu);
- apic_write(APIC_LDR, val);
-}
-
-static void bigsmp_setup_apic_routing(void)
-{
- printk(KERN_INFO
- "Enabling APIC mode: Physflat. Using %d I/O APICs\n",
- nr_ioapics);
-}
-
-static int bigsmp_cpu_present_to_apicid(int mps_cpu)
-{
- if (mps_cpu < nr_cpu_ids)
- return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
-
- return BAD_APICID;
-}
-
-static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
-{
- /* For clustered we don't have a good way to do this yet - hack */
- physids_promote(0xFFL, retmap);
-}
-
-static int bigsmp_check_phys_apicid_present(int phys_apicid)
-{
- return 1;
-}
-
-static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
-{
- return cpuid_apic >> index_msb;
-}
-
-static void bigsmp_send_IPI_allbutself(int vector)
-{
- default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
-}
-
-static void bigsmp_send_IPI_all(int vector)
-{
- default_send_IPI_mask_sequence_phys(cpu_online_mask, vector);
-}
-
-static int dmi_bigsmp; /* can be set by dmi scanners */
-
-static int hp_ht_bigsmp(const struct dmi_system_id *d)
-{
- printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident);
- dmi_bigsmp = 1;
-
- return 0;
-}
-
-
-static const struct dmi_system_id bigsmp_dmi_table[] = {
- { hp_ht_bigsmp, "HP ProLiant DL760 G2",
- { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
- DMI_MATCH(DMI_BIOS_VERSION, "P44-"),
- }
- },
-
- { hp_ht_bigsmp, "HP ProLiant DL740",
- { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
- DMI_MATCH(DMI_BIOS_VERSION, "P47-"),
- }
- },
- { } /* NULL entry stops DMI scanning */
-};
-
-static int probe_bigsmp(void)
-{
- if (def_to_bigsmp)
- dmi_bigsmp = 1;
- else
- dmi_check_system(bigsmp_dmi_table);
-
- return dmi_bigsmp;
-}
-
-static struct apic apic_bigsmp __ro_after_init = {
-
- .name = "bigsmp",
- .probe = probe_bigsmp,
- .acpi_madt_oem_check = NULL,
- .apic_id_valid = default_apic_id_valid,
- .apic_id_registered = bigsmp_apic_id_registered,
-
- .irq_delivery_mode = dest_Fixed,
- /* phys delivery to target CPU: */
- .irq_dest_mode = 0,
-
- .disable_esr = 1,
- .dest_logical = 0,
- .check_apicid_used = bigsmp_check_apicid_used,
-
- .init_apic_ldr = bigsmp_init_apic_ldr,
-
- .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
- .setup_apic_routing = bigsmp_setup_apic_routing,
- .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
- .apicid_to_cpu_present = physid_set_mask_of_physid,
- .check_phys_apicid_present = bigsmp_check_phys_apicid_present,
- .phys_pkg_id = bigsmp_phys_pkg_id,
-
- .get_apic_id = bigsmp_get_apic_id,
- .set_apic_id = NULL,
-
- .calc_dest_apicid = apic_default_calc_apicid,
-
- .send_IPI = default_send_IPI_single_phys,
- .send_IPI_mask = default_send_IPI_mask_sequence_phys,
- .send_IPI_mask_allbutself = NULL,
- .send_IPI_allbutself = bigsmp_send_IPI_allbutself,
- .send_IPI_all = bigsmp_send_IPI_all,
- .send_IPI_self = default_send_IPI_self,
-
- .inquire_remote_apic = default_inquire_remote_apic,
-
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .eoi_write = native_apic_mem_write,
- .icr_read = native_apic_icr_read,
- .icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
-
- .x86_32_early_logical_apicid = bigsmp_early_logical_apicid,
-};
-
-void __init generic_bigsmp_probe(void)
-{
- unsigned int cpu;
-
- if (!probe_bigsmp())
- return;
-
- apic = &apic_bigsmp;
-
- for_each_possible_cpu(cpu) {
- if (early_per_cpu(x86_cpu_to_logical_apicid,
- cpu) == BAD_APICID)
- continue;
- early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
- bigsmp_early_logical_apicid(cpu);
- }
-
- pr_info("Overriding APIC driver with %s\n", apic_bigsmp.name);
-}
-
-apic_driver(apic_bigsmp);
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index d1fc62a67320..45af535c44a0 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -9,6 +9,7 @@
* Bits copied from original nmi.c file
*
*/
+#include <linux/thread_info.h>
#include <asm/apic.h>
#include <asm/nmi.h>
@@ -20,6 +21,8 @@
#include <linux/init.h>
#include <linux/delay.h>
+#include "local.h"
+
#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
u64 hw_nmi_get_sample_period(int watchdog_thresh)
{
@@ -30,12 +33,12 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh)
#ifdef arch_trigger_cpumask_backtrace
static void nmi_raise_cpu_backtrace(cpumask_t *mask)
{
- apic->send_IPI_mask(mask, NMI_VECTOR);
+ __apic_send_IPI_mask(mask, NMI_VECTOR);
}
-void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
+void arch_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu)
{
- nmi_trigger_cpumask_backtrace(mask, exclude_self,
+ nmi_trigger_cpumask_backtrace(mask, exclude_cpu,
nmi_raise_cpu_backtrace);
}
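
Usage sketch for the reworked hook (hypothetical caller; the generic backtrace helpers in linux/nmi.h wrap it the same way): the second argument is now the CPU to exclude rather than a bool.

    static void __maybe_unused backtrace_all_but_self(void)
    {
    	/* NMI every online CPU except the calling one */
    	arch_trigger_cpumask_backtrace(cpu_online_mask, smp_processor_id());
    }
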
diff --git a/arch/x86/kernel/apic/init.c b/arch/x86/kernel/apic/init.c
new file mode 100644
index 000000000000..821e2e536f19
--- /dev/null
+++ b/arch/x86/kernel/apic/init.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define pr_fmt(fmt) "APIC: " fmt
+
+#include <asm/apic.h>
+
+#include "local.h"
+
+/*
+ * Use DEFINE_STATIC_CALL_NULL() to avoid having to provide stub functions
+ * for each callback. The callbacks are set up during boot and all except
+ * wait_icr_idle() must be initialized before use. The IPI wrappers
+ * use static_call() and not static_call_cond() to catch any failures.
+ */
+#define DEFINE_APIC_CALL(__cb) \
+ DEFINE_STATIC_CALL_NULL(apic_call_##__cb, *apic->__cb)
+
+DEFINE_APIC_CALL(eoi);
+DEFINE_APIC_CALL(native_eoi);
+DEFINE_APIC_CALL(icr_read);
+DEFINE_APIC_CALL(icr_write);
+DEFINE_APIC_CALL(read);
+DEFINE_APIC_CALL(send_IPI);
+DEFINE_APIC_CALL(send_IPI_mask);
+DEFINE_APIC_CALL(send_IPI_mask_allbutself);
+DEFINE_APIC_CALL(send_IPI_allbutself);
+DEFINE_APIC_CALL(send_IPI_all);
+DEFINE_APIC_CALL(send_IPI_self);
+DEFINE_APIC_CALL(wait_icr_idle);
+DEFINE_APIC_CALL(wakeup_secondary_cpu);
+DEFINE_APIC_CALL(wakeup_secondary_cpu_64);
+DEFINE_APIC_CALL(write);
+
+EXPORT_STATIC_CALL_TRAMP_GPL(apic_call_send_IPI_mask);
+EXPORT_STATIC_CALL_TRAMP_GPL(apic_call_send_IPI_self);
+
+/* The container for function call overrides */
+struct apic_override __x86_apic_override __initdata;
+
+#define apply_override(__cb) \
+ if (__x86_apic_override.__cb) \
+ apic->__cb = __x86_apic_override.__cb
+
+static __init void restore_override_callbacks(void)
+{
+ apply_override(eoi);
+ apply_override(native_eoi);
+ apply_override(write);
+ apply_override(read);
+ apply_override(send_IPI);
+ apply_override(send_IPI_mask);
+ apply_override(send_IPI_mask_allbutself);
+ apply_override(send_IPI_allbutself);
+ apply_override(send_IPI_all);
+ apply_override(send_IPI_self);
+ apply_override(icr_read);
+ apply_override(icr_write);
+ apply_override(wakeup_secondary_cpu);
+ apply_override(wakeup_secondary_cpu_64);
+}
+
+#define update_call(__cb) \
+ static_call_update(apic_call_##__cb, *apic->__cb)
+
+static __init void update_static_calls(void)
+{
+ update_call(eoi);
+ update_call(native_eoi);
+ update_call(write);
+ update_call(read);
+ update_call(send_IPI);
+ update_call(send_IPI_mask);
+ update_call(send_IPI_mask_allbutself);
+ update_call(send_IPI_allbutself);
+ update_call(send_IPI_all);
+ update_call(send_IPI_self);
+ update_call(icr_read);
+ update_call(icr_write);
+ update_call(wait_icr_idle);
+ update_call(wakeup_secondary_cpu);
+ update_call(wakeup_secondary_cpu_64);
+}
+
+void __init apic_setup_apic_calls(void)
+{
+ /* Ensure that the default APIC has native_eoi populated */
+ apic->native_eoi = apic->eoi;
+ update_static_calls();
+ pr_info("Static calls initialized\n");
+}
+
+void __init apic_install_driver(struct apic *driver)
+{
+ if (apic == driver)
+ return;
+
+ apic = driver;
+
+ if (IS_ENABLED(CONFIG_X86_X2APIC) && apic->x2apic_set_max_apicid)
+ apic->max_apic_id = x2apic_max_apicid;
+
+ /* Copy the original eoi() callback as KVM/HyperV might overwrite it */
+ if (!apic->native_eoi)
+ apic->native_eoi = apic->eoi;
+
+ /* Apply any already installed callback overrides */
+ restore_override_callbacks();
+ update_static_calls();
+
+ pr_info("Switched APIC routing to: %s\n", driver->name);
+}
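
The consumers of these static calls live in asm/apic.h. Paraphrased, each IPI wrapper is a thin static_call() trampoline that apic_install_driver() retargets via update_static_calls(), so hot paths avoid the indirect apic->callback load:

    static __always_inline void apic_send_IPI_mask(const struct cpumask *mask,
    					       int vector)
    {
    	static_call(apic_call_send_IPI_mask)(mask, vector);
    }
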
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 2953bbf05c08..28f934f05a85 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -48,6 +48,7 @@
#include <linux/jiffies.h> /* time_after() */
#include <linux/slab.h>
#include <linux/memblock.h>
+#include <linux/msi.h>
#include <asm/irqdomain.h>
#include <asm/io.h>
@@ -58,12 +59,14 @@
#include <asm/acpi.h>
#include <asm/dma.h>
#include <asm/timer.h>
+#include <asm/time.h>
#include <asm/i8259.h>
#include <asm/setup.h>
#include <asm/irq_remapping.h>
#include <asm/hw_irq.h>
-
#include <asm/apic.h>
+#include <asm/pgtable.h>
+#include <asm/x86_init.h>
#define for_each_ioapic(idx) \
for ((idx) = 0; (idx) < nr_ioapics; (idx)++)
@@ -83,17 +86,17 @@ static unsigned int ioapic_dynirq_base;
static int ioapic_initialized;
struct irq_pin_list {
- struct list_head list;
- int apic, pin;
+ struct list_head list;
+ int apic, pin;
};
struct mp_chip_data {
- struct list_head irq_2_pin;
- struct IO_APIC_route_entry entry;
- int trigger;
- int polarity;
- u32 count;
- bool isa_irq;
+ struct list_head irq_2_pin;
+ struct IO_APIC_route_entry entry;
+ bool is_level;
+ bool active_low;
+ bool isa_irq;
+ u32 count;
};
struct mp_ioapic_gsi {
@@ -102,21 +105,17 @@ struct mp_ioapic_gsi {
};
static struct ioapic {
- /*
- * # of IRQ routing registers
- */
- int nr_registers;
- /*
- * Saved state during suspend/resume, or while enabling intr-remap.
- */
- struct IO_APIC_route_entry *saved_registers;
+ /* # of IRQ routing registers */
+ int nr_registers;
+ /* Saved state during suspend/resume, or while enabling intr-remap. */
+ struct IO_APIC_route_entry *saved_registers;
/* I/O APIC config */
- struct mpc_ioapic mp_config;
+ struct mpc_ioapic mp_config;
/* IO APIC gsi routing info */
- struct mp_ioapic_gsi gsi_config;
- struct ioapic_domain_cfg irqdomain_cfg;
- struct irq_domain *irqdomain;
- struct resource *iomem_res;
+ struct mp_ioapic_gsi gsi_config;
+ struct ioapic_domain_cfg irqdomain_cfg;
+ struct irq_domain *irqdomain;
+ struct resource *iomem_res;
} ioapics[MAX_IO_APICS];
#define mpc_ioapic_ver(ioapic_idx) ioapics[ioapic_idx].mp_config.apicver
@@ -153,19 +152,6 @@ static inline bool mp_is_legacy_irq(int irq)
return irq >= 0 && irq < nr_legacy_irqs();
}
-/*
- * Initialize all legacy IRQs and all pins on the first IOAPIC
- * if we have legacy interrupt controller. Kernel boot option "pirq="
- * may rely on non-legacy pins on the first IOAPIC.
- */
-static inline int mp_init_irq_at_boot(int ioapic, int irq)
-{
- if (!nr_legacy_irqs())
- return 0;
-
- return ioapic == 0 || mp_is_legacy_irq(irq);
-}
-
static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic)
{
return ioapics[ioapic].irqdomain;
@@ -188,7 +174,7 @@ int mp_bus_id_to_type[MAX_MP_BUSSES];
DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
-int skip_ioapic_setup;
+bool ioapic_is_disabled __ro_after_init;
/**
* disable_ioapic_support() - disables ioapic support at runtime
@@ -199,7 +185,7 @@ void disable_ioapic_support(void)
noioapicquirk = 1;
noioapicreroute = -1;
#endif
- skip_ioapic_setup = 1;
+ ioapic_is_disabled = true;
}
static int __init parse_noapic(char *str)
@@ -210,15 +196,14 @@ static int __init parse_noapic(char *str)
}
early_param("noapic", parse_noapic);
-/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
+/* Will be called in mpparse/ACPI code for saving IRQ info */
void mp_save_irq(struct mpc_intsrc *m)
{
int i;
- apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
- " IRQ %02x, APIC ID %x, APIC INT %02x\n",
- m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
- m->srcbusirq, m->dstapic, m->dstirq);
+ apic_pr_verbose("Int: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC INT %02x\n",
+ m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
+ m->srcbusirq, m->dstapic, m->dstirq);
for (i = 0; i < mp_irq_entries; i++) {
if (!memcmp(&mp_irqs[i], m, sizeof(*m)))
@@ -279,12 +264,14 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
+
writel(vector, &io_apic->eoi);
}
unsigned int native_io_apic_read(unsigned int apic, unsigned int reg)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
+
writel(reg, &io_apic->index);
return readl(&io_apic->data);
}
@@ -298,31 +285,20 @@ static void io_apic_write(unsigned int apic, unsigned int reg,
writel(value, &io_apic->data);
}
-union entry_union {
- struct { u32 w1, w2; };
- struct IO_APIC_route_entry entry;
-};
-
static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin)
{
- union entry_union eu;
+ struct IO_APIC_route_entry entry;
- eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
- eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
+ entry.w1 = io_apic_read(apic, 0x10 + 2 * pin);
+ entry.w2 = io_apic_read(apic, 0x11 + 2 * pin);
- return eu.entry;
+ return entry;
}
static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
{
- union entry_union eu;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- eu.entry = __ioapic_read_entry(apic, pin);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
- return eu.entry;
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ return __ioapic_read_entry(apic, pin);
}
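
The w1/w2 accesses above rely on the fixed redirection-table layout: each pin owns two 32-bit registers starting at index 0x10. A sketch of the indexing (ioapic_redir_reg() is a hypothetical helper):

    static inline unsigned int ioapic_redir_reg(int pin, bool high_word)
    {
    	/* Pin 5: 0x1a holds w1 (vector/mask/trigger), 0x1b holds w2 (dest) */
    	return 0x10 + 2 * pin + (high_word ? 1 : 0);
    }
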
/*
@@ -333,20 +309,14 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
*/
static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
- union entry_union eu = {{0, 0}};
-
- eu.entry = e;
- io_apic_write(apic, 0x11 + 2*pin, eu.w2);
- io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+ io_apic_write(apic, 0x11 + 2*pin, e.w2);
+ io_apic_write(apic, 0x10 + 2*pin, e.w1);
}
static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
__ioapic_write_entry(apic, pin, e);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
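
The guard() conversions above use the scoped-lock helpers from linux/cleanup.h: the lock is released automatically at scope exit, so early returns need no unlock path. For comparison, the open-coded equivalent of the new ioapic_write_entry() is exactly what the patch removes:

    static void ioapic_write_entry_open_coded(int apic, int pin,
    					  struct IO_APIC_route_entry e)
    {
    	unsigned long flags;

    	raw_spin_lock_irqsave(&ioapic_lock, flags);
    	__ioapic_write_entry(apic, pin, e);
    	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
    }
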
/*
@@ -356,13 +326,11 @@ static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
*/
static void ioapic_mask_entry(int apic, int pin)
{
- unsigned long flags;
- union entry_union eu = { .entry.mask = IOAPIC_MASKED };
+ struct IO_APIC_route_entry e = { .masked = true };
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x10 + 2*pin, eu.w1);
- io_apic_write(apic, 0x11 + 2*pin, eu.w2);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ io_apic_write(apic, 0x10 + 2*pin, e.w1);
+ io_apic_write(apic, 0x11 + 2*pin, e.w2);
}
/*
@@ -370,95 +338,61 @@ static void ioapic_mask_entry(int apic, int pin)
* shared ISA-space IRQs, so we have to support them. We are super
* fast in the common case, and fast for shared ISA-space IRQs.
*/
-static int __add_pin_to_irq_node(struct mp_chip_data *data,
- int node, int apic, int pin)
+static bool add_pin_to_irq_node(struct mp_chip_data *data, int node, int apic, int pin)
{
struct irq_pin_list *entry;
- /* don't allow duplicates */
- for_each_irq_pin(entry, data->irq_2_pin)
+ /* Don't allow duplicates */
+ for_each_irq_pin(entry, data->irq_2_pin) {
if (entry->apic == apic && entry->pin == pin)
- return 0;
+ return true;
+ }
entry = kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node);
if (!entry) {
- pr_err("can not alloc irq_pin_list (%d,%d,%d)\n",
- node, apic, pin);
- return -ENOMEM;
+ pr_err("Cannot allocate irq_pin_list (%d,%d,%d)\n", node, apic, pin);
+ return false;
}
+
entry->apic = apic;
entry->pin = pin;
list_add_tail(&entry->list, &data->irq_2_pin);
-
- return 0;
+ return true;
}
static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin)
{
struct irq_pin_list *tmp, *entry;
- list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list)
+ list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list) {
if (entry->apic == apic && entry->pin == pin) {
list_del(&entry->list);
kfree(entry);
return;
}
-}
-
-static void add_pin_to_irq_node(struct mp_chip_data *data,
- int node, int apic, int pin)
-{
- if (__add_pin_to_irq_node(data, node, apic, pin))
- panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
-}
-
-/*
- * Reroute an IRQ to a different pin.
- */
-static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node,
- int oldapic, int oldpin,
- int newapic, int newpin)
-{
- struct irq_pin_list *entry;
-
- for_each_irq_pin(entry, data->irq_2_pin) {
- if (entry->apic == oldapic && entry->pin == oldpin) {
- entry->apic = newapic;
- entry->pin = newpin;
- /* every one is different, right? */
- return;
- }
}
-
- /* old apic/pin didn't exist, so just add new ones */
- add_pin_to_irq_node(data, node, newapic, newpin);
}
-static void io_apic_modify_irq(struct mp_chip_data *data,
- int mask_and, int mask_or,
+static void io_apic_modify_irq(struct mp_chip_data *data, bool masked,
void (*final)(struct irq_pin_list *entry))
{
- union entry_union eu;
struct irq_pin_list *entry;
- eu.entry = data->entry;
- eu.w1 &= mask_and;
- eu.w1 |= mask_or;
- data->entry = eu.entry;
+ data->entry.masked = masked;
for_each_irq_pin(entry, data->irq_2_pin) {
- io_apic_write(entry->apic, 0x10 + 2 * entry->pin, eu.w1);
+ io_apic_write(entry->apic, 0x10 + 2 * entry->pin, data->entry.w1);
if (final)
final(entry);
}
}
+/*
+ * Synchronize the IO-APIC and the CPU by doing a dummy read from the
+ * IO-APIC
+ */
static void io_apic_sync(struct irq_pin_list *entry)
{
- /*
- * Synchronize the IO-APIC and the CPU by doing
- * a dummy read from the IO-APIC
- */
struct io_apic __iomem *io_apic;
io_apic = io_apic_base(entry->apic);
@@ -468,26 +402,22 @@ static void io_apic_sync(struct irq_pin_list *entry)
static void mask_ioapic_irq(struct irq_data *irq_data)
{
struct mp_chip_data *data = irq_data->chip_data;
- unsigned long flags;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_modify_irq(data, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ io_apic_modify_irq(data, true, &io_apic_sync);
}
static void __unmask_ioapic(struct mp_chip_data *data)
{
- io_apic_modify_irq(data, ~IO_APIC_REDIR_MASKED, 0, NULL);
+ io_apic_modify_irq(data, false, NULL);
}
static void unmask_ioapic_irq(struct irq_data *irq_data)
{
struct mp_chip_data *data = irq_data->chip_data;
- unsigned long flags;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
__unmask_ioapic(data);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
/*
@@ -515,30 +445,24 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector)
entry = entry1 = __ioapic_read_entry(apic, pin);
- /*
- * Mask the entry and change the trigger mode to edge.
- */
- entry1.mask = IOAPIC_MASKED;
- entry1.trigger = IOAPIC_EDGE;
+ /* Mask the entry and change the trigger mode to edge. */
+ entry1.masked = true;
+ entry1.is_level = false;
__ioapic_write_entry(apic, pin, entry1);
- /*
- * Restore the previous level triggered entry.
- */
+ /* Restore the previous level triggered entry. */
__ioapic_write_entry(apic, pin, entry);
}
}
static void eoi_ioapic_pin(int vector, struct mp_chip_data *data)
{
- unsigned long flags;
struct irq_pin_list *entry;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
for_each_irq_pin(entry, data->irq_2_pin)
__eoi_ioapic_pin(entry->apic, entry->pin, vector);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
@@ -547,34 +471,31 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
/* Check delivery_mode to be sure we're not clearing an SMI pin */
entry = ioapic_read_entry(apic, pin);
- if (entry.delivery_mode == dest_SMI)
+ if (entry.delivery_mode == APIC_DELIVERY_MODE_SMI)
return;
/*
* Make sure the entry is masked and re-read the contents to check
* if it is a level triggered pin and if the remote-IRR is set.
*/
- if (entry.mask == IOAPIC_UNMASKED) {
- entry.mask = IOAPIC_MASKED;
+ if (!entry.masked) {
+ entry.masked = true;
ioapic_write_entry(apic, pin, entry);
entry = ioapic_read_entry(apic, pin);
}
if (entry.irr) {
- unsigned long flags;
-
/*
* Make sure the trigger mode is set to level. Explicit EOI
* doesn't clear the remote-IRR if the trigger mode is not
* set to level.
*/
- if (entry.trigger == IOAPIC_EDGE) {
- entry.trigger = IOAPIC_LEVEL;
+ if (!entry.is_level) {
+ entry.is_level = true;
ioapic_write_entry(apic, pin, entry);
}
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
__eoi_ioapic_pin(apic, pin, entry.vector);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
/*
@@ -609,28 +530,23 @@ static int pirq_entries[MAX_PIRQS] = {
static int __init ioapic_pirq_setup(char *str)
{
- int i, max;
- int ints[MAX_PIRQS+1];
+ int i, max, ints[MAX_PIRQS+1];
get_options(str, ARRAY_SIZE(ints), ints);
- apic_printk(APIC_VERBOSE, KERN_INFO
- "PIRQ redirection, working around broken MP-BIOS.\n");
+ apic_pr_verbose("PIRQ redirection, working around broken MP-BIOS.\n");
+
max = MAX_PIRQS;
if (ints[0] < MAX_PIRQS)
max = ints[0];
for (i = 0; i < max; i++) {
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
- /*
- * PIRQs are mapped upside down, usually.
- */
+ apic_pr_verbose("... PIRQ%d -> IRQ %d\n", i, ints[i + 1]);
+ /* PIRQs are mapped upside down, usually */
pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
}
return 1;
}
-
__setup("pirq=", ioapic_pirq_setup);
#endif /* CONFIG_X86_32 */
@@ -649,8 +565,7 @@ int save_ioapic_entries(void)
}
for_each_pin(apic, pin)
- ioapics[apic].saved_registers[pin] =
- ioapic_read_entry(apic, pin);
+ ioapics[apic].saved_registers[pin] = ioapic_read_entry(apic, pin);
}
return err;
@@ -671,8 +586,8 @@ void mask_ioapic_entries(void)
struct IO_APIC_route_entry entry;
entry = ioapics[apic].saved_registers[pin];
- if (entry.mask == IOAPIC_UNMASKED) {
- entry.mask = IOAPIC_MASKED;
+ if (!entry.masked) {
+ entry.masked = true;
ioapic_write_entry(apic, pin, entry);
}
}
@@ -691,8 +606,7 @@ int restore_ioapic_entries(void)
continue;
for_each_pin(apic, pin)
- ioapic_write_entry(apic, pin,
- ioapics[apic].saved_registers[pin]);
+ ioapic_write_entry(apic, pin, ioapics[apic].saved_registers[pin]);
}
return 0;
}
@@ -704,12 +618,13 @@ static int find_irq_entry(int ioapic_idx, int pin, int type)
{
int i;
- for (i = 0; i < mp_irq_entries; i++)
+ for (i = 0; i < mp_irq_entries; i++) {
if (mp_irqs[i].irqtype == type &&
(mp_irqs[i].dstapic == mpc_ioapic_id(ioapic_idx) ||
mp_irqs[i].dstapic == MP_APIC_ALL) &&
mp_irqs[i].dstirq == pin)
return i;
+ }
return -1;
}
@@ -724,10 +639,8 @@ static int __init find_isa_irq_pin(int irq, int type)
for (i = 0; i < mp_irq_entries; i++) {
int lbus = mp_irqs[i].srcbus;
- if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].irqtype == type) &&
+ if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].irqtype == type) &&
(mp_irqs[i].srcbusirq == irq))
-
return mp_irqs[i].dstirq;
}
return -1;
@@ -740,8 +653,7 @@ static int __init find_isa_irq_apic(int irq, int type)
for (i = 0; i < mp_irq_entries; i++) {
int lbus = mp_irqs[i].srcbus;
- if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].irqtype == type) &&
+ if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].irqtype == type) &&
(mp_irqs[i].srcbusirq == irq))
break;
}
@@ -749,52 +661,16 @@ static int __init find_isa_irq_apic(int irq, int type)
if (i < mp_irq_entries) {
int ioapic_idx;
- for_each_ioapic(ioapic_idx)
+ for_each_ioapic(ioapic_idx) {
if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic)
return ioapic_idx;
+ }
}
return -1;
}
-#ifdef CONFIG_EISA
-/*
- * EISA Edge/Level control register, ELCR
- */
-static int EISA_ELCR(unsigned int irq)
-{
- if (irq < nr_legacy_irqs()) {
- unsigned int port = 0x4d0 + (irq >> 3);
- return (inb(port) >> (irq & 7)) & 1;
- }
- apic_printk(APIC_VERBOSE, KERN_INFO
- "Broken MPtable reports ISA irq %d\n", irq);
- return 0;
-}
-
-#endif
-
-/* ISA interrupts are always active high edge triggered,
- * when listed as conforming in the MP table. */
-
-#define default_ISA_trigger(idx) (IOAPIC_EDGE)
-#define default_ISA_polarity(idx) (IOAPIC_POL_HIGH)
-
-/* EISA interrupts are always polarity zero and can be edge or level
- * trigger depending on the ELCR value. If an interrupt is listed as
- * EISA conforming in the MP table, that means its trigger type must
- * be read in from the ELCR */
-
-#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq))
-#define default_EISA_polarity(idx) default_ISA_polarity(idx)
-
-/* PCI interrupts are always active low level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_PCI_trigger(idx) (IOAPIC_LEVEL)
-#define default_PCI_polarity(idx) (IOAPIC_POL_LOW)
-
-static int irq_polarity(int idx)
+static bool irq_active_low(int idx)
{
int bus = mp_irqs[idx].srcbus;
@@ -803,125 +679,175 @@ static int irq_polarity(int idx)
*/
switch (mp_irqs[idx].irqflag & MP_IRQPOL_MASK) {
case MP_IRQPOL_DEFAULT:
- /* conforms to spec, ie. bus-type dependent polarity */
- if (test_bit(bus, mp_bus_not_pci))
- return default_ISA_polarity(idx);
- else
- return default_PCI_polarity(idx);
+ /*
+ * Conforms to spec, ie. bus-type dependent polarity. PCI
+ * defaults to low active. [E]ISA defaults to high active.
+ */
+ return !test_bit(bus, mp_bus_not_pci);
case MP_IRQPOL_ACTIVE_HIGH:
- return IOAPIC_POL_HIGH;
+ return false;
case MP_IRQPOL_RESERVED:
pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n");
+ fallthrough;
case MP_IRQPOL_ACTIVE_LOW:
	default: /* Pointless default required due to gcc stupidity */
- return IOAPIC_POL_LOW;
+ return true;
}
}
#ifdef CONFIG_EISA
-static int eisa_irq_trigger(int idx, int bus, int trigger)
+/*
+ * EISA Edge/Level control register, ELCR
+ */
+static bool EISA_ELCR(unsigned int irq)
+{
+ if (irq < nr_legacy_irqs()) {
+ unsigned int port = PIC_ELCR1 + (irq >> 3);
+ return (inb(port) >> (irq & 7)) & 1;
+ }
+ apic_pr_verbose("Broken MPtable reports ISA irq %d\n", irq);
+ return false;
+}
+
+/*
+ * EISA interrupts are always active high and can be edge or level
+ * triggered depending on the ELCR value. If an interrupt is listed as
+ * EISA conforming in the MP table, that means its trigger type must be
+ * read in from the ELCR.
+ */
+static bool eisa_irq_is_level(int idx, int bus, bool level)
{
switch (mp_bus_id_to_type[bus]) {
case MP_BUS_PCI:
case MP_BUS_ISA:
- return trigger;
+ return level;
case MP_BUS_EISA:
- return default_EISA_trigger(idx);
+ return EISA_ELCR(mp_irqs[idx].srcbusirq);
}
pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus);
- return IOAPIC_LEVEL;
+ return true;
}
#else
-static inline int eisa_irq_trigger(int idx, int bus, int trigger)
+static inline bool eisa_irq_is_level(int idx, int bus, bool level)
{
- return trigger;
+ return level;
}
#endif
-static int irq_trigger(int idx)
+static bool irq_is_level(int idx)
{
int bus = mp_irqs[idx].srcbus;
- int trigger;
+ bool level;
/*
* Determine IRQ trigger mode (edge or level sensitive):
*/
switch (mp_irqs[idx].irqflag & MP_IRQTRIG_MASK) {
case MP_IRQTRIG_DEFAULT:
- /* conforms to spec, ie. bus-type dependent trigger mode */
- if (test_bit(bus, mp_bus_not_pci))
- trigger = default_ISA_trigger(idx);
- else
- trigger = default_PCI_trigger(idx);
+ /*
+ * Conforms to spec, ie. bus-type dependent trigger
+ * mode. PCI defaults to level, ISA to edge.
+ */
+ level = !test_bit(bus, mp_bus_not_pci);
/* Take EISA into account */
- return eisa_irq_trigger(idx, bus, trigger);
+ return eisa_irq_is_level(idx, bus, level);
case MP_IRQTRIG_EDGE:
- return IOAPIC_EDGE;
+ return false;
case MP_IRQTRIG_RESERVED:
pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n");
+ fallthrough;
case MP_IRQTRIG_LEVEL:
	default: /* Pointless default required due to gcc stupidity */
- return IOAPIC_LEVEL;
+ return true;
}
}
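irq_active_low() and irq_is_level() now return plain bools decoded from the MP-table irqflag word. A standalone sketch of the non-default decode, with the flag encoding assumed from asm/mpspec_def.h (polarity in bits 0-1, trigger in bits 2-3) — verify against that header:

#include <stdbool.h>
#include <stdio.h>

/* Assumed from asm/mpspec_def.h */
#define MP_IRQPOL_MASK		0x3
#define MP_IRQPOL_ACTIVE_LOW	0x3
#define MP_IRQTRIG_MASK		0xc
#define MP_IRQTRIG_LEVEL	0xc

int main(void)
{
	unsigned short irqflag = MP_IRQPOL_ACTIVE_LOW | MP_IRQTRIG_LEVEL;

	/* The DEFAULT/RESERVED cases handled above are omitted here */
	bool active_low = (irqflag & MP_IRQPOL_MASK) == MP_IRQPOL_ACTIVE_LOW;
	bool is_level   = (irqflag & MP_IRQTRIG_MASK) == MP_IRQTRIG_LEVEL;

	printf("active_low=%d is_level=%d\n", active_low, is_level);
	return 0;
}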
+static int __acpi_get_override_irq(u32 gsi, bool *trigger, bool *polarity)
+{
+ int ioapic, pin, idx;
+
+ if (ioapic_is_disabled)
+ return -1;
+
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ return -1;
+
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+ if (pin < 0)
+ return -1;
+
+ idx = find_irq_entry(ioapic, pin, mp_INT);
+ if (idx < 0)
+ return -1;
+
+ *trigger = irq_is_level(idx);
+ *polarity = irq_active_low(idx);
+ return 0;
+}
+
+#ifdef CONFIG_ACPI
+int acpi_get_override_irq(u32 gsi, int *is_level, int *active_low)
+{
+ *is_level = *active_low = 0;
+ return __acpi_get_override_irq(gsi, (bool *)is_level,
+ (bool *)active_low);
+}
+#endif
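The CONFIG_ACPI wrapper pre-zeroes both ints before handing them to the bool-typed helper through a cast. Since a bool store touches a single byte on the usual ABIs where _Bool is one byte, the surrounding bytes stay zero and the int reads back as a plain truth value. A userspace demonstration of why the zeroing is the load-bearing part; compile with -fno-strict-aliasing to match the kernel's rules, which make this cross-typed access well-defined in practice:

#include <stdbool.h>
#include <stdio.h>

static void set_flag(bool *b)
{
	*b = true;	/* writes exactly one byte */
}

int main(void)
{
	int is_level = 0;	/* without this zeroing, 3 bytes would be garbage */

	set_flag((bool *)&is_level);
	/* Nonzero regardless of endianness, because the other bytes stayed 0 */
	printf("is_level = %d -> %s\n", is_level, is_level ? "level" : "edge");
	return 0;
}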
+
void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node,
int trigger, int polarity)
{
init_irq_alloc_info(info, NULL);
info->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
- info->ioapic_node = node;
- info->ioapic_trigger = trigger;
- info->ioapic_polarity = polarity;
- info->ioapic_valid = 1;
+ info->ioapic.node = node;
+ info->ioapic.is_level = trigger;
+ info->ioapic.active_low = polarity;
+ info->ioapic.valid = 1;
}
-#ifndef CONFIG_ACPI
-int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity);
-#endif
-
static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst,
struct irq_alloc_info *src,
u32 gsi, int ioapic_idx, int pin)
{
- int trigger, polarity;
+ bool level, pol_low;
copy_irq_alloc_info(dst, src);
dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
- dst->ioapic_id = mpc_ioapic_id(ioapic_idx);
- dst->ioapic_pin = pin;
- dst->ioapic_valid = 1;
- if (src && src->ioapic_valid) {
- dst->ioapic_node = src->ioapic_node;
- dst->ioapic_trigger = src->ioapic_trigger;
- dst->ioapic_polarity = src->ioapic_polarity;
+ dst->devid = mpc_ioapic_id(ioapic_idx);
+ dst->ioapic.pin = pin;
+ dst->ioapic.valid = 1;
+ if (src && src->ioapic.valid) {
+ dst->ioapic.node = src->ioapic.node;
+ dst->ioapic.is_level = src->ioapic.is_level;
+ dst->ioapic.active_low = src->ioapic.active_low;
} else {
- dst->ioapic_node = NUMA_NO_NODE;
- if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) {
- dst->ioapic_trigger = trigger;
- dst->ioapic_polarity = polarity;
+ dst->ioapic.node = NUMA_NO_NODE;
+ if (__acpi_get_override_irq(gsi, &level, &pol_low) >= 0) {
+ dst->ioapic.is_level = level;
+ dst->ioapic.active_low = pol_low;
} else {
/*
* PCI interrupts are always active low level
* triggered.
*/
- dst->ioapic_trigger = IOAPIC_LEVEL;
- dst->ioapic_polarity = IOAPIC_POL_LOW;
+ dst->ioapic.is_level = true;
+ dst->ioapic.active_low = true;
}
}
}
static int ioapic_alloc_attr_node(struct irq_alloc_info *info)
{
- return (info && info->ioapic_valid) ? info->ioapic_node : NUMA_NO_NODE;
+ return (info && info->ioapic.valid) ? info->ioapic.node : NUMA_NO_NODE;
}
-static void mp_register_handler(unsigned int irq, unsigned long trigger)
+static void mp_register_handler(unsigned int irq, bool level)
{
irq_flow_handler_t hdl;
bool fasteoi;
- if (trigger) {
+ if (level) {
irq_set_status_flags(irq, IRQ_LEVEL);
fasteoi = true;
} else {
@@ -939,26 +865,26 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info)
/*
* setup_IO_APIC_irqs() programs all legacy IRQs with default trigger
- * and polarity attirbutes. So allow the first user to reprogram the
+ * and polarity attributes. So allow the first user to reprogram the
* pin with real trigger and polarity attributes.
*/
if (irq < nr_legacy_irqs() && data->count == 1) {
- if (info->ioapic_trigger != data->trigger)
- mp_register_handler(irq, info->ioapic_trigger);
- data->entry.trigger = data->trigger = info->ioapic_trigger;
- data->entry.polarity = data->polarity = info->ioapic_polarity;
+ if (info->ioapic.is_level != data->is_level)
+ mp_register_handler(irq, info->ioapic.is_level);
+ data->entry.is_level = data->is_level = info->ioapic.is_level;
+ data->entry.active_low = data->active_low = info->ioapic.active_low;
}
- return data->trigger == info->ioapic_trigger &&
- data->polarity == info->ioapic_polarity;
+ return data->is_level == info->ioapic.is_level &&
+ data->active_low == info->ioapic.active_low;
}
static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi,
struct irq_alloc_info *info)
{
+ int type = ioapics[ioapic].irqdomain_cfg.type;
bool legacy = false;
int irq = -1;
- int type = ioapics[ioapic].irqdomain_cfg.type;
switch (type) {
case IOAPIC_DOMAIN_LEGACY:
@@ -980,8 +906,7 @@ static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi,
return -1;
}
- return __irq_domain_alloc_irqs(domain, irq, 1,
- ioapic_alloc_attr_node(info),
+ return __irq_domain_alloc_irqs(domain, irq, 1, ioapic_alloc_attr_node(info),
info, legacy, NULL);
}
@@ -995,29 +920,26 @@ static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi,
* PIRQs instead of reprogramming the interrupt routing logic. Thus there may be
* multiple pins sharing the same legacy IRQ number when ACPI is disabled.
*/
-static int alloc_isa_irq_from_domain(struct irq_domain *domain,
- int irq, int ioapic, int pin,
+static int alloc_isa_irq_from_domain(struct irq_domain *domain, int irq, int ioapic, int pin,
struct irq_alloc_info *info)
{
- struct mp_chip_data *data;
struct irq_data *irq_data = irq_get_irq_data(irq);
int node = ioapic_alloc_attr_node(info);
+ struct mp_chip_data *data;
/*
* Legacy ISA IRQ has already been allocated, just add pin to
- * the pin list assoicated with this IRQ and program the IOAPIC
- * entry. The IOAPIC entry
+ * the pin list associated with this IRQ and program the IOAPIC
+ * entry.
*/
if (irq_data && irq_data->parent_data) {
if (!mp_check_pin_attr(irq, info))
return -EBUSY;
- if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic,
- info->ioapic_pin))
+ if (!add_pin_to_irq_node(irq_data->chip_data, node, ioapic, info->ioapic.pin))
return -ENOMEM;
} else {
info->flags |= X86_IRQ_ALLOC_LEGACY;
- irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true,
- NULL);
+ irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true, NULL);
if (irq >= 0) {
irq_data = irq_domain_get_irq_data(domain, irq);
data = irq_data->chip_data;
@@ -1031,11 +953,11 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain,
static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
unsigned int flags, struct irq_alloc_info *info)
{
- int irq;
- bool legacy = false;
+ struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
struct irq_alloc_info tmp;
struct mp_chip_data *data;
- struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
+ bool legacy = false;
+ int irq;
if (!domain)
return -ENOSYS;
@@ -1043,9 +965,19 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) {
irq = mp_irqs[idx].srcbusirq;
legacy = mp_is_legacy_irq(irq);
+ /*
+ * IRQ2 is unusable for historical reasons on systems which
+	 * have a legacy PIC. See the comment about IRQ2 further down.
+ *
+ * If this gets removed at some point then the related code
+ * in lapic_assign_system_vectors() needs to be adjusted as
+ * well.
+ */
+ if (legacy && irq == PIC_CASCADE_IR)
+ return -EINVAL;
}
- mutex_lock(&ioapic_mutex);
+ guard(mutex)(&ioapic_mutex);
if (!(flags & IOAPIC_MAP_ALLOC)) {
if (!legacy) {
irq = irq_find_mapping(domain, pin);
@@ -1066,8 +998,6 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
data->count++;
}
}
- mutex_unlock(&ioapic_mutex);
-
return irq;
}
@@ -1075,26 +1005,20 @@ static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags)
{
u32 gsi = mp_pin_to_gsi(ioapic, pin);
- /*
- * Debugging check, we are in big trouble if this message pops up!
- */
+ /* Debugging check, we are in big trouble if this message pops up! */
if (mp_irqs[idx].dstirq != pin)
pr_err("broken BIOS or MPTABLE parser, ayiee!!\n");
#ifdef CONFIG_X86_32
- /*
- * PCI IRQ command line redirection. Yes, limits are hardcoded.
- */
+ /* PCI IRQ command line redirection. Yes, limits are hardcoded. */
if ((pin >= 16) && (pin <= 23)) {
- if (pirq_entries[pin-16] != -1) {
- if (!pirq_entries[pin-16]) {
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "disabling PIRQ%d\n", pin-16);
+ if (pirq_entries[pin - 16] != -1) {
+ if (!pirq_entries[pin - 16]) {
+ apic_pr_verbose("Disabling PIRQ%d\n", pin - 16);
} else {
int irq = pirq_entries[pin-16];
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "using PIRQ%d -> IRQ %d\n",
- pin-16, irq);
+
+ apic_pr_verbose("Using PIRQ%d -> IRQ %d\n", pin - 16, irq);
return irq;
}
}
@@ -1132,10 +1056,9 @@ void mp_unmap_irq(int irq)
if (!data || data->isa_irq)
return;
- mutex_lock(&ioapic_mutex);
+ guard(mutex)(&ioapic_mutex);
if (--data->count == 0)
irq_domain_free_irqs(irq, 1);
- mutex_unlock(&ioapic_mutex);
}
/*
@@ -1146,12 +1069,10 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
{
int irq, i, best_ioapic = -1, best_idx = -1;
- apic_printk(APIC_DEBUG,
- "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
- bus, slot, pin);
+ apic_pr_debug("Querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
+ bus, slot, pin);
if (test_bit(bus, mp_bus_not_pci)) {
- apic_printk(APIC_VERBOSE,
- "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+ apic_pr_verbose("PCI BIOS passed nonexistent PCI bus %d!\n", bus);
return -1;
}
@@ -1196,8 +1117,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
return -1;
out:
- return pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq,
- IOAPIC_MAP_ALLOC);
+ return pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq, IOAPIC_MAP_ALLOC);
}
EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
@@ -1208,17 +1128,16 @@ static void __init setup_IO_APIC_irqs(void)
unsigned int ioapic, pin;
int idx;
- apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
+ apic_pr_verbose("Init IO_APIC IRQs\n");
for_each_ioapic_pin(ioapic, pin) {
idx = find_irq_entry(ioapic, pin, mp_INT);
- if (idx < 0)
- apic_printk(APIC_VERBOSE,
- KERN_DEBUG " apic %d pin %d not connected\n",
- mpc_ioapic_id(ioapic), pin);
- else
- pin_2_irq(idx, ioapic, pin,
- ioapic ? 0 : IOAPIC_MAP_ALLOC);
+ if (idx < 0) {
+ apic_pr_verbose("apic %d pin %d not connected\n",
+ mpc_ioapic_id(ioapic), pin);
+ } else {
+ pin_2_irq(idx, ioapic, pin, ioapic ? 0 : IOAPIC_MAP_ALLOC);
+ }
}
}
@@ -1229,31 +1148,26 @@ void ioapic_zap_locks(void)
static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
{
- int i;
- char buf[256];
struct IO_APIC_route_entry entry;
- struct IR_IO_APIC_route_entry *ir_entry = (void *)&entry;
+ char buf[256];
+ int i;
- printk(KERN_DEBUG "IOAPIC %d:\n", apic);
+ apic_dbg("IOAPIC %d:\n", apic);
for (i = 0; i <= nr_entries; i++) {
entry = ioapic_read_entry(apic, i);
- snprintf(buf, sizeof(buf),
- " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)",
- i,
- entry.mask == IOAPIC_MASKED ? "disabled" : "enabled ",
- entry.trigger == IOAPIC_LEVEL ? "level" : "edge ",
- entry.polarity == IOAPIC_POL_LOW ? "low " : "high",
+ snprintf(buf, sizeof(buf), " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)",
+ i, entry.masked ? "disabled" : "enabled ",
+ entry.is_level ? "level" : "edge ",
+ entry.active_low ? "low " : "high",
entry.vector, entry.irr, entry.delivery_status);
- if (ir_entry->format)
- printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n",
- buf, (ir_entry->index2 << 15) | ir_entry->index,
- ir_entry->zero);
- else
- printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n",
- buf,
- entry.dest_mode == IOAPIC_DEST_MODE_LOGICAL ?
- "logical " : "physical",
- entry.dest, entry.delivery_mode);
+ if (entry.ir_format) {
+ apic_dbg("%s, remapped, I(%04X), Z(%X)\n", buf,
+ (entry.ir_index_15 << 15) | entry.ir_index_0_14, entry.ir_zero);
+ } else {
+ apic_dbg("%s, %s, D(%02X%02X), M(%1d)\n", buf,
+ entry.dest_mode_logical ? "logical " : "physical",
+ entry.virt_destid_8_14, entry.destid_0_7, entry.delivery_mode);
+ }
}
}
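io_apic_print_entries() now reads the RTE through named bitfields (masked, is_level, active_low, ir_format, ...) instead of open-coded mask constants. A rough standalone model of that union layout, assuming little-endian x86: the bit positions follow the xAPIC datasheet, but the real struct IO_APIC_route_entry in the x86 headers carries more fields (the ir_* and virt_destid variants), so treat this as an approximation:

#include <stdint.h>
#include <stdio.h>

/* GCC-style 64-bit bitfields, as the kernel itself uses for this */
union rte {
	struct {
		uint64_t vector			:  8,	/* bits  0-7  */
			 delivery_mode		:  3,	/* bits  8-10 */
			 dest_mode_logical	:  1,	/* bit   11   */
			 delivery_status	:  1,	/* bit   12   */
			 active_low		:  1,	/* bit   13   */
			 irr			:  1,	/* bit   14   */
			 is_level		:  1,	/* bit   15   */
			 masked			:  1,	/* bit   16   */
			 reserved		: 39,
			 destid_0_7		:  8;	/* bits 56-63 */
	};
	struct {
		uint32_t w1;	/* what io_apic_read(apic, 0x10 + pin*2) returns */
		uint32_t w2;
	};
};

int main(void)
{
	union rte e = { .w1 = 0x0001c031 };	/* masked, level, IRR set, vector 0x31 */

	printf("vector=%02llx level=%llu masked=%llu irr=%llu\n",
	       (unsigned long long)e.vector, (unsigned long long)e.is_level,
	       (unsigned long long)e.masked, (unsigned long long)e.irr);
	return 0;
}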
@@ -1263,30 +1177,25 @@ static void __init print_IO_APIC(int ioapic_idx)
union IO_APIC_reg_01 reg_01;
union IO_APIC_reg_02 reg_02;
union IO_APIC_reg_03 reg_03;
- unsigned long flags;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(ioapic_idx, 0);
- reg_01.raw = io_apic_read(ioapic_idx, 1);
- if (reg_01.bits.version >= 0x10)
- reg_02.raw = io_apic_read(ioapic_idx, 2);
- if (reg_01.bits.version >= 0x20)
- reg_03.raw = io_apic_read(ioapic_idx, 3);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
- printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx));
- printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
- printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
- printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
- printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
-
- printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
- printk(KERN_DEBUG "....... : max redirection entries: %02X\n",
- reg_01.bits.entries);
-
- printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
- printk(KERN_DEBUG "....... : IO APIC version: %02X\n",
- reg_01.bits.version);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock) {
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+ reg_01.raw = io_apic_read(ioapic_idx, 1);
+ if (reg_01.bits.version >= 0x10)
+ reg_02.raw = io_apic_read(ioapic_idx, 2);
+ if (reg_01.bits.version >= 0x20)
+ reg_03.raw = io_apic_read(ioapic_idx, 3);
+ }
+
+ apic_dbg("IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx));
+ apic_dbg(".... register #00: %08X\n", reg_00.raw);
+ apic_dbg("....... : physical APIC id: %02X\n", reg_00.bits.ID);
+ apic_dbg("....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
+ apic_dbg("....... : LTS : %X\n", reg_00.bits.LTS);
+ apic_dbg(".... register #01: %08X\n", *(int *)&reg_01);
+ apic_dbg("....... : max redirection entries: %02X\n", reg_01.bits.entries);
+ apic_dbg("....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
+ apic_dbg("....... : IO APIC version: %02X\n", reg_01.bits.version);
/*
* Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
@@ -1294,8 +1203,8 @@ static void __init print_IO_APIC(int ioapic_idx)
* value, so ignore it if reg_02 == reg_01.
*/
if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
- printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
- printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
+ apic_dbg(".... register #02: %08X\n", reg_02.raw);
+ apic_dbg("....... : arbitration: %02X\n", reg_02.bits.arbitration);
}
/*
@@ -1305,11 +1214,11 @@ static void __init print_IO_APIC(int ioapic_idx)
*/
if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
reg_03.raw != reg_01.raw) {
- printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
- printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
+ apic_dbg(".... register #03: %08X\n", reg_03.raw);
+ apic_dbg("....... : Boot DT : %X\n", reg_03.bits.boot_DT);
}
- printk(KERN_DEBUG ".... IRQ redirection table:\n");
+ apic_dbg(".... IRQ redirection table:\n");
io_apic_print_entries(ioapic_idx, reg_01.bits.entries);
}
@@ -1318,11 +1227,11 @@ void __init print_IO_APICs(void)
int ioapic_idx;
unsigned int irq;
- printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
- for_each_ioapic(ioapic_idx)
- printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
- mpc_ioapic_id(ioapic_idx),
- ioapics[ioapic_idx].nr_registers);
+ apic_dbg("number of MP IRQ sources: %d.\n", mp_irq_entries);
+ for_each_ioapic(ioapic_idx) {
+ apic_dbg("number of IO-APIC #%d registers: %d.\n",
+ mpc_ioapic_id(ioapic_idx), ioapics[ioapic_idx].nr_registers);
+ }
/*
* We are a bit conservative about what we expect. We have to
@@ -1333,7 +1242,7 @@ void __init print_IO_APICs(void)
for_each_ioapic(ioapic_idx)
print_IO_APIC(ioapic_idx);
- printk(KERN_DEBUG "IRQ to pin mappings:\n");
+ apic_dbg("IRQ to pin mappings:\n");
for_each_active_irq(irq) {
struct irq_pin_list *entry;
struct irq_chip *chip;
@@ -1348,7 +1257,7 @@ void __init print_IO_APICs(void)
if (list_empty(&data->irq_2_pin))
continue;
- printk(KERN_DEBUG "IRQ%d ", irq);
+ apic_dbg("IRQ%d ", irq);
for_each_irq_pin(entry, data->irq_2_pin)
pr_cont("-> %d:%d", entry->apic, entry->pin);
pr_cont("\n");
@@ -1362,10 +1271,9 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
void __init enable_IO_APIC(void)
{
- int i8259_apic, i8259_pin;
- int apic, pin;
+ int i8259_apic, i8259_pin, apic, pin;
- if (skip_ioapic_setup)
+ if (ioapic_is_disabled)
nr_ioapics = 0;
if (!nr_legacy_irqs() || !nr_ioapics)
@@ -1375,18 +1283,21 @@ void __init enable_IO_APIC(void)
/* See if any of the pins is in ExtINT mode */
struct IO_APIC_route_entry entry = ioapic_read_entry(apic, pin);
- /* If the interrupt line is enabled and in ExtInt mode
- * I have found the pin where the i8259 is connected.
+ /*
+ * If the interrupt line is enabled and in ExtInt mode I
+ * have found the pin where the i8259 is connected.
*/
- if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
+ if (!entry.masked && entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT) {
ioapic_i8259.apic = apic;
ioapic_i8259.pin = pin;
- goto found_i8259;
+ break;
}
}
- found_i8259:
- /* Look to see what if the MP table has reported the ExtINT */
- /* If we could not find the appropriate pin by looking at the ioapic
+
+ /*
+	 * Look to see if the MP table has reported the ExtINT.
+ *
+ * If we could not find the appropriate pin by looking at the ioapic
	 * the i8259 probably is not connected to the ioapic, but give the
* mptable a chance anyway.
*/
@@ -1394,44 +1305,39 @@ void __init enable_IO_APIC(void)
i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
/* Trust the MP table if nothing is setup in the hardware */
if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
- printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
+ pr_warn("ExtINT not setup in hardware but reported by MP table\n");
ioapic_i8259.pin = i8259_pin;
ioapic_i8259.apic = i8259_apic;
}
/* Complain if the MP table and the hardware disagree */
if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
- (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
- {
- printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
- }
+ (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
+ pr_warn("ExtINT in hardware and MP table differ\n");
- /*
- * Do not trust the IO-APIC being empty at bootup
- */
+ /* Do not trust the IO-APIC being empty at bootup */
clear_IO_APIC();
}
void native_restore_boot_irq_mode(void)
{
/*
- * If the i8259 is routed through an IOAPIC
- * Put that IOAPIC in virtual wire mode
- * so legacy interrupts can be delivered.
+	 * If the i8259 is routed through an IOAPIC, put that IOAPIC in
+ * virtual wire mode so legacy interrupts can be delivered.
*/
if (ioapic_i8259.pin != -1) {
struct IO_APIC_route_entry entry;
+ u32 apic_id = read_apic_id();
memset(&entry, 0, sizeof(entry));
- entry.mask = IOAPIC_UNMASKED;
- entry.trigger = IOAPIC_EDGE;
- entry.polarity = IOAPIC_POL_HIGH;
- entry.dest_mode = IOAPIC_DEST_MODE_PHYSICAL;
- entry.delivery_mode = dest_ExtINT;
- entry.dest = read_apic_id();
-
- /*
- * Add it to the IO-APIC irq-routing table:
- */
+ entry.masked = false;
+ entry.is_level = false;
+ entry.active_low = false;
+ entry.dest_mode_logical = false;
+ entry.delivery_mode = APIC_DELIVERY_MODE_EXTINT;
+ entry.destid_0_7 = apic_id & 0xFF;
+ entry.virt_destid_8_14 = apic_id >> 8;
+
+ /* Add it to the IO-APIC irq-routing table */
ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
}
@@ -1454,37 +1360,34 @@ void restore_boot_irq_mode(void)
*
* by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
*/
-void __init setup_ioapic_ids_from_mpc_nocheck(void)
+static void __init setup_ioapic_ids_from_mpc_nocheck(void)
{
+ DECLARE_BITMAP(phys_id_present_map, MAX_LOCAL_APIC);
+ const u32 broadcast_id = 0xF;
union IO_APIC_reg_00 reg_00;
- physid_mask_t phys_id_present_map;
- int ioapic_idx;
- int i;
unsigned char old_id;
- unsigned long flags;
+ int ioapic_idx, i;
/*
* This is broken; anything with a real cpu count has to
* circumvent this idiocy regardless.
*/
- apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map);
+ copy_phys_cpu_present_map(phys_id_present_map);
/*
* Set the IOAPIC ID to the value stored in the MPC table.
*/
for_each_ioapic(ioapic_idx) {
/* Read the register 0 value */
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(ioapic_idx, 0);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock)
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
old_id = mpc_ioapic_id(ioapic_idx);
- if (mpc_ioapic_id(ioapic_idx) >= get_physical_broadcast()) {
- printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
- ioapic_idx, mpc_ioapic_id(ioapic_idx));
- printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
- reg_00.bits.ID);
+ if (mpc_ioapic_id(ioapic_idx) >= broadcast_id) {
+ pr_err(FW_BUG "IO-APIC#%d ID is %d in the MPC table!...\n",
+ ioapic_idx, mpc_ioapic_id(ioapic_idx));
+ pr_err("... fixing up to %d. (tell your hw vendor)\n", reg_00.bits.ID);
ioapics[ioapic_idx].mp_config.apicid = reg_00.bits.ID;
}
@@ -1493,65 +1396,54 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
* system must have a unique ID or we get lots of nice
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
- if (apic->check_apicid_used(&phys_id_present_map,
- mpc_ioapic_id(ioapic_idx))) {
- printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
- ioapic_idx, mpc_ioapic_id(ioapic_idx));
- for (i = 0; i < get_physical_broadcast(); i++)
- if (!physid_isset(i, phys_id_present_map))
+ if (test_bit(mpc_ioapic_id(ioapic_idx), phys_id_present_map)) {
+ pr_err(FW_BUG "IO-APIC#%d ID %d is already used!...\n",
+ ioapic_idx, mpc_ioapic_id(ioapic_idx));
+ for (i = 0; i < broadcast_id; i++)
+ if (!test_bit(i, phys_id_present_map))
break;
- if (i >= get_physical_broadcast())
+ if (i >= broadcast_id)
panic("Max APIC ID exceeded!\n");
- printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
- i);
- physid_set(i, phys_id_present_map);
+ pr_err("... fixing up to %d. (tell your hw vendor)\n", i);
+ set_bit(i, phys_id_present_map);
ioapics[ioapic_idx].mp_config.apicid = i;
} else {
- physid_mask_t tmp;
- apic->apicid_to_cpu_present(mpc_ioapic_id(ioapic_idx),
- &tmp);
- apic_printk(APIC_VERBOSE, "Setting %d in the "
- "phys_id_present_map\n",
+ apic_pr_verbose("Setting %d in the phys_id_present_map\n",
mpc_ioapic_id(ioapic_idx));
- physids_or(phys_id_present_map, phys_id_present_map, tmp);
+ set_bit(mpc_ioapic_id(ioapic_idx), phys_id_present_map);
}
/*
- * We need to adjust the IRQ routing table
- * if the ID changed.
+ * We need to adjust the IRQ routing table if the ID
+ * changed.
*/
- if (old_id != mpc_ioapic_id(ioapic_idx))
- for (i = 0; i < mp_irq_entries; i++)
+ if (old_id != mpc_ioapic_id(ioapic_idx)) {
+ for (i = 0; i < mp_irq_entries; i++) {
if (mp_irqs[i].dstapic == old_id)
- mp_irqs[i].dstapic
- = mpc_ioapic_id(ioapic_idx);
+ mp_irqs[i].dstapic = mpc_ioapic_id(ioapic_idx);
+ }
+ }
/*
- * Update the ID register according to the right value
- * from the MPC table if they are different.
+ * Update the ID register according to the right value from
+ * the MPC table if they are different.
*/
if (mpc_ioapic_id(ioapic_idx) == reg_00.bits.ID)
continue;
- apic_printk(APIC_VERBOSE, KERN_INFO
- "...changing IO-APIC physical APIC ID to %d ...",
- mpc_ioapic_id(ioapic_idx));
+ apic_pr_verbose("...changing IO-APIC physical APIC ID to %d ...",
+ mpc_ioapic_id(ioapic_idx));
reg_00.bits.ID = mpc_ioapic_id(ioapic_idx);
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(ioapic_idx, 0, reg_00.raw);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
- /*
- * Sanity check
- */
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(ioapic_idx, 0);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock) {
+ io_apic_write(ioapic_idx, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+ }
+ /* Sanity check */
if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx))
pr_cont("could not set ID!\n");
else
- apic_printk(APIC_VERBOSE, " ok.\n");
+ apic_pr_verbose(" ok.\n");
}
}
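The physid_mask_t plumbing is gone: the ID bookkeeping is now a plain bitmap seeded from the present CPUs, and a clashing IO-APIC ID is fixed up by claiming the first free ID below the broadcast ID. The same logic as a self-contained sketch, with a bool array standing in for DECLARE_BITMAP/test_bit/set_bit and the 0xF limit mirroring the 4-bit IO-APIC ID space used above:

#include <stdbool.h>
#include <stdio.h>

#define MAX_LOCAL_APIC	32768
#define BROADCAST_ID	0xF	/* never assignable to a device */

static bool id_used[MAX_LOCAL_APIC];

static int claim_free_id(void)
{
	for (int i = 0; i < BROADCAST_ID; i++) {
		if (!id_used[i]) {
			id_used[i] = true;
			return i;
		}
	}
	return -1;	/* the kernel panics here: "Max APIC ID exceeded!" */
}

int main(void)
{
	id_used[0] = id_used[1] = true;	/* IDs already taken by CPUs */
	printf("fixing up duplicate IO-APIC ID to %d\n", claim_free_id());
	return 0;
}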
@@ -1594,10 +1486,9 @@ static void __init delay_with_tsc(void)
* 1 GHz == 40 jiffies
*/
do {
- rep_nop();
+ native_pause();
now = rdtsc();
- } while ((now - start) < 40000000000ULL / HZ &&
- time_before_eq(jiffies, end));
+ } while ((now - start) < 40000000000ULL / HZ && time_before_eq(jiffies, end));
}
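delay_with_tsc() spins on the raw TSC with a pause hint because it runs before calibrated timers can be trusted. The same loop shape as a userspace sketch, with the compiler's __rdtsc()/_mm_pause() intrinsics standing in for the kernel's rdtsc()/native_pause(); it is x86-only, carries over the "1 GHz == 40 jiffies" bound from the comment above, and omits the jiffies backstop the kernel also applies:

#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>

#define HZ 250	/* any plausible tick rate for the demo */

int main(void)
{
	uint64_t start = __rdtsc(), now;

	do {
		_mm_pause();	/* the spin-loop hint, like native_pause() */
		now = __rdtsc();
	} while (now - start < 40000000000ULL / HZ);

	printf("spun for ~%llu TSC cycles\n", (unsigned long long)(now - start));
	return 0;
}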
static void __init delay_without_tsc(void)
@@ -1628,21 +1519,16 @@ static void __init delay_without_tsc(void)
static int __init timer_irq_works(void)
{
unsigned long t1 = jiffies;
- unsigned long flags;
if (no_timer_check)
return 1;
- local_save_flags(flags);
local_irq_enable();
-
if (boot_cpu_has(X86_FEATURE_TSC))
delay_with_tsc();
else
delay_without_tsc();
- local_irq_restore(flags);
-
/*
* Expect a few ticks at least, to be sure some possible
* glue logic does not lock up after one or two first
@@ -1651,10 +1537,10 @@ static int __init timer_irq_works(void)
* least one tick may be lost due to delays.
*/
- /* jiffies wrap? */
- if (time_after(jiffies, t1 + 4))
- return 1;
- return 0;
+ local_irq_disable();
+
+ /* Did jiffies advance? */
+ return time_after(jiffies, t1 + 4);
}
/*
@@ -1663,36 +1549,29 @@ static int __init timer_irq_works(void)
* so we 'resend' these IRQs via IPIs, to the same CPU. It's much
* better to do it this way as thus we do not have to be aware of
* 'pending' interrupts in the IRQ path, except at this point.
- */
-/*
- * Edge triggered needs to resend any interrupt
- * that was delayed but this is now handled in the device
- * independent code.
- */
-
-/*
- * Starting up a edge-triggered IO-APIC interrupt is
- * nasty - we need to make sure that we get the edge.
- * If it is already asserted for some reason, we need
- * return 1 to indicate that is was pending.
*
- * This is not complete - we should be able to fake
- * an edge even if it isn't on the 8259A...
+ *
+ * Edge triggered interrupt delivery needs to resend any interrupt that was
+ * delayed, but this is now handled in the device independent code.
+ *
+ * Starting up an edge-triggered IO-APIC interrupt is nasty - we need to
+ * make sure that we get the edge. If it is already asserted for some
+ * reason, we need to return 1 to indicate that it was pending.
+ *
+ * This is not complete - we should be able to fake an edge even if it
+ * isn't on the 8259A...
*/
static unsigned int startup_ioapic_irq(struct irq_data *data)
{
int was_pending = 0, irq = data->irq;
- unsigned long flags;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
if (irq < nr_legacy_irqs()) {
legacy_pic->mask(irq);
if (legacy_pic->irq_pending(irq))
was_pending = 1;
}
__unmask_ioapic(data->chip_data);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
return was_pending;
}
@@ -1702,40 +1581,37 @@ atomic_t irq_mis_count;
static bool io_apic_level_ack_pending(struct mp_chip_data *data)
{
struct irq_pin_list *entry;
- unsigned long flags;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
for_each_irq_pin(entry, data->irq_2_pin) {
- unsigned int reg;
+ struct IO_APIC_route_entry e;
int pin;
pin = entry->pin;
- reg = io_apic_read(entry->apic, 0x10 + pin*2);
+ e.w1 = io_apic_read(entry->apic, 0x10 + pin*2);
/* Is the remote IRR bit set? */
- if (reg & IO_APIC_REDIR_REMOTE_IRR) {
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ if (e.irr)
return true;
- }
}
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
return false;
}
-static inline bool ioapic_irqd_mask(struct irq_data *data)
+static inline bool ioapic_prepare_move(struct irq_data *data)
{
- /* If we are moving the irq we need to mask it */
+ /* If we are moving the IRQ we need to mask it */
if (unlikely(irqd_is_setaffinity_pending(data))) {
- mask_ioapic_irq(data);
+ if (!irqd_irq_masked(data))
+ mask_ioapic_irq(data);
return true;
}
return false;
}
-static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
+static inline void ioapic_finish_move(struct irq_data *data, bool moveit)
{
- if (unlikely(masked)) {
- /* Only migrate the irq if the ack has been received.
+ if (unlikely(moveit)) {
+ /*
+ * Only migrate the irq if the ack has been received.
*
* On rare occasions the broadcast level triggered ack gets
* delayed going to ioapics, and if we reprogram the
@@ -1754,7 +1630,7 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
* with masking the ioapic entry and then polling until
* Remote IRR was clear before reprogramming the
* ioapic I don't trust the Remote IRR bit to be
- * completey accurate.
+ * completely accurate.
*
* However there appears to be no other way to plug
* this race, so if the Remote IRR bit is not
@@ -1763,15 +1639,17 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
*/
if (!io_apic_level_ack_pending(data->chip_data))
irq_move_masked_irq(data);
- unmask_ioapic_irq(data);
+ /* If the IRQ is masked in the core, leave it: */
+ if (!irqd_irq_masked(data))
+ unmask_ioapic_irq(data);
}
}
#else
-static inline bool ioapic_irqd_mask(struct irq_data *data)
+static inline bool ioapic_prepare_move(struct irq_data *data)
{
return false;
}
-static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
+static inline void ioapic_finish_move(struct irq_data *data, bool moveit)
{
}
#endif
@@ -1780,11 +1658,11 @@ static void ioapic_ack_level(struct irq_data *irq_data)
{
struct irq_cfg *cfg = irqd_cfg(irq_data);
unsigned long v;
- bool masked;
+ bool moveit;
int i;
irq_complete_move(cfg);
- masked = ioapic_irqd_mask(irq_data);
+ moveit = ioapic_prepare_move(irq_data);
/*
* It appears there is an erratum which affects at least version 0x11
@@ -1825,12 +1703,12 @@ static void ioapic_ack_level(struct irq_data *irq_data)
* We must acknowledge the irq before we move it or the acknowledge will
* not propagate properly.
*/
- ack_APIC_irq();
+ apic_eoi();
/*
* Tail end of clearing remote IRR bit (either by delivering the EOI
* message via io-apic EOI register write or simulating it using
- * mask+edge followed by unnask+level logic) manually when the
+ * mask+edge followed by unmask+level logic) manually when the
* level triggered interrupt is seen as the edge triggered interrupt
* at the cpu.
*/
@@ -1839,7 +1717,7 @@ static void ioapic_ack_level(struct irq_data *irq_data)
eoi_ioapic_pin(cfg->vector, irq_data->chip_data);
}
- ioapic_irqd_unmask(irq_data, masked);
+ ioapic_finish_move(irq_data, moveit);
}
static void ioapic_ir_ack_level(struct irq_data *irq_data)
@@ -1856,41 +1734,123 @@ static void ioapic_ir_ack_level(struct irq_data *irq_data)
eoi_ioapic_pin(data->entry.vector, data);
}
+/*
+ * The I/OAPIC is just a device for generating MSI messages from legacy
+ * interrupt pins. Various fields of the RTE translate into bits of the
+ * resulting MSI which had a historical meaning.
+ *
+ * With interrupt remapping, many of those bits have different meanings
+ * in the underlying MSI, but the way that the I/OAPIC transforms them
+ * from its RTE to the MSI message is the same. This function allows
+ * the parent IRQ domain to compose the MSI message, then takes the
+ * relevant bits to put them in the appropriate places in the RTE in
+ * order to generate that message when the IRQ happens.
+ *
+ * The setup here relies on a preconfigured route entry (is_level,
+ * active_low, masked) because the parent domain is merely composing the
+ * generic message routing information which is used for the MSI.
+ */
+static void ioapic_setup_msg_from_msi(struct irq_data *irq_data,
+ struct IO_APIC_route_entry *entry)
+{
+ struct msi_msg msg;
+
+ /* Let the parent domain compose the MSI message */
+ irq_chip_compose_msi_msg(irq_data, &msg);
+
+ /*
+ * - Real vector
+ * - DMAR/IR: 8bit subhandle (ioapic.pin)
+ * - AMD/IR: 8bit IRTE index
+ */
+ entry->vector = msg.arch_data.vector;
+ /* Delivery mode (for DMAR/IR all 0) */
+ entry->delivery_mode = msg.arch_data.delivery_mode;
+ /* Destination mode or DMAR/IR index bit 15 */
+ entry->dest_mode_logical = msg.arch_addr_lo.dest_mode_logical;
+ /* DMAR/IR: 1, 0 for all other modes */
+ entry->ir_format = msg.arch_addr_lo.dmar_format;
+ /*
+ * - DMAR/IR: index bit 0-14.
+ *
+ * - Virt: If the host supports x2apic without a virtualized IR
+	 *	   unit, then bits 0-6 of dmar_index_0_14 provide bits
+	 *	   8-14 of the destination id.
+ *
+ * All other modes have bit 0-6 of dmar_index_0_14 cleared and the
+ * topmost 8 bits are destination id bit 0-7 (entry::destid_0_7).
+ */
+ entry->ir_index_0_14 = msg.arch_addr_lo.dmar_index_0_14;
+}
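A standalone model of the field transfer this function performs. The msi_model struct below is a trimmed stand-in for the arch_data/arch_addr_lo unions of the real struct msi_msg; only the fields the I/O-APIC actually copies are represented:

#include <stdint.h>
#include <stdio.h>

struct msi_model {		/* stand-in for struct msi_msg's arch fields */
	uint8_t  vector;
	uint8_t  delivery_mode;
	uint8_t  dest_mode_logical;
	uint8_t  dmar_format;
	uint16_t dmar_index_0_14;
};

struct rte_model {		/* stand-in for struct IO_APIC_route_entry */
	uint8_t  vector;
	uint8_t  delivery_mode;
	uint8_t  dest_mode_logical;
	uint8_t  ir_format;
	uint16_t ir_index_0_14;
};

static void setup_rte_from_msi(const struct msi_model *msg, struct rte_model *e)
{
	e->vector		= msg->vector;
	e->delivery_mode	= msg->delivery_mode;
	e->dest_mode_logical	= msg->dest_mode_logical;
	e->ir_format		= msg->dmar_format;	/* 1 only for DMAR/IR */
	e->ir_index_0_14	= msg->dmar_index_0_14;
}

int main(void)
{
	struct msi_model msg = { .vector = 0x31, .dmar_format = 1, .dmar_index_0_14 = 5 };
	struct rte_model e = { 0 };

	setup_rte_from_msi(&msg, &e);
	printf("vector=%02x remapped=%u index=%u\n",
	       (unsigned)e.vector, (unsigned)e.ir_format, (unsigned)e.ir_index_0_14);
	return 0;
}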
+
static void ioapic_configure_entry(struct irq_data *irqd)
{
struct mp_chip_data *mpd = irqd->chip_data;
- struct irq_cfg *cfg = irqd_cfg(irqd);
struct irq_pin_list *entry;
- /*
- * Only update when the parent is the vector domain, don't touch it
- * if the parent is the remapping domain. Check the installed
- * ioapic chip to verify that.
- */
- if (irqd->chip == &ioapic_chip) {
- mpd->entry.dest = cfg->dest_apicid;
- mpd->entry.vector = cfg->vector;
- }
+ ioapic_setup_msg_from_msi(irqd, &mpd->entry);
+
for_each_irq_pin(entry, mpd->irq_2_pin)
__ioapic_write_entry(entry->apic, entry->pin, mpd->entry);
}
-static int ioapic_set_affinity(struct irq_data *irq_data,
- const struct cpumask *mask, bool force)
+static int ioapic_set_affinity(struct irq_data *irq_data, const struct cpumask *mask, bool force)
{
struct irq_data *parent = irq_data->parent_data;
- unsigned long flags;
int ret;
ret = parent->chip->irq_set_affinity(parent, mask, force);
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE)
ioapic_configure_entry(irq_data);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return ret;
}
+/*
+ * Interrupt shutdown masks the ioapic pin, but the interrupt might already
+ * be in flight and not yet serviced by the target CPU. That means
+ * __synchronize_hardirq() would return and claim that everything is calmed
+ * down. So free_irq() would proceed and deactivate the interrupt and free
+ * resources.
+ *
+ * Once the target CPU comes around to service it, it will find a cleared
+ * vector and complain. While the spurious interrupt is harmless, the full
+ * release of resources might prevent the interrupt from being acknowledged,
+ * which keeps the hardware in a weird state.
+ *
+ * Verify that the corresponding Remote-IRR bits are clear.
+ */
+static int ioapic_irq_get_chip_state(struct irq_data *irqd, enum irqchip_irq_state which,
+ bool *state)
+{
+ struct mp_chip_data *mcd = irqd->chip_data;
+ struct IO_APIC_route_entry rentry;
+ struct irq_pin_list *p;
+
+ if (which != IRQCHIP_STATE_ACTIVE)
+ return -EINVAL;
+
+ *state = false;
+
+ guard(raw_spinlock)(&ioapic_lock);
+ for_each_irq_pin(p, mcd->irq_2_pin) {
+ rentry = __ioapic_read_entry(p->apic, p->pin);
+ /*
+		 * The remote IRR is only valid in level trigger mode. Its
+		 * meaning is undefined for edge triggered interrupts and
+ * irrelevant because the IO-APIC treats them as fire and
+ * forget.
+ */
+ if (rentry.irr && rentry.is_level) {
+ *state = true;
+ break;
+ }
+ }
+ return 0;
+}
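The new callback lets the generic free_irq() path poll IRQCHIP_STATE_ACTIVE until in-flight level interrupts have been acknowledged. Its core is just a scan of every pin bound to the interrupt, modeled standalone here with a plain array instead of the irq_2_pin list:

#include <stdbool.h>
#include <stdio.h>

struct rte_state {
	bool irr;		/* Remote-IRR: EOI not yet seen */
	bool is_level;
};

static bool irq_active(const struct rte_state *pins, int n)
{
	for (int i = 0; i < n; i++) {
		/* Remote-IRR is only meaningful for level-triggered entries */
		if (pins[i].irr && pins[i].is_level)
			return true;
	}
	return false;
}

int main(void)
{
	struct rte_state pins[] = { { false, true }, { true, true } };

	printf("still active: %d\n", irq_active(pins, 2));
	return 0;
}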
+
static struct irq_chip ioapic_chip __read_mostly = {
.name = "IO-APIC",
.irq_startup = startup_ioapic_irq,
@@ -1900,7 +1860,9 @@ static struct irq_chip ioapic_chip __read_mostly = {
.irq_eoi = ioapic_ack_level,
.irq_set_affinity = ioapic_set_affinity,
.irq_retrigger = irq_chip_retrigger_hierarchy,
- .flags = IRQCHIP_SKIP_SET_WAKE,
+ .irq_get_irqchip_state = ioapic_irq_get_chip_state,
+ .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MOVE_DEFERRED |
+ IRQCHIP_AFFINITY_PRE_STARTUP,
};
static struct irq_chip ioapic_ir_chip __read_mostly = {
@@ -1912,7 +1874,9 @@ static struct irq_chip ioapic_ir_chip __read_mostly = {
.irq_eoi = ioapic_ir_ack_level,
.irq_set_affinity = ioapic_set_affinity,
.irq_retrigger = irq_chip_retrigger_hierarchy,
- .flags = IRQCHIP_SKIP_SET_WAKE,
+ .irq_get_irqchip_state = ioapic_irq_get_chip_state,
+ .flags = IRQCHIP_SKIP_SET_WAKE |
+ IRQCHIP_AFFINITY_PRE_STARTUP,
};
static inline void init_IO_APIC_traps(void)
@@ -1924,14 +1888,13 @@ static inline void init_IO_APIC_traps(void)
cfg = irq_cfg(irq);
if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
/*
- * Hmm.. We don't have an entry for this,
- * so default to an old-fashioned 8259
- * interrupt if we can..
+ * Hmm.. We don't have an entry for this, so
+ * default to an old-fashioned 8259 interrupt if we
+ * can. Otherwise set the dummy interrupt chip.
*/
if (irq < nr_legacy_irqs())
legacy_pic->make_irq(irq);
else
- /* Strange. Oh, well.. */
irq_set_chip(irq, &no_irq_chip);
}
}
@@ -1940,26 +1903,23 @@ static inline void init_IO_APIC_traps(void)
/*
* The local APIC irq-chip implementation:
*/
-
static void mask_lapic_irq(struct irq_data *data)
{
- unsigned long v;
+ unsigned long v = apic_read(APIC_LVT0);
- v = apic_read(APIC_LVT0);
apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
}
static void unmask_lapic_irq(struct irq_data *data)
{
- unsigned long v;
+ unsigned long v = apic_read(APIC_LVT0);
- v = apic_read(APIC_LVT0);
apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
}
static void ack_lapic_irq(struct irq_data *data)
{
- ack_APIC_irq();
+ apic_eoi();
}
static struct irq_chip lapic_chip __read_mostly = {
@@ -1972,8 +1932,7 @@ static struct irq_chip lapic_chip __read_mostly = {
static void lapic_register_intr(int irq)
{
irq_clear_status_flags(irq, IRQ_LEVEL);
- irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
- "edge");
+ irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge");
}
/*
@@ -1985,9 +1944,10 @@ static void lapic_register_intr(int irq)
*/
static inline void __init unlock_ExtINT_logic(void)
{
- int apic, pin, i;
- struct IO_APIC_route_entry entry0, entry1;
unsigned char save_control, save_freq_select;
+ struct IO_APIC_route_entry entry0, entry1;
+ int apic, pin, i;
+ u32 apic_id;
pin = find_isa_irq_pin(8, mp_INT);
if (pin == -1) {
@@ -2003,14 +1963,16 @@ static inline void __init unlock_ExtINT_logic(void)
entry0 = ioapic_read_entry(apic, pin);
clear_IO_APIC_pin(apic, pin);
+ apic_id = read_apic_id();
memset(&entry1, 0, sizeof(entry1));
- entry1.dest_mode = IOAPIC_DEST_MODE_PHYSICAL;
- entry1.mask = IOAPIC_UNMASKED;
- entry1.dest = hard_smp_processor_id();
- entry1.delivery_mode = dest_ExtINT;
- entry1.polarity = entry0.polarity;
- entry1.trigger = IOAPIC_EDGE;
+ entry1.dest_mode_logical = true;
+ entry1.masked = false;
+ entry1.destid_0_7 = apic_id & 0xFF;
+ entry1.virt_destid_8_14 = apic_id >> 8;
+ entry1.delivery_mode = APIC_DELIVERY_MODE_EXTINT;
+ entry1.active_low = entry0.active_low;
+ entry1.is_level = false;
entry1.vector = 0;
ioapic_write_entry(apic, pin, entry1);
@@ -2044,32 +2006,47 @@ static int __init disable_timer_pin_setup(char *arg)
}
early_param("disable_timer_pin_1", disable_timer_pin_setup);
-static int mp_alloc_timer_irq(int ioapic, int pin)
+static int __init mp_alloc_timer_irq(int ioapic, int pin)
{
- int irq = -1;
struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
+ int irq = -1;
if (domain) {
struct irq_alloc_info info;
ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0);
- info.ioapic_id = mpc_ioapic_id(ioapic);
- info.ioapic_pin = pin;
- mutex_lock(&ioapic_mutex);
+ info.devid = mpc_ioapic_id(ioapic);
+ info.ioapic.pin = pin;
+ guard(mutex)(&ioapic_mutex);
irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info);
- mutex_unlock(&ioapic_mutex);
}
return irq;
}
+static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node,
+ int oldapic, int oldpin,
+ int newapic, int newpin)
+{
+ struct irq_pin_list *entry;
+
+ for_each_irq_pin(entry, data->irq_2_pin) {
+ if (entry->apic == oldapic && entry->pin == oldpin) {
+ entry->apic = newapic;
+ entry->pin = newpin;
+ return;
+ }
+ }
+
+ /* Old apic/pin didn't exist, so just add a new one */
+ add_pin_to_irq_node(data, node, newapic, newpin);
+}
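replace_pin_at_irq_node() is a rewrite-in-place-or-append walk over the irq_2_pin list. The same shape on a plain singly-linked list, standalone:

#include <stdio.h>
#include <stdlib.h>

struct pin_entry { int apic, pin; struct pin_entry *next; };

static void add_pin(struct pin_entry **head, int apic, int pin)
{
	struct pin_entry *e = calloc(1, sizeof(*e));

	if (!e)
		exit(1);
	e->apic = apic;
	e->pin = pin;
	e->next = *head;
	*head = e;
}

static void replace_pin(struct pin_entry **head, int oldapic, int oldpin,
			int newapic, int newpin)
{
	for (struct pin_entry *e = *head; e; e = e->next) {
		if (e->apic == oldapic && e->pin == oldpin) {
			e->apic = newapic;
			e->pin = newpin;
			return;
		}
	}
	/* Old apic/pin didn't exist, so just add a new one */
	add_pin(head, newapic, newpin);
}

int main(void)
{
	struct pin_entry *head = NULL;

	add_pin(&head, 0, 2);
	replace_pin(&head, 0, 2, 1, 0);	/* timer moved to apic 1 pin 0 */
	printf("first entry: apic %d pin %d\n", head->apic, head->pin);
	free(head);
	return 0;
}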
+
/*
* This code may look a bit paranoid, but it's supposed to cooperate with
* a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
* is so screwy. Thanks to Brian Perkins for testing/hacking this beast
* fanatically on his truly buggy board.
- *
- * FIXME: really need to revamp this for all platforms.
*/
static inline void __init check_timer(void)
{
@@ -2078,10 +2055,12 @@ static inline void __init check_timer(void)
struct irq_cfg *cfg = irqd_cfg(irq_data);
int node = cpu_to_node(0);
int apic1, pin1, apic2, pin2;
- unsigned long flags;
int no_pin1 = 0;
- local_irq_save(flags);
+ if (!global_clock_event)
+ return;
+
+ local_irq_disable();
/*
* get/set the timer IRQ vector:
@@ -2105,9 +2084,8 @@ static inline void __init check_timer(void)
pin2 = ioapic_i8259.pin;
apic2 = ioapic_i8259.apic;
- apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
- "apic1=%d pin1=%d apic2=%d pin2=%d\n",
- cfg->vector, apic1, pin1, apic2, pin2);
+ pr_info("..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
+ cfg->vector, apic1, pin1, apic2, pin2);
/*
* Some BIOS writers are clueless and report the ExtINTA
@@ -2117,7 +2095,7 @@ static inline void __init check_timer(void)
* 8259A.
*/
if (pin1 == -1) {
- panic_if_irq_remap("BIOS bug: timer not connected to IO-APIC");
+ panic_if_irq_remap(FW_BUG "Timer not connected to IO-APIC");
pin1 = pin2;
apic1 = apic2;
no_pin1 = 1;
@@ -2136,9 +2114,9 @@ static inline void __init check_timer(void)
* so only need to unmask if it is level-trigger
* do we really have level trigger timer?
*/
- int idx;
- idx = find_irq_entry(apic1, pin1, mp_INT);
- if (idx != -1 && irq_trigger(idx))
+ int idx = find_irq_entry(apic1, pin1, mp_INT);
+
+ if (idx != -1 && irq_is_level(idx))
unmask_ioapic_irq(irq_get_irq_data(0));
}
irq_domain_deactivate_irq(irq_data);
@@ -2149,16 +2127,12 @@ static inline void __init check_timer(void)
goto out;
}
panic_if_irq_remap("timer doesn't work through Interrupt-remapped IO-APIC");
- local_irq_disable();
clear_IO_APIC_pin(apic1, pin1);
if (!no_pin1)
- apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
- "8254 timer not connected to IO-APIC\n");
+ pr_err("..MP-BIOS bug: 8254 timer not connected to IO-APIC\n");
- apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
- "(IRQ0) through the 8259A ...\n");
- apic_printk(APIC_QUIET, KERN_INFO
- "..... (found apic %d pin %d) ...\n", apic2, pin2);
+ pr_info("...trying to set up timer (IRQ0) through the 8259A ...\n");
+ pr_info("..... (found apic %d pin %d) ...\n", apic2, pin2);
/*
* legacy devices should be connected to IO APIC #0
*/
@@ -2167,57 +2141,54 @@ static inline void __init check_timer(void)
irq_domain_activate_irq(irq_data, false);
legacy_pic->unmask(0);
if (timer_irq_works()) {
- apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
+ pr_info("....... works.\n");
goto out;
}
/*
* Cleanup, just in case ...
*/
- local_irq_disable();
legacy_pic->mask(0);
clear_IO_APIC_pin(apic2, pin2);
- apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
+ pr_info("....... failed.\n");
}
- apic_printk(APIC_QUIET, KERN_INFO
- "...trying to set up timer as Virtual Wire IRQ...\n");
+ pr_info("...trying to set up timer as Virtual Wire IRQ...\n");
lapic_register_intr(0);
apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
legacy_pic->unmask(0);
if (timer_irq_works()) {
- apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
+ pr_info("..... works.\n");
goto out;
}
- local_irq_disable();
legacy_pic->mask(0);
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
- apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
+ pr_info("..... failed.\n");
- apic_printk(APIC_QUIET, KERN_INFO
- "...trying to set up timer as ExtINT IRQ...\n");
+ pr_info("...trying to set up timer as ExtINT IRQ...\n");
legacy_pic->init(0);
legacy_pic->make_irq(0);
apic_write(APIC_LVT0, APIC_DM_EXTINT);
+ legacy_pic->unmask(0);
unlock_ExtINT_logic();
if (timer_irq_works()) {
- apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
+ pr_info("..... works.\n");
goto out;
}
- local_irq_disable();
- apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
- if (apic_is_x2apic_enabled())
- apic_printk(APIC_QUIET, KERN_INFO
- "Perhaps problem with the pre-enabled x2apic mode\n"
- "Try booting with x2apic and interrupt-remapping disabled in the bios.\n");
+
+ pr_info("..... failed :\n");
+ if (apic_is_x2apic_enabled()) {
+ pr_info("Perhaps problem with the pre-enabled x2apic mode\n"
+ "Try booting with x2apic and interrupt-remapping disabled in the bios.\n");
+ }
panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
"report. Then try booting with the 'noapic' option.\n");
out:
- local_irq_restore(flags);
+ local_irq_enable();
}
/*
@@ -2241,60 +2212,61 @@ out:
static int mp_irqdomain_create(int ioapic)
{
- struct irq_alloc_info info;
- struct irq_domain *parent;
+ struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
int hwirqs = mp_ioapic_pin_count(ioapic);
struct ioapic *ip = &ioapics[ioapic];
struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg;
- struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+ struct irq_domain *parent;
struct fwnode_handle *fn;
- char *name = "IO-APIC";
+ struct irq_fwspec fwspec;
if (cfg->type == IOAPIC_DOMAIN_INVALID)
return 0;
- init_irq_alloc_info(&info, NULL);
- info.type = X86_IRQ_ALLOC_TYPE_IOAPIC;
- info.ioapic_id = mpc_ioapic_id(ioapic);
- parent = irq_remapping_get_ir_irq_domain(&info);
- if (!parent)
- parent = x86_vector_domain;
- else
- name = "IO-APIC-IR";
-
/* Handle device tree enumerated APICs proper */
if (cfg->dev) {
- fn = of_node_to_fwnode(cfg->dev);
+ fn = of_fwnode_handle(cfg->dev);
} else {
- fn = irq_domain_alloc_named_id_fwnode(name, ioapic);
+ fn = irq_domain_alloc_named_id_fwnode("IO-APIC", mpc_ioapic_id(ioapic));
if (!fn)
return -ENOMEM;
}
- ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops,
- (void *)(long)ioapic);
+ fwspec.fwnode = fn;
+ fwspec.param_count = 1;
+ fwspec.param[0] = mpc_ioapic_id(ioapic);
- /* Release fw handle if it was allocated above */
- if (!cfg->dev)
- irq_domain_free_fwnode(fn);
+ parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_GENERIC_MSI);
+ if (!parent) {
+ if (!cfg->dev)
+ irq_domain_free_fwnode(fn);
+ return -ENODEV;
+ }
- if (!ip->irqdomain)
+ ip->irqdomain = irq_domain_create_hierarchy(parent, 0, hwirqs, fn, cfg->ops,
+ (void *)(long)ioapic);
+ if (!ip->irqdomain) {
+ /* Release fw handle if it was allocated above */
+ if (!cfg->dev)
+ irq_domain_free_fwnode(fn);
return -ENOMEM;
+ }
- ip->irqdomain->parent = parent;
-
- if (cfg->type == IOAPIC_DOMAIN_LEGACY ||
- cfg->type == IOAPIC_DOMAIN_STRICT)
- ioapic_dynirq_base = max(ioapic_dynirq_base,
- gsi_cfg->gsi_end + 1);
+ if (cfg->type == IOAPIC_DOMAIN_LEGACY || cfg->type == IOAPIC_DOMAIN_STRICT)
+ ioapic_dynirq_base = max(ioapic_dynirq_base, gsi_cfg->gsi_end + 1);
return 0;
}
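Instead of asking the remapping code directly for a parent domain, the I/O-APIC now publishes a one-parameter fwspec (its APIC ID) and lets irq_find_matching_fwspec() pick whichever registered parent claims the DOMAIN_BUS_GENERIC_MSI token. A toy model of that select-based lookup; all types and the single registered domain here are illustrative, not the kernel's irqdomain API:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct fwspec { int param_count; unsigned int param[1]; };

struct domain {
	const char *name;
	bool (*select)(const struct fwspec *fws);
};

static bool vector_select(const struct fwspec *fws)
{
	/* A default parent that accepts any single-parameter I/O-APIC spec */
	return fws->param_count == 1;
}

static struct domain domains[] = {
	{ "x86-vector", vector_select },
};

static struct domain *find_matching_domain(const struct fwspec *fws)
{
	for (size_t i = 0; i < sizeof(domains) / sizeof(domains[0]); i++)
		if (domains[i].select(fws))
			return &domains[i];
	return NULL;	/* mp_irqdomain_create() returns -ENODEV here */
}

int main(void)
{
	struct fwspec fws = { .param_count = 1, .param = { 8 } };	/* APIC ID 8 */
	struct domain *parent = find_matching_domain(&fws);

	printf("parent domain: %s\n", parent ? parent->name : "none");
	return 0;
}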
static void ioapic_destroy_irqdomain(int idx)
{
+	struct ioapic_domain_cfg *cfg = &ioapics[idx].irqdomain_cfg;
+
 	if (ioapics[idx].irqdomain) {
+		struct fwnode_handle *fn = ioapics[idx].irqdomain->fwnode;
+
 		irq_domain_remove(ioapics[idx].irqdomain);
+		if (!cfg->dev)
+			irq_domain_free_fwnode(fn);
ioapics[idx].irqdomain = NULL;
}
}
@@ -2303,18 +2275,16 @@ void __init setup_IO_APIC(void)
{
int ioapic;
- if (skip_ioapic_setup || !nr_ioapics)
+ if (ioapic_is_disabled || !nr_ioapics)
return;
io_apic_irqs = nr_legacy_irqs() ? ~PIC_IRQS : ~0UL;
- apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
+ apic_pr_verbose("ENABLING IO-APIC IRQs\n");
for_each_ioapic(ioapic)
BUG_ON(mp_irqdomain_create(ioapic));
- /*
- * Set up IO-APIC IRQ routing.
- */
+ /* Set up IO-APIC IRQ routing. */
x86_init.mpparse.setup_ioapic_ids();
sync_Arb_IDs();
@@ -2328,19 +2298,22 @@ void __init setup_IO_APIC(void)
static void resume_ioapic_id(int ioapic_idx)
{
- unsigned long flags;
union IO_APIC_reg_00 reg_00;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
reg_00.raw = io_apic_read(ioapic_idx, 0);
if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) {
reg_00.bits.ID = mpc_ioapic_id(ioapic_idx);
io_apic_write(ioapic_idx, 0, reg_00.raw);
}
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
-static void ioapic_resume(void)
+static int ioapic_suspend(void *data)
+{
+ return save_ioapic_entries();
+}
+
+static void ioapic_resume(void *data)
{
int ioapic_idx;
@@ -2350,14 +2323,18 @@ static void ioapic_resume(void)
restore_ioapic_entries();
}
-static struct syscore_ops ioapic_syscore_ops = {
- .suspend = save_ioapic_entries,
- .resume = ioapic_resume,
+static const struct syscore_ops ioapic_syscore_ops = {
+ .suspend = ioapic_suspend,
+ .resume = ioapic_resume,
+};
+
+static struct syscore ioapic_syscore = {
+ .ops = &ioapic_syscore_ops,
};
static int __init ioapic_init_ops(void)
{
- register_syscore_ops(&ioapic_syscore_ops);
+ register_syscore(&ioapic_syscore);
return 0;
}
@@ -2367,118 +2344,104 @@ device_initcall(ioapic_init_ops);
static int io_apic_get_redir_entries(int ioapic)
{
union IO_APIC_reg_01 reg_01;
- unsigned long flags;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
reg_01.raw = io_apic_read(ioapic, 1);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- /* The register returns the maximum index redir index
- * supported, which is one less than the total number of redir
- * entries.
+ /*
+ * The register returns the maximum index redir index supported,
+ * which is one less than the total number of redir entries.
*/
return reg_01.bits.entries + 1;
}
unsigned int arch_dynirq_lower_bound(unsigned int from)
{
+ unsigned int ret;
+
/*
* dmar_alloc_hwirq() may be called before setup_IO_APIC(), so use
* gsi_top if ioapic_dynirq_base hasn't been initialized yet.
*/
- return ioapic_initialized ? ioapic_dynirq_base : gsi_top;
+ ret = ioapic_dynirq_base ? : gsi_top;
+
+ /*
+ * For DT enabled machines ioapic_dynirq_base is irrelevant and
+ * always 0. gsi_top can be 0 if there is no IO/APIC registered.
+ * 0 is an invalid interrupt number for dynamic allocations. Return
+ * @from instead.
+ */
+ return ret ? : from;
}
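The "x ? : y" form used twice here is the GNU ?: extension: it evaluates x once and yields it when nonzero, falling back to y otherwise, which chains naturally for the two fallbacks described in the comments. A compilable check of the chain (GCC/Clang):

#include <stdio.h>

int main(void)
{
	unsigned int ioapic_dynirq_base = 0, gsi_top = 24, from = 16;

	/* First fallback: dynirq base not set up yet -> gsi_top */
	unsigned int ret = ioapic_dynirq_base ? : gsi_top;

	/* Second fallback: both zero (DT machine, no IO-APIC) -> from */
	printf("lower bound = %u\n", ret ? : from);
	return 0;
}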
#ifdef CONFIG_X86_32
static int io_apic_get_unique_id(int ioapic, int apic_id)
{
+ static DECLARE_BITMAP(apic_id_map, MAX_LOCAL_APIC);
+ const u32 broadcast_id = 0xF;
union IO_APIC_reg_00 reg_00;
- static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
- physid_mask_t tmp;
- unsigned long flags;
int i = 0;
- /*
- * The P4 platform supports up to 256 APIC IDs on two separate APIC
- * buses (one for LAPICs, one for IOAPICs), where predecessors only
- * supports up to 16 on one shared APIC bus.
- *
- * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
- * advantage of new APIC bus architecture.
- */
+ /* Initialize the ID map */
+ if (bitmap_empty(apic_id_map, MAX_LOCAL_APIC))
+ copy_phys_cpu_present_map(apic_id_map);
- if (physids_empty(apic_id_map))
- apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map);
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(ioapic, 0);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock)
+ reg_00.raw = io_apic_read(ioapic, 0);
- if (apic_id >= get_physical_broadcast()) {
- printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
- "%d\n", ioapic, apic_id, reg_00.bits.ID);
+ if (apic_id >= broadcast_id) {
+ pr_warn("IOAPIC[%d]: Invalid apic_id %d, trying %d\n",
+ ioapic, apic_id, reg_00.bits.ID);
apic_id = reg_00.bits.ID;
}
- /*
- * Every APIC in a system must have a unique ID or we get lots of nice
- * 'stuck on smp_invalidate_needed IPI wait' messages.
- */
- if (apic->check_apicid_used(&apic_id_map, apic_id)) {
-
- for (i = 0; i < get_physical_broadcast(); i++) {
- if (!apic->check_apicid_used(&apic_id_map, i))
+ /* Every APIC in a system must have a unique ID */
+ if (test_bit(apic_id, apic_id_map)) {
+ for (i = 0; i < broadcast_id; i++) {
+ if (!test_bit(i, apic_id_map))
break;
}
- if (i == get_physical_broadcast())
+ if (i == broadcast_id)
panic("Max apic_id exceeded!\n");
- printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
- "trying %d\n", ioapic, apic_id, i);
-
+ pr_warn("IOAPIC[%d]: apic_id %d already used, trying %d\n", ioapic, apic_id, i);
apic_id = i;
}
- apic->apicid_to_cpu_present(apic_id, &tmp);
- physids_or(apic_id_map, apic_id_map, tmp);
+ set_bit(apic_id, apic_id_map);
if (reg_00.bits.ID != apic_id) {
reg_00.bits.ID = apic_id;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(ioapic, 0, reg_00.raw);
- reg_00.raw = io_apic_read(ioapic, 0);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock) {
+ io_apic_write(ioapic, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(ioapic, 0);
+ }
/* Sanity check */
if (reg_00.bits.ID != apic_id) {
- pr_err("IOAPIC[%d]: Unable to change apic_id!\n",
- ioapic);
+ pr_err("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
return -1;
}
}
- apic_printk(APIC_VERBOSE, KERN_INFO
- "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
+ apic_pr_verbose("IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
return apic_id;
}
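
The rewrite above replaces the physid_mask_t machinery with a plain bitmap. The underlying allocation idea, reduced to a hypothetical standalone helper over 16 IDs:

	static DECLARE_BITMAP(ids, 16);

	static int alloc_preferred_id(int wanted)
	{
		if (test_bit(wanted, ids))	/* taken? pick another */
			wanted = find_first_zero_bit(ids, 16);
		if (wanted >= 16)
			return -ENOSPC;
		set_bit(wanted, ids);
		return wanted;
	}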
static u8 io_apic_unique_id(int idx, u8 id)
{
- if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
- !APIC_XAPIC(boot_cpu_apic_version))
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && !APIC_XAPIC(boot_cpu_apic_version))
return io_apic_get_unique_id(idx, id);
- else
- return id;
+ return id;
}
#else
static u8 io_apic_unique_id(int idx, u8 id)
{
union IO_APIC_reg_00 reg_00;
DECLARE_BITMAP(used, 256);
- unsigned long flags;
u8 new_id;
int i;
@@ -2494,26 +2457,23 @@ static u8 io_apic_unique_id(int idx, u8 id)
* Read the current id from the ioapic and keep it if
* available.
*/
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(idx, 0);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock)
+ reg_00.raw = io_apic_read(idx, 0);
+
new_id = reg_00.bits.ID;
if (!test_bit(new_id, used)) {
- apic_printk(APIC_VERBOSE, KERN_INFO
- "IOAPIC[%d]: Using reg apic_id %d instead of %d\n",
- idx, new_id, id);
+ apic_pr_verbose("IOAPIC[%d]: Using reg apic_id %d instead of %d\n",
+ idx, new_id, id);
return new_id;
}
- /*
- * Get the next free id and write it to the ioapic.
- */
+ /* Get the next free id and write it to the ioapic. */
new_id = find_first_zero_bit(used, 256);
reg_00.bits.ID = new_id;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(idx, 0, reg_00.raw);
- reg_00.raw = io_apic_read(idx, 0);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock) {
+ io_apic_write(idx, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(idx, 0);
+ }
/* Sanity check */
BUG_ON(reg_00.bits.ID != new_id);
@@ -2523,40 +2483,14 @@ static u8 io_apic_unique_id(int idx, u8 id)
static int io_apic_get_version(int ioapic)
{
- union IO_APIC_reg_01 reg_01;
- unsigned long flags;
+ union IO_APIC_reg_01 reg_01;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
reg_01.raw = io_apic_read(ioapic, 1);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return reg_01.bits.version;
}
-int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
-{
- int ioapic, pin, idx;
-
- if (skip_ioapic_setup)
- return -1;
-
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0)
- return -1;
-
- pin = mp_find_ioapic_pin(ioapic, gsi);
- if (pin < 0)
- return -1;
-
- idx = find_irq_entry(ioapic, pin, mp_INT);
- if (idx < 0)
- return -1;
-
- *trigger = irq_trigger(idx);
- *polarity = irq_polarity(idx);
- return 0;
-}
-
/*
* This function updates target affinity of IOAPIC interrupts to include
* the CPUs which came online during SMP bringup.
@@ -2567,8 +2501,8 @@ static struct resource *ioapic_resources;
static struct resource * __init ioapic_setup_resources(void)
{
- unsigned long n;
struct resource *res;
+ unsigned long n;
char *mem;
int i;
@@ -2578,7 +2512,7 @@ static struct resource * __init ioapic_setup_resources(void)
n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
n *= nr_ioapics;
- mem = memblock_alloc(n, SMP_CACHE_BYTES);
+ mem = memblock_alloc_or_panic(n, SMP_CACHE_BYTES);
res = (void *)mem;
mem += sizeof(struct resource) * nr_ioapics;
@@ -2596,6 +2530,24 @@ static struct resource * __init ioapic_setup_resources(void)
return res;
}
+static void io_apic_set_fixmap(enum fixed_addresses idx, phys_addr_t phys)
+{
+ pgprot_t flags = FIXMAP_PAGE_NOCACHE;
+
+ /*
+ * Ensure fixmaps for IO-APIC MMIO respect memory encryption pgprot
+ * bits, just like normal ioremap():
+ */
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
+ if (x86_platform.hyper.is_private_mmio(phys))
+ flags = pgprot_encrypted(flags);
+ else
+ flags = pgprot_decrypted(flags);
+ }
+
+ __set_fixmap(idx, phys, flags);
+}
+
void __init io_apic_init_mappings(void)
{
unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
@@ -2608,12 +2560,10 @@ void __init io_apic_init_mappings(void)
ioapic_phys = mpc_ioapic_addr(i);
#ifdef CONFIG_X86_32
if (!ioapic_phys) {
- printk(KERN_ERR
- "WARNING: bogus zero IO-APIC "
- "address found in MPTABLE, "
+ pr_err("WARNING: bogus zero IO-APIC address found in MPTABLE, "
"disabling IO/APIC support!\n");
smp_found_config = 0;
- skip_ioapic_setup = 1;
+ ioapic_is_disabled = true;
goto fake_ioapic_page;
}
#endif
@@ -2621,14 +2571,13 @@ void __init io_apic_init_mappings(void)
#ifdef CONFIG_X86_32
fake_ioapic_page:
#endif
- ioapic_phys = (unsigned long)memblock_alloc(PAGE_SIZE,
+ ioapic_phys = (unsigned long)memblock_alloc_or_panic(PAGE_SIZE,
PAGE_SIZE);
ioapic_phys = __pa(ioapic_phys);
}
- set_fixmap_nocache(idx, ioapic_phys);
- apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n",
- __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK),
- ioapic_phys);
+ io_apic_set_fixmap(idx, ioapic_phys);
+ apic_pr_verbose("mapped IOAPIC to %08lx (%08lx)\n",
+ __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), ioapic_phys);
idx++;
ioapic_res->start = ioapic_phys;
@@ -2639,13 +2588,12 @@ fake_ioapic_page:
void __init ioapic_insert_resources(void)
{
- int i;
struct resource *r = ioapic_resources;
+ int i;
if (!r) {
if (nr_ioapics > 0)
- printk(KERN_ERR
- "IO APIC resources couldn't be allocated.\n");
+ pr_err("IO APIC resources couldn't be allocated.\n");
return;
}
@@ -2665,11 +2613,12 @@ int mp_find_ioapic(u32 gsi)
/* Find the IOAPIC that manages this GSI. */
for_each_ioapic(i) {
struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i);
+
if (gsi >= gsi_cfg->gsi_base && gsi <= gsi_cfg->gsi_end)
return i;
}
- printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+ pr_err("ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
return -1;
}
@@ -2708,12 +2657,10 @@ static int bad_ioapic_register(int idx)
static int find_free_ioapic_entry(void)
{
- int idx;
-
- for (idx = 0; idx < MAX_IO_APICS; idx++)
+ for (int idx = 0; idx < MAX_IO_APICS; idx++) {
if (ioapics[idx].nr_registers == 0)
return idx;
-
+ }
return MAX_IO_APICS;
}
@@ -2724,8 +2671,7 @@ static int find_free_ioapic_entry(void)
* @gsi_base: base of GSI associated with the IOAPIC
* @cfg: configuration information for the IOAPIC
*/
-int mp_register_ioapic(int id, u32 address, u32 gsi_base,
- struct ioapic_domain_cfg *cfg)
+int mp_register_ioapic(int id, u32 address, u32 gsi_base, struct ioapic_domain_cfg *cfg)
{
bool hotplug = !!ioapic_initialized;
struct mp_ioapic_gsi *gsi_cfg;
@@ -2736,12 +2682,13 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base,
pr_warn("Bogus (zero) I/O APIC address found, skipping!\n");
return -EINVAL;
}
- for_each_ioapic(ioapic)
+
+ for_each_ioapic(ioapic) {
if (ioapics[ioapic].mp_config.apicaddr == address) {
- pr_warn("address 0x%x conflicts with IOAPIC%d\n",
- address, ioapic);
+ pr_warn("address 0x%x conflicts with IOAPIC%d\n", address, ioapic);
return -EEXIST;
}
+ }
idx = find_free_ioapic_entry();
if (idx >= MAX_IO_APICS) {
@@ -2754,7 +2701,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base,
ioapics[idx].mp_config.flags = MPC_APIC_USABLE;
ioapics[idx].mp_config.apicaddr = address;
- set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+ io_apic_set_fixmap(FIX_IO_APIC_BASE_0 + idx, address);
if (bad_ioapic_register(idx)) {
clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
return -ENODEV;
@@ -2776,8 +2723,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base,
(gsi_end >= gsi_cfg->gsi_base &&
gsi_end <= gsi_cfg->gsi_end)) {
pr_warn("GSI range [%u-%u] for new IOAPIC conflicts with GSI[%u-%u]\n",
- gsi_base, gsi_end,
- gsi_cfg->gsi_base, gsi_cfg->gsi_end);
+ gsi_base, gsi_end, gsi_cfg->gsi_base, gsi_cfg->gsi_end);
clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
return -ENOSPC;
}
@@ -2791,7 +2737,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base,
/*
* If mp_register_ioapic() is called during early boot stage when
- * walking ACPI/SFI/DT tables, it's too early to create irqdomain,
+ * walking ACPI/DT tables, it's too early to create the irqdomain;
 * we are still using the bootmem allocator. So delay it to setup_IO_APIC().
*/
if (hotplug) {
@@ -2811,8 +2757,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base,
ioapics[idx].nr_registers = entries;
pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %d-%d\n",
- idx, mpc_ioapic_id(idx),
- mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
+ idx, mpc_ioapic_id(idx), mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
gsi_cfg->gsi_base, gsi_cfg->gsi_end);
return 0;
@@ -2823,11 +2768,13 @@ int mp_unregister_ioapic(u32 gsi_base)
int ioapic, pin;
int found = 0;
- for_each_ioapic(ioapic)
+ for_each_ioapic(ioapic) {
if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) {
found = 1;
break;
}
+ }
+
if (!found) {
pr_warn("can't find IOAPIC for GSI %d\n", gsi_base);
return -ENODEV;
@@ -2841,8 +2788,7 @@ int mp_unregister_ioapic(u32 gsi_base)
if (irq >= 0) {
data = irq_get_chip_data(irq);
if (data && data->count) {
- pr_warn("pin%d on IOAPIC%d is still in use.\n",
- pin, ioapic);
+ pr_warn("pin%d on IOAPIC%d is still in use.\n", pin, ioapic);
return -EBUSY;
}
}
@@ -2874,45 +2820,49 @@ int mp_ioapic_registered(u32 gsi_base)
static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data,
struct irq_alloc_info *info)
{
- if (info && info->ioapic_valid) {
- data->trigger = info->ioapic_trigger;
- data->polarity = info->ioapic_polarity;
- } else if (acpi_get_override_irq(gsi, &data->trigger,
- &data->polarity) < 0) {
+ if (info && info->ioapic.valid) {
+ data->is_level = info->ioapic.is_level;
+ data->active_low = info->ioapic.active_low;
+ } else if (__acpi_get_override_irq(gsi, &data->is_level, &data->active_low) < 0) {
/* PCI interrupts are always active low level triggered. */
- data->trigger = IOAPIC_LEVEL;
- data->polarity = IOAPIC_POL_LOW;
+ data->is_level = true;
+ data->active_low = true;
}
}
-static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data,
- struct IO_APIC_route_entry *entry)
+/*
+ * Configure the I/O-APIC specific fields in the routing entry.
+ *
+ * This is important to setup the I/O-APIC specific bits (is_level,
+ * active_low, masked) because the underlying parent domain will only
+ * provide the routing information and is oblivious of the I/O-APIC
+ * specific bits.
+ *
+ * The entry is just preconfigured at this point and not written into the
+ * RTE. This happens later during activation which will fill in the actual
+ * routing information.
+ */
+static void mp_preconfigure_entry(struct mp_chip_data *data)
{
+ struct IO_APIC_route_entry *entry = &data->entry;
+
memset(entry, 0, sizeof(*entry));
- entry->delivery_mode = apic->irq_delivery_mode;
- entry->dest_mode = apic->irq_dest_mode;
- entry->dest = cfg->dest_apicid;
- entry->vector = cfg->vector;
- entry->trigger = data->trigger;
- entry->polarity = data->polarity;
+ entry->is_level = data->is_level;
+ entry->active_low = data->active_low;
/*
* Mask level triggered irqs. Edge triggered irqs are masked
* by the irq core code in case they fire.
*/
- if (data->trigger == IOAPIC_LEVEL)
- entry->mask = IOAPIC_MASKED;
- else
- entry->mask = IOAPIC_UNMASKED;
+ entry->masked = data->is_level;
}
int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
unsigned int nr_irqs, void *arg)
{
- int ret, ioapic, pin;
- struct irq_cfg *cfg;
- struct irq_data *irq_data;
- struct mp_chip_data *data;
struct irq_alloc_info *info = arg;
+ struct mp_chip_data *data;
+ struct irq_data *irq_data;
+ int ret, ioapic, pin;
unsigned long flags;
if (!info || nr_irqs > 1)
@@ -2922,45 +2872,47 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
return -EINVAL;
ioapic = mp_irqdomain_ioapic_idx(domain);
- pin = info->ioapic_pin;
- if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0)
+ pin = info->ioapic.pin;
+ if (irq_resolve_mapping(domain, (irq_hw_number_t)pin))
return -EEXIST;
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (!data)
return -ENOMEM;
- info->ioapic_entry = &data->entry;
ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info);
- if (ret < 0) {
- kfree(data);
- return ret;
- }
+ if (ret < 0)
+ goto free_data;
INIT_LIST_HEAD(&data->irq_2_pin);
- irq_data->hwirq = info->ioapic_pin;
+ irq_data->hwirq = info->ioapic.pin;
irq_data->chip = (domain->parent == x86_vector_domain) ?
&ioapic_chip : &ioapic_ir_chip;
irq_data->chip_data = data;
mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info);
- cfg = irqd_cfg(irq_data);
- add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin);
+ if (!add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin)) {
+ ret = -ENOMEM;
+ goto free_irqs;
+ }
+
+ mp_preconfigure_entry(data);
+ mp_register_handler(virq, data->is_level);
local_irq_save(flags);
- if (info->ioapic_entry)
- mp_setup_entry(cfg, data, info->ioapic_entry);
- mp_register_handler(virq, data->trigger);
if (virq < nr_legacy_irqs())
legacy_pic->mask(virq);
local_irq_restore(flags);
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n",
- ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector,
- virq, data->trigger, data->polarity, cfg->dest_apicid);
-
+ apic_pr_verbose("IOAPIC[%d]: Preconfigured routing entry (%d-%d -> IRQ %d Level:%i ActiveLow:%i)\n",
+ ioapic, mpc_ioapic_id(ioapic), pin, virq, data->is_level, data->active_low);
return 0;
+
+free_irqs:
+ irq_domain_free_irqs_parent(domain, virq, nr_irqs);
+free_data:
+ kfree(data);
+ return ret;
}
void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
@@ -2973,22 +2925,17 @@ void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
irq_data = irq_domain_get_irq_data(domain, virq);
if (irq_data && irq_data->chip_data) {
data = irq_data->chip_data;
- __remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain),
- (int)irq_data->hwirq);
+ __remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain), (int)irq_data->hwirq);
WARN_ON(!list_empty(&data->irq_2_pin));
kfree(irq_data->chip_data);
}
irq_domain_free_irqs_top(domain, virq, nr_irqs);
}
-int mp_irqdomain_activate(struct irq_domain *domain,
- struct irq_data *irq_data, bool reserve)
+int mp_irqdomain_activate(struct irq_domain *domain, struct irq_data *irq_data, bool reserve)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
ioapic_configure_entry(irq_data);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return 0;
}
@@ -2996,8 +2943,7 @@ void mp_irqdomain_deactivate(struct irq_domain *domain,
struct irq_data *irq_data)
{
/* It won't be called for IRQ with multiple IOAPIC pins associated */
- ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain),
- (int)irq_data->hwirq);
+ ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain), (int)irq_data->hwirq);
}
int mp_irqdomain_ioapic_idx(struct irq_domain *domain)
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 82f9244fe61f..98a57cb4aa86 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -1,81 +1,189 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/cpumask.h>
-#include <linux/interrupt.h>
-#include <linux/mm.h>
+#include <linux/cpumask.h>
#include <linux/delay.h>
-#include <linux/spinlock.h>
-#include <linux/kernel_stat.h>
-#include <linux/mc146818rtc.h>
-#include <linux/cache.h>
-#include <linux/cpu.h>
-
-#include <asm/smp.h>
-#include <asm/mtrr.h>
-#include <asm/tlbflush.h>
-#include <asm/mmu_context.h>
-#include <asm/apic.h>
-#include <asm/proto.h>
-#include <asm/ipi.h>
-
-void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest)
+#include <linux/smp.h>
+#include <linux/string_choices.h>
+
+#include <asm/io_apic.h>
+
+#include "local.h"
+
+DEFINE_STATIC_KEY_FALSE(apic_use_ipi_shorthand);
+
+#ifdef CONFIG_SMP
+static int apic_ipi_shorthand_off __ro_after_init;
+
+static __init int apic_ipi_shorthand(char *str)
{
- /*
- * Subtle. In the case of the 'never do double writes' workaround
- * we have to lock out interrupts to be safe. As we don't care
- * of the value read we use an atomic rmw access to avoid costly
- * cli/sti. Otherwise we use an even cheaper single atomic write
- * to the APIC.
- */
- unsigned int cfg;
+ get_option(&str, &apic_ipi_shorthand_off);
+ return 1;
+}
+__setup("no_ipi_broadcast=", apic_ipi_shorthand);
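
__setup() registers an early command-line hook and get_option() parses the integer after the '=' sign. A minimal sketch for a hypothetical my_knob= parameter:

	static int my_knob __ro_after_init;

	static int __init parse_my_knob(char *str)
	{
		get_option(&str, &my_knob);
		return 1;	/* non-zero: option consumed */
	}
	__setup("my_knob=", parse_my_knob);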
- /*
- * Wait for idle.
- */
- __xapic_wait_icr_idle();
+static int __init print_ipi_mode(void)
+{
+ pr_info("IPI shorthand broadcast: %s\n",
+ str_disabled_enabled(apic_ipi_shorthand_off));
+ return 0;
+}
+late_initcall(print_ipi_mode);
+void apic_smt_update(void)
+{
/*
- * No need to touch the target chip field
+ * Do not switch to broadcast mode if:
+ * - Disabled on the command line
+ * - Only a single CPU is online
+ * - Not all present CPUs have been at least booted once
+ *
+ * The latter is important as the local APIC might be in some
+ * random state and a broadcast might cause havoc. That's
+ * especially true for NMI broadcasting.
*/
- cfg = __prepare_ICR(shortcut, vector, dest);
+ if (apic_ipi_shorthand_off || num_online_cpus() == 1 ||
+ !cpumask_equal(cpu_present_mask, &cpus_booted_once_mask)) {
+ static_branch_disable(&apic_use_ipi_shorthand);
+ } else {
+ static_branch_enable(&apic_use_ipi_shorthand);
+ }
+}
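
apic_use_ipi_shorthand is a static key: the slow-path enable/disable calls patch the kernel text, so static_branch_likely() in the IPI hot paths costs a jump/nop rather than a memory load and test. A sketch of the pattern with hypothetical names:

	DEFINE_STATIC_KEY_FALSE(use_fast_path);

	static void policy_changed(bool fast_ok)
	{
		if (fast_ok)
			static_branch_enable(&use_fast_path);
		else
			static_branch_disable(&use_fast_path);
	}

	static void hot_path(void)
	{
		if (static_branch_likely(&use_fast_path))
			do_fast();	/* hypothetical */
		else
			do_slow();	/* hypothetical */
	}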
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- native_apic_mem_write(APIC_ICR, cfg);
+void apic_send_IPI_allbutself(unsigned int vector)
+{
+ if (num_online_cpus() < 2)
+ return;
+
+ if (static_branch_likely(&apic_use_ipi_shorthand))
+ __apic_send_IPI_allbutself(vector);
+ else
+ __apic_send_IPI_mask_allbutself(cpu_online_mask, vector);
}
/*
- * This is used to send an IPI with no shorthand notation (the destination is
- * specified in bits 56 to 63 of the ICR).
+ * Send a 'reschedule' IPI to another CPU. It goes straight through and
+ * wastes no time serializing anything. Worst case is that we lose a
+ * reschedule ...
*/
-void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest)
+void native_smp_send_reschedule(int cpu)
+{
+ if (unlikely(cpu_is_offline(cpu))) {
+ WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu);
+ return;
+ }
+ __apic_send_IPI(cpu, RESCHEDULE_VECTOR);
+}
+
+void native_send_call_func_single_ipi(int cpu)
+{
+ __apic_send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR);
+}
+
+void native_send_call_func_ipi(const struct cpumask *mask)
+{
+ if (static_branch_likely(&apic_use_ipi_shorthand)) {
+ unsigned int cpu = smp_processor_id();
+
+ if (!cpumask_or_equal(mask, cpumask_of(cpu), cpu_online_mask))
+ goto sendmask;
+
+ if (cpumask_test_cpu(cpu, mask))
+ __apic_send_IPI_all(CALL_FUNCTION_VECTOR);
+ else if (num_online_cpus() > 1)
+ __apic_send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+ return;
+ }
+
+sendmask:
+ __apic_send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
+}
+
+void apic_send_nmi_to_offline_cpu(unsigned int cpu)
+{
+ if (WARN_ON_ONCE(!apic->nmi_to_offline_cpu))
+ return;
+ if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, &cpus_booted_once_mask)))
+ return;
+ apic->send_IPI(cpu, NMI_VECTOR);
+}
+#endif /* CONFIG_SMP */
+
+static inline int __prepare_ICR2(unsigned int mask)
{
- unsigned long cfg;
+ return SET_XAPIC_DEST_FIELD(mask);
+}
+u32 apic_mem_wait_icr_idle_timeout(void)
+{
+ int cnt;
+
+ for (cnt = 0; cnt < 1000; cnt++) {
+ if (!(apic_read(APIC_ICR) & APIC_ICR_BUSY))
+ return 0;
+ inc_irq_stat(icr_read_retry_count);
+ udelay(100);
+ }
+ return APIC_ICR_BUSY;
+}
+
+void apic_mem_wait_icr_idle(void)
+{
+ while (native_apic_mem_read(APIC_ICR) & APIC_ICR_BUSY)
+ cpu_relax();
+}
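
The two waiters differ only in their bound: apic_mem_wait_icr_idle() spins indefinitely, while the _timeout() variant gives up after 1000 polls spaced 100 us apart, roughly 100 ms in total, and returns APIC_ICR_BUSY so the caller can carry on (the NMI/kdump case). The generic shape, assuming a hypothetical is_busy() predicate:

	static int wait_until_idle(void)
	{
		for (int cnt = 0; cnt < 1000; cnt++) {
			if (!is_busy())
				return 0;
			udelay(100);	/* total bound: ~100 ms */
		}
		return -ETIMEDOUT;
	}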
+
+/*
+ * This is safe against interruption because it only writes the lower 32
+ * bits of the APIC_ICR register. The destination field is ignored for
+ * shorthand IPIs.
+ *
+ * wait_icr_idle()
+ * write(ICR2, dest)
+ * NMI
+ * wait_icr_idle()
+ * write(ICR)
+ * wait_icr_idle()
+ * write(ICR)
+ *
+ * This function does not need to disable interrupts as there is no ICR2
+ * interaction. The memory write is direct except when the machine is
+ * affected by the 11AP Pentium erratum, which turns the plain write into
+ * an XCHG operation.
+ */
+static void __default_send_IPI_shortcut(unsigned int shortcut, int vector)
+{
/*
- * Wait for idle.
+ * Wait for the previous ICR command to complete. Use
+ * apic_mem_wait_icr_idle_timeout() for the NMI vector as there have been
+ * issues where otherwise the system hangs when the panic CPU tries
+ * to stop the others before launching the kdump kernel.
*/
if (unlikely(vector == NMI_VECTOR))
- safe_apic_wait_icr_idle();
+ apic_mem_wait_icr_idle_timeout();
else
- __xapic_wait_icr_idle();
+ apic_mem_wait_icr_idle();
- /*
- * prepare target chip field
- */
- cfg = __prepare_ICR2(mask);
- native_apic_mem_write(APIC_ICR2, cfg);
+ /* Destination field (ICR2) and the destination mode are ignored */
+ native_apic_mem_write(APIC_ICR, __prepare_ICR(shortcut, vector, 0));
+}
- /*
- * program the ICR
- */
- cfg = __prepare_ICR(0, vector, dest);
+/*
+ * This is used to send an IPI with no shorthand notation (the destination is
+ * specified in bits 56 to 63 of the ICR).
+ */
+void __default_send_IPI_dest_field(unsigned int dest_mask, int vector,
+ unsigned int dest_mode)
+{
+ /* See comment in __default_send_IPI_shortcut() */
+ if (unlikely(vector == NMI_VECTOR))
+ apic_mem_wait_icr_idle_timeout();
+ else
+ apic_mem_wait_icr_idle();
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- native_apic_mem_write(APIC_ICR, cfg);
+ /* Set the IPI destination field in the ICR */
+ native_apic_mem_write(APIC_ICR2, __prepare_ICR2(dest_mask));
+ /* Send it with the proper destination mode */
+ native_apic_mem_write(APIC_ICR, __prepare_ICR(0, vector, dest_mode));
}
void default_send_IPI_single_phys(int cpu, int vector)
@@ -90,18 +198,13 @@ void default_send_IPI_single_phys(int cpu, int vector)
void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector)
{
- unsigned long query_cpu;
unsigned long flags;
+ unsigned long cpu;
- /*
- * Hack. The clustered APIC addressing mode doesn't allow us to send
- * to an arbitrary mask, so I do a unicast to each CPU instead.
- * - mbligh
- */
local_irq_save(flags);
- for_each_cpu(query_cpu, mask) {
+ for_each_cpu(cpu, mask) {
__default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
- query_cpu), vector, APIC_DEST_PHYSICAL);
+ cpu), vector, APIC_DEST_PHYSICAL);
}
local_irq_restore(flags);
}
@@ -109,18 +212,15 @@ void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector)
void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
int vector)
{
- unsigned int this_cpu = smp_processor_id();
- unsigned int query_cpu;
+ unsigned int cpu, this_cpu = smp_processor_id();
unsigned long flags;
- /* See Hack comment above */
-
local_irq_save(flags);
- for_each_cpu(query_cpu, mask) {
- if (query_cpu == this_cpu)
+ for_each_cpu(cpu, mask) {
+ if (cpu == this_cpu)
continue;
__default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
- query_cpu), vector, APIC_DEST_PHYSICAL);
+ cpu), vector, APIC_DEST_PHYSICAL);
}
local_irq_restore(flags);
}
@@ -130,54 +230,51 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
*/
void default_send_IPI_single(int cpu, int vector)
{
- apic->send_IPI_mask(cpumask_of(cpu), vector);
+ __apic_send_IPI_mask(cpumask_of(cpu), vector);
}
-#ifdef CONFIG_X86_32
+void default_send_IPI_allbutself(int vector)
+{
+ __default_send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
+}
-void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
- int vector)
+void default_send_IPI_all(int vector)
{
- unsigned long flags;
- unsigned int query_cpu;
+ __default_send_IPI_shortcut(APIC_DEST_ALLINC, vector);
+}
- /*
- * Hack. The clustered APIC addressing mode doesn't allow us to send
- * to an arbitrary mask, so I do a unicasts to each CPU instead. This
- * should be modified to do 1 message per cluster ID - mbligh
- */
+void default_send_IPI_self(int vector)
+{
+ __default_send_IPI_shortcut(APIC_DEST_SELF, vector);
+}
+
+#ifdef CONFIG_X86_32
+void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector)
+{
+ unsigned long flags;
+ unsigned int cpu;
local_irq_save(flags);
- for_each_cpu(query_cpu, mask)
- __default_send_IPI_dest_field(
- early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
- vector, apic->dest_logical);
+ for_each_cpu(cpu, mask)
+ __default_send_IPI_dest_field(1U << cpu, vector, APIC_DEST_LOGICAL);
local_irq_restore(flags);
}
void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
int vector)
{
+ unsigned int cpu, this_cpu = smp_processor_id();
unsigned long flags;
- unsigned int query_cpu;
- unsigned int this_cpu = smp_processor_id();
-
- /* See Hack comment above */
local_irq_save(flags);
- for_each_cpu(query_cpu, mask) {
- if (query_cpu == this_cpu)
+ for_each_cpu(cpu, mask) {
+ if (cpu == this_cpu)
continue;
- __default_send_IPI_dest_field(
- early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
- vector, apic->dest_logical);
- }
+ __default_send_IPI_dest_field(1U << cpu, vector, APIC_DEST_LOGICAL);
+ }
local_irq_restore(flags);
}
-/*
- * This is only used on smaller machines.
- */
void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
{
unsigned long mask = cpumask_bits(cpumask)[0];
@@ -188,57 +285,7 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
local_irq_save(flags);
WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
- __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
+ __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL);
local_irq_restore(flags);
}
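
In flat logical mode the ICR destination field is a bitmask with one bit per CPU, which is why these paths can simply pass 1U << cpu and why the mode tops out at 8 CPUs. A hypothetical caller targeting CPUs 0 and 3 in one shot (real callers disable interrupts around the ICR writes, as above):

	static void ipi_cpus_0_and_3(int vector)
	{
		__default_send_IPI_dest_field((1U << 0) | (1U << 3),
					      vector, APIC_DEST_LOGICAL);
	}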
-
-void default_send_IPI_allbutself(int vector)
-{
- /*
- * if there are no other CPUs in the system then we get an APIC send
- * error if we try to broadcast, thus avoid sending IPIs in this case.
- */
- if (!(num_online_cpus() > 1))
- return;
-
- __default_local_send_IPI_allbutself(vector);
-}
-
-void default_send_IPI_all(int vector)
-{
- __default_local_send_IPI_all(vector);
-}
-
-void default_send_IPI_self(int vector)
-{
- __default_send_IPI_shortcut(APIC_DEST_SELF, vector, apic->dest_logical);
-}
-
-/* must come after the send_IPI functions above for inlining */
-static int convert_apicid_to_cpu(int apic_id)
-{
- int i;
-
- for_each_possible_cpu(i) {
- if (per_cpu(x86_cpu_to_apicid, i) == apic_id)
- return i;
- }
- return -1;
-}
-
-int safe_smp_processor_id(void)
-{
- int apicid, cpuid;
-
- if (!boot_cpu_has(X86_FEATURE_APIC))
- return 0;
-
- apicid = hard_smp_processor_id();
- if (apicid == BAD_APICID)
- return 0;
-
- cpuid = convert_apicid_to_cpu(apicid);
-
- return cpuid >= 0 ? cpuid : 0;
-}
#endif
diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h
new file mode 100644
index 000000000000..bdcf609eb283
--- /dev/null
+++ b/arch/x86/kernel/apic/local.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Historical copyright notices:
+ *
+ * Copyright 2004 James Cleverdon, IBM.
+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
+ * (c) 2002,2003 Andi Kleen, SuSE Labs.
+ */
+
+#include <linux/jump_label.h>
+
+#include <asm/irq_vectors.h>
+#include <asm/apic.h>
+
+/* X2APIC */
+void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest);
+u32 x2apic_get_apic_id(u32 id);
+
+void x2apic_send_IPI_all(int vector);
+void x2apic_send_IPI_allbutself(int vector);
+void x2apic_send_IPI_self(int vector);
+extern u32 x2apic_max_apicid;
+
+/* IPI */
+
+DECLARE_STATIC_KEY_FALSE(apic_use_ipi_shorthand);
+
+static inline unsigned int __prepare_ICR(unsigned int shortcut, int vector,
+ unsigned int dest)
+{
+ unsigned int icr = shortcut | dest;
+
+ switch (vector) {
+ default:
+ icr |= APIC_DM_FIXED | vector;
+ break;
+ case NMI_VECTOR:
+ icr |= APIC_DM_NMI;
+ break;
+ }
+ return icr;
+}
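
__prepare_ICR() just ORs the pieces together; the vector doubles as the delivery-mode selector for NMIs, and the third argument carries the destination-mode bits (APIC_DEST_PHYSICAL/LOGICAL), which shorthand IPIs ignore. For example, with an assumed vector of 0xfd:

	u32 icr = __prepare_ICR(APIC_DEST_ALLBUT, 0xfd, 0);
	/* == APIC_DEST_ALLBUT | APIC_DM_FIXED | 0xfd */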
+
+void default_init_apic_ldr(void);
+
+void apic_mem_wait_icr_idle(void);
+u32 apic_mem_wait_icr_idle_timeout(void);
+
+/*
+ * This is used to send an IPI with no shorthand notation (the destination is
+ * specified in bits 56 to 63 of the ICR).
+ */
+void __default_send_IPI_dest_field(unsigned int dest_mask, int vector, unsigned int dest_mode);
+
+void default_send_IPI_single(int cpu, int vector);
+void default_send_IPI_single_phys(int cpu, int vector);
+void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector);
+void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, int vector);
+void default_send_IPI_allbutself(int vector);
+void default_send_IPI_all(int vector);
+void default_send_IPI_self(int vector);
+
+#ifdef CONFIG_X86_32
+void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector);
+void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, int vector);
+void default_send_IPI_mask_logical(const struct cpumask *mask, int vector);
+#endif
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 72a94401f9e0..66bc5d3e79db 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Support of MSI, HPET and DMAR interrupts.
*
@@ -5,10 +6,6 @@
* Moved from arch/x86/kernel/apic/io_apic.c.
* Jiang Liu <jiang.liu@linux.intel.com>
* Convert to hierarchical irqdomain
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/mm.h>
#include <linux/interrupt.h>
@@ -18,175 +15,293 @@
#include <linux/hpet.h>
#include <linux/msi.h>
#include <asm/irqdomain.h>
-#include <asm/msidef.h>
#include <asm/hpet.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/irq_remapping.h>
+#include <asm/xen/hypervisor.h>
-static struct irq_domain *msi_default_domain;
+struct irq_domain *x86_pci_msi_default_domain __ro_after_init;
-static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
+static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg)
{
- struct irq_cfg *cfg = irqd_cfg(data);
-
- msg->address_hi = MSI_ADDR_BASE_HI;
-
- if (x2apic_enabled())
- msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid);
-
- msg->address_lo =
- MSI_ADDR_BASE_LO |
- ((apic->irq_dest_mode == 0) ?
- MSI_ADDR_DEST_MODE_PHYSICAL :
- MSI_ADDR_DEST_MODE_LOGICAL) |
- MSI_ADDR_REDIRECTION_CPU |
- MSI_ADDR_DEST_ID(cfg->dest_apicid);
-
- msg->data =
- MSI_DATA_TRIGGER_EDGE |
- MSI_DATA_LEVEL_ASSERT |
- MSI_DATA_DELIVERY_FIXED |
- MSI_DATA_VECTOR(cfg->vector);
-}
+ struct msi_msg msg[2] = { [1] = { }, };
-/*
- * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
- * which implement the MSI or MSI-X Capability Structure.
- */
-static struct irq_chip pci_msi_controller = {
- .name = "PCI-MSI",
- .irq_unmask = pci_msi_unmask_irq,
- .irq_mask = pci_msi_mask_irq,
- .irq_ack = irq_chip_ack_parent,
- .irq_retrigger = irq_chip_retrigger_hierarchy,
- .irq_compose_msi_msg = irq_msi_compose_msg,
- .flags = IRQCHIP_SKIP_SET_WAKE,
-};
+ __irq_msi_compose_msg(cfg, msg, false);
+ irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg);
+}
-int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+static int
+msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force)
{
- struct irq_domain *domain;
- struct irq_alloc_info info;
-
- init_irq_alloc_info(&info, NULL);
- info.type = X86_IRQ_ALLOC_TYPE_MSI;
- info.msi_dev = dev;
+ struct irq_cfg old_cfg, *cfg = irqd_cfg(irqd);
+ struct irq_data *parent = irqd->parent_data;
+ unsigned int cpu;
+ int ret;
+
+ /* Save the current configuration */
+ cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd));
+ old_cfg = *cfg;
+
+ /* Allocate a new target vector */
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
+ return ret;
+
+ /*
+ * For non-maskable and non-remapped MSI interrupts the migration
+ * to a different destination CPU and a different vector has to be
+ * done careful to handle the possible stray interrupt which can be
+ * caused by the non-atomic update of the address/data pair.
+ *
+ * Direct update is possible when:
+ * - The MSI is maskable (remapped MSI does not use this code path).
+ * The reservation mode bit is set in this case.
+ * - The new vector is the same as the old vector
+ * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up)
+ * - The interrupt is not yet started up
+ * - The new destination CPU is the same as the old destination CPU
+ */
+ if (!irqd_can_reserve(irqd) ||
+ cfg->vector == old_cfg.vector ||
+ old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR ||
+ !irqd_is_started(irqd) ||
+ cfg->dest_apicid == old_cfg.dest_apicid) {
+ irq_msi_update_msg(irqd, cfg);
+ return ret;
+ }
- domain = irq_remapping_get_irq_domain(&info);
- if (domain == NULL)
- domain = msi_default_domain;
- if (domain == NULL)
- return -ENOSYS;
+ /*
+ * Paranoia: Validate that the interrupt target is the local
+ * CPU.
+ */
+ if (WARN_ON_ONCE(cpu != smp_processor_id())) {
+ irq_msi_update_msg(irqd, cfg);
+ return ret;
+ }
- return msi_domain_alloc_irqs(domain, &dev->dev, nvec);
+ /*
+ * Redirect the interrupt to the new vector on the current CPU
+ * first. This might cause a spurious interrupt on this vector if
+ * the device raises an interrupt right between this update and the
+ * update to the final destination CPU.
+ *
+ * If the vector is in use then the installed device handler will
+ * denote it as spurious which is no harm as this is a rare event
+ * and interrupt handlers have to cope with spurious interrupts
+ * anyway. If the vector is unused, then it is marked so it won't
+ * trigger the 'No irq handler for vector' warning in
+ * common_interrupt().
+ *
+ * This requires holding the vector lock to prevent concurrent updates to
+ * the affected vector.
+ */
+ lock_vector_lock();
+
+ /*
+ * Mark the new target vector on the local CPU if it is currently
+ * unused. Reuse the VECTOR_RETRIGGERED state which is also used in
+ * the CPU hotplug path for a similar purpose. This cannot be
+ * undone here as the current CPU has interrupts disabled and
+ * cannot handle the interrupt before the whole set_affinity()
+ * section is done. In the CPU unplug case, the current CPU is
+ * about to vanish and will not handle any interrupts anymore. The
+ * vector is cleaned up when the CPU comes online again.
+ */
+ if (IS_ERR_OR_NULL(this_cpu_read(vector_irq[cfg->vector])))
+ this_cpu_write(vector_irq[cfg->vector], VECTOR_RETRIGGERED);
+
+ /* Redirect it to the new vector on the local CPU temporarily */
+ old_cfg.vector = cfg->vector;
+ irq_msi_update_msg(irqd, &old_cfg);
+
+ /* Now transition it to the target CPU */
+ irq_msi_update_msg(irqd, cfg);
+
+ /*
+ * All interrupts after this point are now targeted at the new
+ * vector/CPU.
+ *
+ * Drop vector lock before testing whether the temporary assignment
+ * to the local CPU was hit by an interrupt raised in the device,
+ * because the retrigger function acquires vector lock again.
+ */
+ unlock_vector_lock();
+
+ /*
+ * Check whether the transition raced with a device interrupt and
+ * is pending in the local APICs IRR. It is safe to do this outside
+ * of vector lock as the irq_desc::lock of this interrupt is still
+ * held and interrupts are disabled: The check is not accessing the
+ * underlying vector store. It's just checking the local APIC's
+ * IRR.
+ */
+ if (lapic_vector_set_in_irr(cfg->vector))
+ irq_data_get_irq_chip(irqd)->irq_retrigger(irqd);
+
+ return ret;
}
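
In outline, the unsafe-path migration implemented above is (pseudocode, hypothetical helper names):

	lock_vector_lock();
	/* reserve the new vector on the local CPU: VECTOR_RETRIGGERED */
	write_msi_msg(old_cpu, new_vector);	/* step 1: change vector */
	write_msi_msg(new_cpu, new_vector);	/* step 2: change CPU */
	unlock_vector_lock();
	if (new_vector_set_in_local_apic_irr())
		retrigger();	/* replay an interrupt that hit step 1 */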
-void native_teardown_msi_irq(unsigned int irq)
+/**
+ * pci_dev_has_default_msi_parent_domain - Check whether the device has the default
+ * MSI parent domain associated
+ * @dev: Pointer to the PCI device
+ */
+bool pci_dev_has_default_msi_parent_domain(struct pci_dev *dev)
{
- irq_domain_free_irqs(irq, 1);
+ struct irq_domain *domain = dev_get_msi_domain(&dev->dev);
+
+ if (!domain)
+ domain = dev_get_msi_domain(&dev->bus->dev);
+ if (!domain)
+ return false;
+
+ return domain == x86_vector_domain;
}
-static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info,
- msi_alloc_info_t *arg)
+/**
+ * x86_msi_prepare - Setup of msi_alloc_info_t for allocations
+ * @domain: The domain for which this setup happens
+ * @dev: The device for which interrupts are allocated
+ * @nvec: The number of vectors to allocate
+ * @alloc: The allocation info structure to initialize
+ *
+ * This function is to be used for all types of MSI domains above the x86
+ * vector domain and any intermediates. It is always invoked from the
+ * top level interrupt domain. The domain specific allocation
+ * functionality is determined via the @domain's bus token which allows to
+ * map the X86 specific allocation type.
+ */
+static int x86_msi_prepare(struct irq_domain *domain, struct device *dev,
+ int nvec, msi_alloc_info_t *alloc)
{
- return arg->msi_hwirq;
+ struct msi_domain_info *info = domain->host_data;
+
+ init_irq_alloc_info(alloc, NULL);
+
+ switch (info->bus_token) {
+ case DOMAIN_BUS_PCI_DEVICE_MSI:
+ alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSI;
+ return 0;
+ case DOMAIN_BUS_PCI_DEVICE_MSIX:
+ alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX;
+ return 0;
+ default:
+ return -EINVAL;
+ }
}
-int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
- msi_alloc_info_t *arg)
+/**
+ * x86_init_dev_msi_info - Domain info setup for MSI domains
+ * @dev: The device for which the domain should be created
+ * @domain: The (root) domain providing this callback
+ * @real_parent: The real parent domain of the to initialize domain
+ * @info: The domain info for the to initialize domain
+ *
+ * This function is to be used for all types of MSI domains above the x86
+ * vector domain and any intermediates. The domain specific functionality
+ * is determined via the @real_parent.
+ */
+static bool x86_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+ struct irq_domain *real_parent, struct msi_domain_info *info)
{
- struct pci_dev *pdev = to_pci_dev(dev);
- struct msi_desc *desc = first_pci_msi_entry(pdev);
+ const struct msi_parent_ops *pops = real_parent->msi_parent_ops;
+
+ /* MSI parent domain specific settings */
+ switch (real_parent->bus_token) {
+ case DOMAIN_BUS_ANY:
+ /* Only the vector domain can have the ANY token */
+ if (WARN_ON_ONCE(domain != real_parent))
+ return false;
+ info->chip->irq_set_affinity = msi_set_affinity;
+ info->chip->flags |= IRQCHIP_MOVE_DEFERRED;
+ break;
+ case DOMAIN_BUS_DMAR:
+ case DOMAIN_BUS_AMDVI:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
- init_irq_alloc_info(arg, NULL);
- arg->msi_dev = pdev;
- if (desc->msi_attrib.is_msix) {
- arg->type = X86_IRQ_ALLOC_TYPE_MSIX;
- } else {
- arg->type = X86_IRQ_ALLOC_TYPE_MSI;
- arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
+ /* Is the target supported? */
+ switch (info->bus_token) {
+ case DOMAIN_BUS_PCI_DEVICE_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSIX:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return false;
}
- return 0;
-}
-EXPORT_SYMBOL_GPL(pci_msi_prepare);
+ /*
+ * Mask out the domain specific MSI feature flags which are not
+ * supported by the real parent.
+ */
+ info->flags &= pops->supported_flags;
+ /* Enforce the required flags */
+ info->flags |= X86_VECTOR_MSI_FLAGS_REQUIRED;
-void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc)
-{
- arg->msi_hwirq = pci_msi_domain_calc_hwirq(arg->msi_dev, desc);
-}
-EXPORT_SYMBOL_GPL(pci_msi_set_desc);
+ /* This is always invoked from the top level MSI domain! */
+ info->ops->msi_prepare = x86_msi_prepare;
-static struct msi_domain_ops pci_msi_domain_ops = {
- .get_hwirq = pci_msi_get_hwirq,
- .msi_prepare = pci_msi_prepare,
- .set_desc = pci_msi_set_desc,
-};
+ info->chip->irq_ack = irq_chip_ack_parent;
+ info->chip->irq_retrigger = irq_chip_retrigger_hierarchy;
+ info->chip->flags |= IRQCHIP_SKIP_SET_WAKE |
+ IRQCHIP_AFFINITY_PRE_STARTUP;
+
+ info->handler = handle_edge_irq;
+ info->handler_name = "edge";
+
+ return true;
+}
-static struct msi_domain_info pci_msi_domain_info = {
- .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
- MSI_FLAG_PCI_MSIX,
- .ops = &pci_msi_domain_ops,
- .chip = &pci_msi_controller,
- .handler = handle_edge_irq,
- .handler_name = "edge",
+static const struct msi_parent_ops x86_vector_msi_parent_ops = {
+ .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED,
+ .init_dev_msi_info = x86_init_dev_msi_info,
};
-void __init arch_init_msi_domain(struct irq_domain *parent)
+struct irq_domain * __init native_create_pci_msi_domain(void)
{
- struct fwnode_handle *fn;
-
- if (disable_apic)
- return;
+ if (apic_is_disabled)
+ return NULL;
- fn = irq_domain_alloc_named_fwnode("PCI-MSI");
- if (fn) {
- msi_default_domain =
- pci_msi_create_irq_domain(fn, &pci_msi_domain_info,
- parent);
- irq_domain_free_fwnode(fn);
- }
- if (!msi_default_domain)
- pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n");
+ x86_vector_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT;
+ x86_vector_domain->msi_parent_ops = &x86_vector_msi_parent_ops;
+ return x86_vector_domain;
}
-#ifdef CONFIG_IRQ_REMAP
-static struct irq_chip pci_msi_ir_controller = {
- .name = "IR-PCI-MSI",
- .irq_unmask = pci_msi_unmask_irq,
- .irq_mask = pci_msi_mask_irq,
- .irq_ack = irq_chip_ack_parent,
- .irq_retrigger = irq_chip_retrigger_hierarchy,
- .irq_set_vcpu_affinity = irq_chip_set_vcpu_affinity_parent,
- .flags = IRQCHIP_SKIP_SET_WAKE,
-};
-
-static struct msi_domain_info pci_msi_ir_domain_info = {
- .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
- MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX,
- .ops = &pci_msi_domain_ops,
- .chip = &pci_msi_ir_controller,
- .handler = handle_edge_irq,
- .handler_name = "edge",
-};
+void __init x86_create_pci_msi_domain(void)
+{
+ x86_pci_msi_default_domain = x86_init.irqs.create_pci_msi_domain();
+}
-struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent,
- const char *name, int id)
+/* Keep around for hyperV */
+int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
+ msi_alloc_info_t *arg)
{
- struct fwnode_handle *fn;
- struct irq_domain *d;
+ init_irq_alloc_info(arg, NULL);
- fn = irq_domain_alloc_named_id_fwnode(name, id);
- if (!fn)
- return NULL;
- d = pci_msi_create_irq_domain(fn, &pci_msi_ir_domain_info, parent);
- irq_domain_free_fwnode(fn);
- return d;
+ if (to_pci_dev(dev)->msix_enabled)
+ arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX;
+ else
+ arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI;
+ return 0;
}
-#endif
+EXPORT_SYMBOL_GPL(pci_msi_prepare);
#ifdef CONFIG_DMAR_TABLE
+/*
+ * The Intel IOMMU (ab)uses the high bits of the MSI address to contain the
+ * high bits of the destination APIC ID. This can't be done in the general
+ * case for MSIs as it would be targeting real memory above 4GiB, not the
+ * APIC.
+ */
+static void dmar_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ __irq_msi_compose_msg(irqd_cfg(data), msg, true);
+}
+
static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
{
dmar_msi_write(data->irq, msg);
@@ -199,35 +314,30 @@ static struct irq_chip dmar_msi_controller = {
.irq_ack = irq_chip_ack_parent,
.irq_set_affinity = msi_domain_set_affinity,
.irq_retrigger = irq_chip_retrigger_hierarchy,
- .irq_compose_msi_msg = irq_msi_compose_msg,
+ .irq_compose_msi_msg = dmar_msi_compose_msg,
.irq_write_msi_msg = dmar_msi_write_msg,
- .flags = IRQCHIP_SKIP_SET_WAKE,
+ .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MOVE_DEFERRED |
+ IRQCHIP_AFFINITY_PRE_STARTUP,
};
-static irq_hw_number_t dmar_msi_get_hwirq(struct msi_domain_info *info,
- msi_alloc_info_t *arg)
-{
- return arg->dmar_id;
-}
-
static int dmar_msi_init(struct irq_domain *domain,
struct msi_domain_info *info, unsigned int virq,
irq_hw_number_t hwirq, msi_alloc_info_t *arg)
{
- irq_domain_set_info(domain, virq, arg->dmar_id, info->chip, NULL,
- handle_edge_irq, arg->dmar_data, "edge");
+ irq_domain_set_info(domain, virq, arg->devid, info->chip, NULL,
+ handle_edge_irq, arg->data, "edge");
return 0;
}
static struct msi_domain_ops dmar_msi_domain_ops = {
- .get_hwirq = dmar_msi_get_hwirq,
.msi_init = dmar_msi_init,
};
static struct msi_domain_info dmar_msi_domain_info = {
.ops = &dmar_msi_domain_ops,
.chip = &dmar_msi_controller,
+ .flags = MSI_FLAG_USE_DEF_DOM_OPS,
};
static struct irq_domain *dmar_get_irq_domain(void)
@@ -244,7 +354,8 @@ static struct irq_domain *dmar_get_irq_domain(void)
if (fn) {
dmar_domain = msi_create_irq_domain(fn, &dmar_msi_domain_info,
x86_vector_domain);
- irq_domain_free_fwnode(fn);
+ if (!dmar_domain)
+ irq_domain_free_fwnode(fn);
}
out:
mutex_unlock(&dmar_lock);
@@ -261,8 +372,9 @@ int dmar_alloc_hwirq(int id, int node, void *arg)
init_irq_alloc_info(&info, NULL);
info.type = X86_IRQ_ALLOC_TYPE_DMAR;
- info.dmar_id = id;
- info.dmar_data = arg;
+ info.devid = id;
+ info.hwirq = id;
+ info.data = arg;
return irq_domain_alloc_irqs(domain, 1, node, &info);
}
@@ -273,117 +385,7 @@ void dmar_free_hwirq(int irq)
}
#endif
-/*
- * MSI message composition
- */
-#ifdef CONFIG_HPET_TIMER
-static inline int hpet_dev_id(struct irq_domain *domain)
-{
- struct msi_domain_info *info = msi_get_domain_info(domain);
-
- return (int)(long)info->data;
-}
-
-static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
-{
- hpet_msi_write(irq_data_get_irq_handler_data(data), msg);
-}
-
-static struct irq_chip hpet_msi_controller __ro_after_init = {
- .name = "HPET-MSI",
- .irq_unmask = hpet_msi_unmask,
- .irq_mask = hpet_msi_mask,
- .irq_ack = irq_chip_ack_parent,
- .irq_set_affinity = msi_domain_set_affinity,
- .irq_retrigger = irq_chip_retrigger_hierarchy,
- .irq_compose_msi_msg = irq_msi_compose_msg,
- .irq_write_msi_msg = hpet_msi_write_msg,
- .flags = IRQCHIP_SKIP_SET_WAKE,
-};
-
-static irq_hw_number_t hpet_msi_get_hwirq(struct msi_domain_info *info,
- msi_alloc_info_t *arg)
-{
- return arg->hpet_index;
-}
-
-static int hpet_msi_init(struct irq_domain *domain,
- struct msi_domain_info *info, unsigned int virq,
- irq_hw_number_t hwirq, msi_alloc_info_t *arg)
-{
- irq_set_status_flags(virq, IRQ_MOVE_PCNTXT);
- irq_domain_set_info(domain, virq, arg->hpet_index, info->chip, NULL,
- handle_edge_irq, arg->hpet_data, "edge");
-
- return 0;
-}
-
-static void hpet_msi_free(struct irq_domain *domain,
- struct msi_domain_info *info, unsigned int virq)
+bool arch_restore_msi_irqs(struct pci_dev *dev)
{
- irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT);
+ return xen_initdom_restore_msi(dev);
}
-
-static struct msi_domain_ops hpet_msi_domain_ops = {
- .get_hwirq = hpet_msi_get_hwirq,
- .msi_init = hpet_msi_init,
- .msi_free = hpet_msi_free,
-};
-
-static struct msi_domain_info hpet_msi_domain_info = {
- .ops = &hpet_msi_domain_ops,
- .chip = &hpet_msi_controller,
-};
-
-struct irq_domain *hpet_create_irq_domain(int hpet_id)
-{
- struct msi_domain_info *domain_info;
- struct irq_domain *parent, *d;
- struct irq_alloc_info info;
- struct fwnode_handle *fn;
-
- if (x86_vector_domain == NULL)
- return NULL;
-
- domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL);
- if (!domain_info)
- return NULL;
-
- *domain_info = hpet_msi_domain_info;
- domain_info->data = (void *)(long)hpet_id;
-
- init_irq_alloc_info(&info, NULL);
- info.type = X86_IRQ_ALLOC_TYPE_HPET;
- info.hpet_id = hpet_id;
- parent = irq_remapping_get_ir_irq_domain(&info);
- if (parent == NULL)
- parent = x86_vector_domain;
- else
- hpet_msi_controller.name = "IR-HPET-MSI";
-
- fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name,
- hpet_id);
- if (!fn) {
- kfree(domain_info);
- return NULL;
- }
-
- d = msi_create_irq_domain(fn, domain_info, parent);
- irq_domain_free_fwnode(fn);
- return d;
-}
-
-int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev,
- int dev_num)
-{
- struct irq_alloc_info info;
-
- init_irq_alloc_info(&info, NULL);
- info.type = X86_IRQ_ALLOC_TYPE_HPET;
- info.hpet_data = dev;
- info.hpet_id = hpet_dev_id(domain);
- info.hpet_index = dev_num;
-
- return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info);
-}
-#endif
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 47ff2976c292..87bc9e7ca5d6 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -1,94 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Default generic APIC driver. This handles up to 8 CPUs.
*
* Copyright 2003 Andi Kleen, SuSE Labs.
- * Subject to the GNU Public License, v.2
*
* Generic x86 APIC driver probe layer.
*/
-#include <linux/threads.h>
-#include <linux/cpumask.h>
#include <linux/export.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/init.h>
#include <linux/errno.h>
-#include <asm/fixmap.h>
-#include <asm/mpspec.h>
-#include <asm/apicdef.h>
-#include <asm/apic.h>
-#include <asm/setup.h>
-
#include <linux/smp.h>
-#include <asm/ipi.h>
-
-#include <linux/interrupt.h>
-#include <asm/acpi.h>
-#include <asm/e820/api.h>
-
-#ifdef CONFIG_HOTPLUG_CPU
-#define DEFAULT_SEND_IPI (1)
-#else
-#define DEFAULT_SEND_IPI (0)
-#endif
-
-int no_broadcast = DEFAULT_SEND_IPI;
-
-static __init int no_ipi_broadcast(char *str)
-{
- get_option(&str, &no_broadcast);
- pr_info("Using %s mode\n",
- no_broadcast ? "No IPI Broadcast" : "IPI Broadcast");
- return 1;
-}
-__setup("no_ipi_broadcast=", no_ipi_broadcast);
-
-static int __init print_ipi_mode(void)
-{
- pr_info("Using IPI %s mode\n",
- no_broadcast ? "No-Shortcut" : "Shortcut");
- return 0;
-}
-late_initcall(print_ipi_mode);
-static int default_x86_32_early_logical_apicid(int cpu)
-{
- return 1 << cpu;
-}
+#include <xen/xen.h>
-static void setup_apic_flat_routing(void)
-{
-#ifdef CONFIG_X86_IO_APIC
- printk(KERN_INFO
- "Enabling APIC mode: Flat. Using %d I/O APICs\n",
- nr_ioapics);
-#endif
-}
+#include <asm/io_apic.h>
+#include <asm/apic.h>
+#include <asm/acpi.h>
-static int default_apic_id_registered(void)
-{
- return physid_isset(read_apic_id(), phys_cpu_present_map);
-}
+#include "local.h"
-/*
- * Set up the logical destination ID. Intel recommends to set DFR, LDR and
- * TPR before enabling an APIC. See e.g. "AP-388 82489DX User's Manual"
- * (Intel document number 292116).
- */
-static void default_init_apic_ldr(void)
+static u32 default_get_apic_id(u32 x)
{
- unsigned long val;
+ unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
- apic_write(APIC_DFR, APIC_DFR_VALUE);
- val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
- val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
- apic_write(APIC_LDR, val);
-}
-
-static int default_phys_pkg_id(int cpuid_apic, int index_msb)
-{
- return cpuid_apic >> index_msb;
+ if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID))
+ return (x >> 24) & 0xFF;
+ else
+ return (x >> 24) & 0x0F;
}
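
The distinction matters because xAPIC (and AMD's extended-APIC-ID feature) carries a full 8-bit ID in bits 31:24 of the APIC_ID register, while the original 82489DX-style APIC only defined 4 of those bits. For instance, with an assumed raw register value:

	u32 raw = 0x17000000;
	/* xAPIC / X86_FEATURE_EXTD_APICID: (raw >> 24) & 0xFF == 0x17 */
	/* legacy 4-bit APIC:               (raw >> 24) & 0x0F == 0x07 */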
/* should be called last. */
@@ -101,29 +38,16 @@ static struct apic apic_default __ro_after_init = {
.name = "default",
.probe = probe_default,
- .acpi_madt_oem_check = NULL,
- .apic_id_valid = default_apic_id_valid,
- .apic_id_registered = default_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- /* logical delivery broadcast to all CPUs: */
- .irq_dest_mode = 1,
+ .dest_mode_logical = true,
.disable_esr = 0,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = default_check_apicid_used,
.init_apic_ldr = default_init_apic_ldr,
-
- .ioapic_phys_id_map = default_ioapic_phys_id_map,
- .setup_apic_routing = setup_apic_flat_routing,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = physid_set_mask_of_physid,
- .check_phys_apicid_present = default_check_phys_apicid_present,
- .phys_pkg_id = default_phys_pkg_id,
+ .max_apic_id = 0xFE,
.get_apic_id = default_get_apic_id,
- .set_apic_id = NULL,
.calc_dest_apicid = apic_flat_calc_apicid,
@@ -134,17 +58,13 @@ static struct apic apic_default __ro_after_init = {
.send_IPI_all = default_send_IPI_all,
.send_IPI_self = default_send_IPI_self,
- .inquire_remote_apic = default_inquire_remote_apic,
-
.read = native_apic_mem_read,
.write = native_apic_mem_write,
- .eoi_write = native_apic_mem_write,
+ .eoi = native_apic_mem_eoi,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
-
- .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid,
+ .wait_icr_idle = apic_mem_wait_icr_idle,
+ .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout,
};
apic_driver(apic_default);
@@ -162,7 +82,7 @@ static int __init parse_apic(char *arg)
for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
if (!strcmp((*drv)->name, arg)) {
- apic = *drv;
+ apic_install_driver(*drv);
cmdline_apic = 1;
return 0;
}
@@ -173,51 +93,14 @@ static int __init parse_apic(char *arg)
}
early_param("apic", parse_apic);
-void __init default_setup_apic_routing(void)
-{
- int version = boot_cpu_apic_version;
-
- if (num_possible_cpus() > 8) {
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_INTEL:
- if (!APIC_XAPIC(version)) {
- def_to_bigsmp = 0;
- break;
- }
- /* If P4 and above fall through */
- case X86_VENDOR_HYGON:
- case X86_VENDOR_AMD:
- def_to_bigsmp = 1;
- }
- }
-
-#ifdef CONFIG_X86_BIGSMP
- /*
- * This is used to switch to bigsmp mode when
- * - There is no apic= option specified by the user
- * - generic_apic_probe() has chosen apic_default as the sub_arch
- * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
- */
-
- if (!cmdline_apic && apic == &apic_default)
- generic_bigsmp_probe();
-#endif
-
- if (apic->setup_apic_routing)
- apic->setup_apic_routing();
-
- if (x86_platform.apic_post_init)
- x86_platform.apic_post_init();
-}
-
-void __init generic_apic_probe(void)
+void __init x86_32_probe_apic(void)
{
if (!cmdline_apic) {
struct apic **drv;
for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
if ((*drv)->probe()) {
- apic = *drv;
+ apic_install_driver(*drv);
break;
}
}
@@ -225,26 +108,4 @@ void __init generic_apic_probe(void)
if (drv == __apicdrivers_end)
panic("Didn't find an APIC driver");
}
- printk(KERN_INFO "Using APIC driver %s\n", apic->name);
-}
-
-/* This function can switch the APIC even after the initial ->probe() */
-int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- struct apic **drv;
-
- for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
- if (!(*drv)->acpi_madt_oem_check)
- continue;
- if (!(*drv)->acpi_madt_oem_check(oem_id, oem_table_id))
- continue;
-
- if (!cmdline_apic) {
- apic = *drv;
- printk(KERN_INFO "Switched to APIC driver `%s'.\n",
- apic->name);
- }
- return 1;
- }
- return 0;
}
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index c303054b90b5..ecdf0c4121e1 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -1,6 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2004 James Cleverdon, IBM.
- * Subject to the GNU Public License, v.2
*
* Generic APIC sub-arch probe layer.
*
@@ -8,24 +8,13 @@
* Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
* James Cleverdon.
*/
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/hardirq.h>
-#include <linux/dmar.h>
-
-#include <asm/smp.h>
+#include <linux/thread_info.h>
#include <asm/apic.h>
-#include <asm/ipi.h>
-#include <asm/setup.h>
-/*
- * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
- */
-void __init default_setup_apic_routing(void)
+#include "local.h"
+
+/* Select the appropriate APIC driver */
+void __init x86_64_probe_apic(void)
{
struct apic **drv;
@@ -33,24 +22,10 @@ void __init default_setup_apic_routing(void)
for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
if ((*drv)->probe && (*drv)->probe()) {
- if (apic != *drv) {
- apic = *drv;
- pr_info("Switched APIC routing to %s.\n",
- apic->name);
- }
+ apic_install_driver(*drv);
break;
}
}
-
- if (x86_platform.apic_post_init)
- x86_platform.apic_post_init();
-}
-
-/* Same for both flat and physical. */
-
-void apic_send_IPI_self(int vector)
-{
- __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
}
int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
@@ -59,11 +34,7 @@ int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
if ((*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) {
- if (apic != *drv) {
- apic = *drv;
- pr_info("Setting APIC routing to %s.\n",
- apic->name);
- }
+ apic_install_driver(*drv);
return 1;
}
}
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 3173e07d3791..bddc54465399 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Local APIC related interfaces to support IOAPIC, MSI, etc.
*
@@ -5,10 +6,6 @@
* Moved from arch/x86/kernel/apic/io_apic.c.
* Jiang Liu <jiang.liu@linux.intel.com>
* Enable support of hierarchical irqdomains
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/interrupt.h>
#include <linux/irq.h>
@@ -47,7 +44,18 @@ static cpumask_var_t vector_searchmask;
static struct irq_chip lapic_controller;
static struct irq_matrix *vector_matrix;
#ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct hlist_head, cleanup_list);
+
+static void vector_cleanup_callback(struct timer_list *tmr);
+
+struct vector_cleanup {
+ struct hlist_head head;
+ struct timer_list timer;
+};
+
+static DEFINE_PER_CPU(struct vector_cleanup, vector_cleanup) = {
+ .head = HLIST_HEAD_INIT,
+ .timer = __TIMER_INITIALIZER(vector_cleanup_callback, TIMER_PINNED),
+};
#endif
void lock_vector_lock(void)
@@ -126,13 +134,20 @@ static void apic_update_irq_cfg(struct irq_data *irqd, unsigned int vector,
apicd->hw_irq_cfg.vector = vector;
apicd->hw_irq_cfg.dest_apicid = apic->calc_dest_apicid(cpu);
+
+ apic_update_vector(cpu, vector, true);
+
irq_data_update_effective_affinity(irqd, cpumask_of(cpu));
- trace_vector_config(irqd->irq, vector, cpu,
- apicd->hw_irq_cfg.dest_apicid);
+ trace_vector_config(irqd->irq, vector, cpu, apicd->hw_irq_cfg.dest_apicid);
}
-static void apic_update_vector(struct irq_data *irqd, unsigned int newvec,
- unsigned int newcpu)
+static void apic_free_vector(unsigned int cpu, unsigned int vector, bool managed)
+{
+ apic_update_vector(cpu, vector, false);
+ irq_matrix_free(vector_matrix, cpu, vector, managed);
+}
+
+static void chip_data_update(struct irq_data *irqd, unsigned int newvec, unsigned int newcpu)
{
struct apic_chip_data *apicd = apic_chip_data(irqd);
struct irq_desc *desc = irq_data_to_desc(irqd);
@@ -164,9 +179,9 @@ static void apic_update_vector(struct irq_data *irqd, unsigned int newvec,
apicd->move_in_progress = true;
apicd->prev_vector = apicd->vector;
apicd->prev_cpu = apicd->cpu;
+ WARN_ON_ONCE(apicd->cpu == newcpu);
} else {
- irq_matrix_free(vector_matrix, apicd->cpu, apicd->vector,
- managed);
+ apic_free_vector(apicd->cpu, apicd->vector, managed);
}
setnew:
@@ -174,6 +189,7 @@ setnew:
apicd->cpu = newcpu;
BUG_ON(!IS_ERR_OR_NULL(per_cpu(vector_irq, newcpu)[newvec]));
per_cpu(vector_irq, newcpu)[newvec] = desc;
+ apic_update_irq_cfg(irqd, newvec, newcpu);
}
static void vector_assign_managed_shutdown(struct irq_data *irqd)
@@ -251,8 +267,7 @@ assign_vector_locked(struct irq_data *irqd, const struct cpumask *dest)
trace_vector_alloc(irqd->irq, vector, resvd, vector);
if (vector < 0)
return vector;
- apic_update_vector(irqd, vector, cpu);
- apic_update_irq_cfg(irqd, vector, cpu);
+ chip_data_update(irqd, vector, cpu);
return 0;
}
@@ -275,20 +290,24 @@ static int assign_irq_vector_any_locked(struct irq_data *irqd)
const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd);
int node = irq_data_get_node(irqd);
- if (node == NUMA_NO_NODE)
- goto all;
- /* Try the intersection of @affmsk and node mask */
- cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk);
- if (!assign_vector_locked(irqd, vector_searchmask))
- return 0;
- /* Try the node mask */
- if (!assign_vector_locked(irqd, cpumask_of_node(node)))
- return 0;
-all:
+ if (node != NUMA_NO_NODE) {
+ /* Try the intersection of @affmsk and node mask */
+ cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk);
+ if (!assign_vector_locked(irqd, vector_searchmask))
+ return 0;
+ }
+
/* Try the full affinity mask */
cpumask_and(vector_searchmask, affmsk, cpu_online_mask);
if (!assign_vector_locked(irqd, vector_searchmask))
return 0;
+
+ if (node != NUMA_NO_NODE) {
+ /* Try the node mask */
+ if (!assign_vector_locked(irqd, cpumask_of_node(node)))
+ return 0;
+ }
+
/* Try the full online mask */
return assign_vector_locked(irqd, cpu_online_mask);
}
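A minimal user-space sketch of the reordered fallback above, with plain bitmasks standing in for cpumask_t and a toy assign() for assign_vector_locked(); all CPU masks are invented:

#include <stdio.h>
#include <stdint.h>

/* Toy stand-in: "allocation" succeeds if any candidate CPU has a free vector. */
static int assign(uint64_t candidates, uint64_t free_cpus)
{
	return (candidates & free_cpus) ? 0 : -1;
}

int main(void)
{
	uint64_t aff = 0x0f, node = 0x3c, online = 0xff, free_cpus = 0x20;

	if (!assign(aff & node, free_cpus))		/* affinity AND node */
		puts("node intersection");
	else if (!assign(aff & online, free_cpus))	/* full affinity */
		puts("affinity");
	else if (!assign(node, free_cpus))		/* node mask */
		puts("node");
	else if (!assign(online, free_cpus))		/* full online mask */
		puts("online");
	return 0;
}

Here CPUs 0-3 are requested but only CPU 5 has vectors left, so the search widens past the affinity mask and succeeds at the node mask, matching the new ordering in the hunk above.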
@@ -324,8 +343,8 @@ assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest)
trace_vector_alloc_managed(irqd->irq, vector, vector);
if (vector < 0)
return vector;
- apic_update_vector(irqd, vector, cpu);
- apic_update_irq_cfg(irqd, vector, cpu);
+ chip_data_update(irqd, vector, cpu);
+
return 0;
}
@@ -343,8 +362,8 @@ static void clear_irq_vector(struct irq_data *irqd)
trace_vector_clear(irqd->irq, vector, apicd->cpu, apicd->prev_vector,
apicd->prev_cpu);
- per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_UNUSED;
- irq_matrix_free(vector_matrix, apicd->cpu, vector, managed);
+ per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_SHUTDOWN;
+ apic_free_vector(apicd->cpu, vector, managed);
apicd->vector = 0;
/* Clean up move in progress */
@@ -352,8 +371,8 @@ static void clear_irq_vector(struct irq_data *irqd)
if (!vector)
return;
- per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_UNUSED;
- irq_matrix_free(vector_matrix, apicd->prev_cpu, vector, managed);
+ per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_SHUTDOWN;
+ apic_free_vector(apicd->prev_cpu, vector, managed);
apicd->prev_vector = 0;
apicd->move_in_progress = 0;
hlist_del_init(&apicd->clist);
@@ -401,6 +420,17 @@ static int activate_reserved(struct irq_data *irqd)
if (!irqd_can_reserve(irqd))
apicd->can_reserve = false;
}
+
+ /*
+ * Check to ensure that the effective affinity mask is a subset of
+ * the user-supplied affinity mask, and warn the user if it is not
+ */
+ if (!cpumask_subset(irq_data_get_effective_affinity_mask(irqd),
+ irq_data_get_affinity_mask(irqd))) {
+ pr_warn("irq %u: Affinity broken due to vector space exhaustion.\n",
+ irqd->irq);
+ }
+
return ret;
}
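The warning fires exactly when some CPU in the effective mask lies outside the user-supplied one. A minimal sketch of the subset test with plain bitmasks (cpumask_subset() has these semantics); the masks are invented:

#include <stdio.h>
#include <stdint.h>

static int mask_subset(uint64_t sub, uint64_t super)
{
	return (sub & ~super) == 0;	/* no bit of sub outside super */
}

int main(void)
{
	uint64_t user = 0x0f;	/* user asked for CPUs 0-3 */
	uint64_t eff  = 0x30;	/* vectors only available on CPUs 4-5 */

	if (!mask_subset(eff, user))
		printf("Affinity broken due to vector space exhaustion.\n");
	return 0;
}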
@@ -438,12 +468,10 @@ static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd,
trace_vector_activate(irqd->irq, apicd->is_managed,
apicd->can_reserve, reserve);
- /* Nothing to do for fixed assigned vectors */
- if (!apicd->can_reserve && !apicd->is_managed)
- return 0;
-
raw_spin_lock_irqsave(&vector_lock, flags);
- if (reserve || irqd_is_managed_and_shutdown(irqd))
+ if (!apicd->can_reserve && !apicd->is_managed)
+ assign_irq_vector_any_locked(irqd);
+ else if (reserve || irqd_is_managed_and_shutdown(irqd))
vector_assign_managed_shutdown(irqd);
else if (apicd->is_managed)
ret = activate_managed(irqd);
@@ -525,12 +553,16 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
struct irq_data *irqd;
int i, err, node;
- if (disable_apic)
+ if (apic_is_disabled)
return -ENXIO;
- /* Currently vector allocator can't guarantee contiguous allocations */
- if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1)
- return -ENOSYS;
+ /*
+ * Catch any attempt to touch the cascade interrupt on a PIC
+ * equipped system.
+ */
+ if (WARN_ON_ONCE(info->flags & X86_IRQ_ALLOC_LEGACY &&
+ virq == PIC_CASCADE_IR))
+ return -EINVAL;
for (i = 0; i < nr_irqs; i++) {
irqd = irq_domain_get_irq_data(domain, virq + i);
@@ -549,6 +581,16 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
irqd->hwirq = virq + i;
irqd_set_single_target(irqd);
/*
+ * Prevent any of these interrupts from being invoked in
+ * non-interrupt context via e.g. generic_handle_irq(),
+ * as that can corrupt the affinity move state.
+ */
+ irqd_set_handle_enforce_irqctx(irqd);
+
+ /* Don't invoke affinity setter on deactivated interrupts */
+ irqd_set_affinity_on_activate(irqd);
+
+ /*
* Legacy vectors are already assigned when the IOAPIC
* takes them over. They stay on the same vector. This is
* required for check_timer() to work correctly as it might
@@ -619,7 +661,50 @@ static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d,
}
#endif
+int x86_fwspec_is_ioapic(struct irq_fwspec *fwspec)
+{
+ if (fwspec->param_count != 1)
+ return 0;
+
+ if (is_fwnode_irqchip(fwspec->fwnode)) {
+ const char *fwname = fwnode_get_name(fwspec->fwnode);
+ return fwname && !strncmp(fwname, "IO-APIC-", 8) &&
+ simple_strtol(fwname+8, NULL, 10) == fwspec->param[0];
+ }
+ return to_of_node(fwspec->fwnode) &&
+ of_device_is_compatible(to_of_node(fwspec->fwnode),
+ "intel,ce4100-ioapic");
+}
+
+int x86_fwspec_is_hpet(struct irq_fwspec *fwspec)
+{
+ if (fwspec->param_count != 1)
+ return 0;
+
+ if (is_fwnode_irqchip(fwspec->fwnode)) {
+ const char *fwname = fwnode_get_name(fwspec->fwnode);
+ return fwname && !strncmp(fwname, "HPET-MSI-", 9) &&
+ simple_strtol(fwname+9, NULL, 10) == fwspec->param[0];
+ }
+ return 0;
+}
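Both helpers match on the fwnode name plus a trailing decimal index. A stand-alone sketch of that check, using strtol() in place of the kernel's simple_strtol(); the names are illustrative:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int fwname_matches(const char *fwname, long param)
{
	return fwname && !strncmp(fwname, "IO-APIC-", 8) &&
	       strtol(fwname + 8, NULL, 10) == param;
}

int main(void)
{
	printf("%d\n", fwname_matches("IO-APIC-3", 3));	/* 1: name and index agree */
	printf("%d\n", fwname_matches("IO-APIC-3", 7));	/* 0: index mismatch */
	return 0;
}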
+
+static int x86_vector_select(struct irq_domain *d, struct irq_fwspec *fwspec,
+ enum irq_domain_bus_token bus_token)
+{
+ /*
+ * HPET and I/OAPIC cannot be parented in the vector domain
+ * if IRQ remapping is enabled. APIC IDs above 15 bits are
+ * only permitted if IRQ remapping is enabled, so check that.
+ */
+ if (apic_id_valid(32768))
+ return 0;
+
+ return x86_fwspec_is_ioapic(fwspec) || x86_fwspec_is_hpet(fwspec);
+}
+
static const struct irq_domain_ops x86_vector_domain_ops = {
+ .select = x86_vector_select,
.alloc = x86_vector_alloc_irqs,
.free = x86_vector_free_irqs,
.activate = x86_vector_activate,
@@ -633,8 +718,8 @@ int __init arch_probe_nr_irqs(void)
{
int nr;
- if (nr_irqs > (NR_VECTORS * nr_cpu_ids))
- nr_irqs = NR_VECTORS * nr_cpu_ids;
+ if (irq_get_nr_irqs() > NR_VECTORS * nr_cpu_ids)
+ irq_set_nr_irqs(NR_VECTORS * nr_cpu_ids);
nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids;
#if defined(CONFIG_PCI_MSI)
@@ -646,8 +731,8 @@ int __init arch_probe_nr_irqs(void)
else
nr += gsi_top * 16;
#endif
- if (nr < nr_irqs)
- nr_irqs = nr;
+ if (nr < irq_get_nr_irqs())
+ irq_set_nr_irqs(nr);
/*
* We don't know if PIC is present at this point so we need to do
@@ -659,18 +744,38 @@ int __init arch_probe_nr_irqs(void)
void lapic_assign_legacy_vector(unsigned int irq, bool replace)
{
/*
- * Use assign system here so it wont get accounted as allocated
- * and moveable in the cpu hotplug check and it prevents managed
+ * Use assign system here so it won't get accounted as allocated
+ * and movable in the cpu hotplug check and it prevents managed
* irq reservation from touching it.
*/
irq_matrix_assign_system(vector_matrix, ISA_IRQ_VECTOR(irq), replace);
}
+void __init lapic_update_legacy_vectors(void)
+{
+ unsigned int i;
+
+ if (IS_ENABLED(CONFIG_X86_IO_APIC) && nr_ioapics > 0)
+ return;
+
+ /*
+ * If the IO/APIC is disabled via config, kernel command line or
+ * lack of enumeration then all legacy interrupts are routed
+ * through the PIC. Make sure that they are marked as legacy
+ * vectors. PIC_CASCADE_IRQ has already been marked in
+ * lapic_assign_system_vectors().
+ */
+ for (i = 0; i < nr_legacy_irqs(); i++) {
+ if (i != PIC_CASCADE_IR)
+ lapic_assign_legacy_vector(i, true);
+ }
+}
+
void __init lapic_assign_system_vectors(void)
{
- unsigned int i, vector = 0;
+ unsigned int i, vector;
- for_each_set_bit_from(vector, system_vectors, NR_VECTORS)
+ for_each_set_bit(vector, system_vectors, NR_VECTORS)
irq_matrix_assign_system(vector_matrix, vector, false);
if (nr_legacy_irqs() > 1)
@@ -681,6 +786,11 @@ void __init lapic_assign_system_vectors(void)
/* Mark the preallocated legacy interrupts */
for (i = 0; i < nr_legacy_irqs(); i++) {
+ /*
+ * Don't touch the cascade interrupt. It's unusable
+ * on PIC equipped machines. See the large comment
+ * in the IO/APIC code.
+ */
if (i != PIC_CASCADE_IR)
irq_matrix_assign(vector_matrix, ISA_IRQ_VECTOR(i));
}
@@ -695,10 +805,7 @@ int __init arch_early_irq_init(void)
x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops,
NULL);
BUG_ON(x86_vector_domain == NULL);
- irq_domain_free_fwnode(fn);
- irq_set_default_host(x86_vector_domain);
-
- arch_init_msi_domain(x86_vector_domain);
+ irq_set_default_domain(x86_vector_domain);
BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL));
@@ -751,30 +858,31 @@ void lapic_online(void)
this_cpu_write(vector_irq[vector], __setup_vector_irq(vector));
}
+static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr);
+
void lapic_offline(void)
{
+ struct vector_cleanup *cl = this_cpu_ptr(&vector_cleanup);
+
lock_vector_lock();
+
+ /* In case the vector cleanup timer has not expired */
+ __vector_cleanup(cl, false);
+
irq_matrix_offline(vector_matrix);
+ WARN_ON_ONCE(timer_delete_sync_try(&cl->timer) < 0);
+ WARN_ON_ONCE(!hlist_empty(&cl->head));
+
unlock_vector_lock();
}
static int apic_set_affinity(struct irq_data *irqd,
const struct cpumask *dest, bool force)
{
- struct apic_chip_data *apicd = apic_chip_data(irqd);
int err;
- /*
- * Core code can call here for inactive interrupts. For inactive
- * interrupts which use managed or reservation mode there is no
- * point in going through the vector assignment right now as the
- * activation will assign a vector which fits the destination
- * cpumask. Let the core code store the destination mask and be
- * done with it.
- */
- if (!irqd_is_activated(irqd) &&
- (apicd->is_managed || apicd->can_reserve))
- return IRQ_SET_MASK_OK;
+ if (WARN_ON_ONCE(!irqd_is_activated(irqd)))
+ return -EIO;
raw_spin_lock(&vector_lock);
cpumask_and(vector_searchmask, dest, cpu_online_mask);
@@ -786,43 +894,6 @@ static int apic_set_affinity(struct irq_data *irqd,
return err ? err : IRQ_SET_MASK_OK;
}
-#else
-# define apic_set_affinity NULL
-#endif
-
-static int apic_retrigger_irq(struct irq_data *irqd)
-{
- struct apic_chip_data *apicd = apic_chip_data(irqd);
- unsigned long flags;
-
- raw_spin_lock_irqsave(&vector_lock, flags);
- apic->send_IPI(apicd->cpu, apicd->vector);
- raw_spin_unlock_irqrestore(&vector_lock, flags);
-
- return 1;
-}
-
-void apic_ack_irq(struct irq_data *irqd)
-{
- irq_move_irq(irqd);
- ack_APIC_irq();
-}
-
-void apic_ack_edge(struct irq_data *irqd)
-{
- irq_complete_move(irqd_cfg(irqd));
- apic_ack_irq(irqd);
-}
-
-static struct irq_chip lapic_controller = {
- .name = "APIC",
- .irq_ack = apic_ack_edge,
- .irq_set_affinity = apic_set_affinity,
- .irq_retrigger = apic_retrigger_irq,
-};
-
-#ifdef CONFIG_SMP
-
static void free_moved_vector(struct apic_chip_data *apicd)
{
unsigned int vector = apicd->prev_vector;
@@ -830,131 +901,44 @@ static void free_moved_vector(struct apic_chip_data *apicd)
bool managed = apicd->is_managed;
/*
- * This should never happen. Managed interrupts are not
- * migrated except on CPU down, which does not involve the
- * cleanup vector. But try to keep the accounting correct
- * nevertheless.
+ * Managed interrupts are usually not migrated away
+ * from an online CPU, but CPU isolation 'managed_irq'
+ * can make that happen.
+ * 1) Activation does not take the isolation into account
+ * to keep the code simple
+ * 2) Migration away from an isolated CPU can happen when
+ * a non-isolated CPU which is in the calculated
+ * affinity mask comes online.
*/
- WARN_ON_ONCE(managed);
-
trace_vector_free_moved(apicd->irq, cpu, vector, managed);
- irq_matrix_free(vector_matrix, cpu, vector, managed);
+ apic_free_vector(cpu, vector, managed);
per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
hlist_del_init(&apicd->clist);
apicd->prev_vector = 0;
apicd->move_in_progress = 0;
}
-asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void)
-{
- struct hlist_head *clhead = this_cpu_ptr(&cleanup_list);
- struct apic_chip_data *apicd;
- struct hlist_node *tmp;
-
- entering_ack_irq();
- /* Prevent vectors vanishing under us */
- raw_spin_lock(&vector_lock);
-
- hlist_for_each_entry_safe(apicd, tmp, clhead, clist) {
- unsigned int irr, vector = apicd->prev_vector;
-
- /*
- * Paranoia: Check if the vector that needs to be cleaned
- * up is registered at the APICs IRR. If so, then this is
- * not the best time to clean it up. Clean it up in the
- * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
- * to this CPU. IRQ_MOVE_CLEANUP_VECTOR is the lowest
- * priority external vector, so on return from this
- * interrupt the device interrupt will happen first.
- */
- irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
- if (irr & (1U << (vector % 32))) {
- apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
- continue;
- }
- free_moved_vector(apicd);
- }
-
- raw_spin_unlock(&vector_lock);
- exiting_irq();
-}
-
-static void __send_cleanup_vector(struct apic_chip_data *apicd)
-{
- unsigned int cpu;
-
- raw_spin_lock(&vector_lock);
- apicd->move_in_progress = 0;
- cpu = apicd->prev_cpu;
- if (cpu_online(cpu)) {
- hlist_add_head(&apicd->clist, per_cpu_ptr(&cleanup_list, cpu));
- apic->send_IPI(cpu, IRQ_MOVE_CLEANUP_VECTOR);
- } else {
- apicd->prev_vector = 0;
- }
- raw_spin_unlock(&vector_lock);
-}
-
-void send_cleanup_vector(struct irq_cfg *cfg)
-{
- struct apic_chip_data *apicd;
-
- apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg);
- if (apicd->move_in_progress)
- __send_cleanup_vector(apicd);
-}
-
-static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
-{
- struct apic_chip_data *apicd;
-
- apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg);
- if (likely(!apicd->move_in_progress))
- return;
-
- if (vector == apicd->vector && apicd->cpu == smp_processor_id())
- __send_cleanup_vector(apicd);
-}
-
-void irq_complete_move(struct irq_cfg *cfg)
-{
- __irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
-}
-
/*
* Called from fixup_irqs() with @desc->lock held and interrupts disabled.
*/
-void irq_force_complete_move(struct irq_desc *desc)
+static void apic_force_complete_move(struct irq_data *irqd)
{
+ unsigned int cpu = smp_processor_id();
struct apic_chip_data *apicd;
- struct irq_data *irqd;
unsigned int vector;
- /*
- * The function is called for all descriptors regardless of which
- * irqdomain they belong to. For example if an IRQ is provided by
- * an irq_chip as part of a GPIO driver, the chip data for that
- * descriptor is specific to the irq_chip in question.
- *
- * Check first that the chip_data is what we expect
- * (apic_chip_data) before touching it any further.
- */
- irqd = irq_domain_get_irq_data(x86_vector_domain,
- irq_desc_get_irq(desc));
- if (!irqd)
- return;
-
- raw_spin_lock(&vector_lock);
+ guard(raw_spinlock)(&vector_lock);
apicd = apic_chip_data(irqd);
if (!apicd)
- goto unlock;
+ return;
/*
- * If prev_vector is empty, no action required.
+ * If prev_vector is empty or the descriptor is neither currently
+ * nor previously on the outgoing CPU no action required.
*/
vector = apicd->prev_vector;
- if (!vector)
- goto unlock;
+ if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu))
+ return;
/*
* This is tricky. If the cleanup of the old vector has not been
@@ -985,7 +969,7 @@ void irq_force_complete_move(struct irq_desc *desc)
*
* But in case of cpu hotplug this should be a non issue
* because if the affinity update happens right before all
- * cpus rendevouz in stop machine, there is no way that the
+ * cpus rendezvous in stop machine, there is no way that the
* interrupt can be blocked on the target cpu because all cpus
* loops first with interrupts enabled in stop machine, so the
* old vector is not yet cleaned up when the interrupt fires.
@@ -994,7 +978,7 @@ void irq_force_complete_move(struct irq_desc *desc)
* of the interrupt on the apic/system bus would be delayed
* beyond the point where the target cpu disables interrupts
* in stop machine. I doubt that it can happen, but at least
- * there is a theroretical chance. Virtualization might be
+ * there is a theoretical chance. Virtualization might be
* able to expose this, but AFAICT the IOAPIC emulation is not
* as stupid as the real hardware.
*
@@ -1008,10 +992,166 @@ void irq_force_complete_move(struct irq_desc *desc)
irqd->irq, vector);
}
free_moved_vector(apicd);
-unlock:
+}
+
+#else
+# define apic_set_affinity NULL
+# define apic_force_complete_move NULL
+#endif
+
+static int apic_retrigger_irq(struct irq_data *irqd)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ __apic_send_IPI(apicd->cpu, apicd->vector);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+
+ return 1;
+}
+
+void apic_ack_irq(struct irq_data *irqd)
+{
+ irq_move_irq(irqd);
+ apic_eoi();
+}
+
+void apic_ack_edge(struct irq_data *irqd)
+{
+ irq_complete_move(irqd_cfg(irqd));
+ apic_ack_irq(irqd);
+}
+
+static void x86_vector_msi_compose_msg(struct irq_data *data,
+ struct msi_msg *msg)
+{
+ __irq_msi_compose_msg(irqd_cfg(data), msg, false);
+}
+
+static struct irq_chip lapic_controller = {
+ .name = "APIC",
+ .irq_ack = apic_ack_edge,
+ .irq_set_affinity = apic_set_affinity,
+ .irq_compose_msi_msg = x86_vector_msi_compose_msg,
+ .irq_force_complete_move = apic_force_complete_move,
+ .irq_retrigger = apic_retrigger_irq,
+};
+
+#ifdef CONFIG_SMP
+
+static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr)
+{
+ struct apic_chip_data *apicd;
+ struct hlist_node *tmp;
+ bool rearm = false;
+
+ lockdep_assert_held(&vector_lock);
+
+ hlist_for_each_entry_safe(apicd, tmp, &cl->head, clist) {
+ unsigned int vector = apicd->prev_vector;
+
+ /*
+ * Paranoia: Check if the vector that needs to be cleaned
+ * up is registered in the APIC's IRR. That's clearly a
+ * hardware issue if the vector arrived on the old target
+ * _after_ interrupts were disabled above. Keep @apicd
+ * on the list and schedule the timer again to give the CPU
+ * a chance to handle the pending interrupt.
+ *
+ * Do not check IRR when called from lapic_offline(), because
+ * fixup_irqs() was just called to scan IRR for set bits and
+ * forward them to new destination CPUs via IPIs.
+ */
+ if (check_irr && is_vector_pending(vector)) {
+ pr_warn_once("Moved interrupt pending in old target APIC %u\n", apicd->irq);
+ rearm = true;
+ continue;
+ }
+ free_moved_vector(apicd);
+ }
+
+ /*
+ * Must happen under vector_lock to make the timer_pending() check
+ * in __vector_schedule_cleanup() race free against the rearm here.
+ */
+ if (rearm)
+ mod_timer(&cl->timer, jiffies + 1);
+}
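The IRR test reduces to the register/bit arithmetic the removed smp_irq_move_cleanup_interrupt() used: the IRR is a bank of 32-bit registers at 16-byte strides. A stand-alone sketch of the mapping; the vector value is arbitrary:

#include <stdio.h>

#define APIC_IRR 0x200

int main(void)
{
	unsigned int vector = 0x61;	/* 97 */
	unsigned int reg = APIC_IRR + (vector / 32) * 0x10;
	unsigned int bit = vector % 32;

	printf("vector 0x%x -> IRR register 0x%x, bit %u\n", vector, reg, bit);
	/* vector 0x61 -> IRR register 0x230, bit 1 */
	return 0;
}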
+
+static void vector_cleanup_callback(struct timer_list *tmr)
+{
+ struct vector_cleanup *cl = container_of(tmr, typeof(*cl), timer);
+
+ /* Prevent vectors vanishing under us */
+ raw_spin_lock_irq(&vector_lock);
+ __vector_cleanup(cl, true);
+ raw_spin_unlock_irq(&vector_lock);
+}
+
+static void __vector_schedule_cleanup(struct apic_chip_data *apicd)
+{
+ unsigned int cpu = apicd->prev_cpu;
+
+ raw_spin_lock(&vector_lock);
+ apicd->move_in_progress = 0;
+ if (cpu_online(cpu)) {
+ struct vector_cleanup *cl = per_cpu_ptr(&vector_cleanup, cpu);
+
+ hlist_add_head(&apicd->clist, &cl->head);
+
+ /*
+ * The lockless timer_pending() check is safe here. If it
+ * returns true, then the callback will observe this new
+ * apic data in the hlist as everything is serialized by
+ * vector lock.
+ *
+ * If it returns false then the timer is either not armed
+ * or the other CPU executes the callback, which again
+ * would be blocked on vector lock. Rearming it in the
+ * latter case makes it fire for nothing.
+ *
+ * This is also safe against the callback rearming the timer
+ * because that's serialized via vector lock too.
+ */
+ if (!timer_pending(&cl->timer)) {
+ cl->timer.expires = jiffies + 1;
+ add_timer_on(&cl->timer, cpu);
+ }
+ } else {
+ pr_warn("IRQ %u schedule cleanup for offline CPU %u\n", apicd->irq, cpu);
+ free_moved_vector(apicd);
+ }
raw_spin_unlock(&vector_lock);
}
+void vector_schedule_cleanup(struct irq_cfg *cfg)
+{
+ struct apic_chip_data *apicd;
+
+ apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg);
+ if (apicd->move_in_progress)
+ __vector_schedule_cleanup(apicd);
+}
+
+void irq_complete_move(struct irq_cfg *cfg)
+{
+ struct apic_chip_data *apicd;
+
+ apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg);
+ if (likely(!apicd->move_in_progress))
+ return;
+
+ /*
+ * If the interrupt arrived on the new target CPU, clean up the
+ * vector on the old target CPU. A vector check is not required
+ * because an interrupt can never move from one vector to another
+ * on the same CPU.
+ */
+ if (apicd->cpu == smp_processor_id())
+ __vector_schedule_cleanup(apicd);
+}
+
#ifdef CONFIG_HOTPLUG_CPU
/*
* Note, this is not accurate accounting, but at least good enough to
@@ -1061,7 +1201,7 @@ static void __init print_local_APIC(void *dummy)
u64 icr;
pr_debug("printing local APIC contents on CPU#%d/%d:\n",
- smp_processor_id(), hard_smp_processor_id());
+ smp_processor_id(), read_apic_id());
v = apic_read(APIC_ID);
pr_info("... APIC ID: %08x (%01x)\n", v, read_apic_id());
v = apic_read(APIC_LVR);
@@ -1206,7 +1346,7 @@ static void __init print_PIC(void)
pr_debug("... PIC ISR: %04x\n", v);
- v = inb(0x4d1) << 8 | inb(0x4d0);
+ v = inb(PIC_ELCR2) << 8 | inb(PIC_ELCR1);
pr_debug("... PIC ELCR: %04x\n", v);
}
diff --git a/arch/x86/kernel/apic/x2apic.h b/arch/x86/kernel/apic/x2apic.h
deleted file mode 100644
index a49b3604027f..000000000000
--- a/arch/x86/kernel/apic/x2apic.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/* Common bits for X2APIC cluster/physical modes. */
-
-int x2apic_apic_id_valid(u32 apicid);
-int x2apic_apic_id_registered(void);
-void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest);
-unsigned int x2apic_get_apic_id(unsigned long id);
-u32 x2apic_set_apic_id(unsigned int id);
-int x2apic_phys_pkg_id(int initial_apicid, int index_msb);
-void x2apic_send_IPI_self(int vector);
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 7685444a106b..7db83212effb 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -1,26 +1,25 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/threads.h>
+
+#include <linux/cpuhotplug.h>
#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/dmar.h>
-#include <linux/irq.h>
-#include <linux/cpu.h>
-
-#include <asm/smp.h>
-#include "x2apic.h"
-
-struct cluster_mask {
- unsigned int clusterid;
- int node;
- struct cpumask mask;
-};
+#include <linux/slab.h>
+#include <linux/mm.h>
+
+#include <asm/apic.h>
+
+#include "local.h"
+
+#define apic_cluster(apicid) ((apicid) >> 4)
+
+/*
+ * __x2apic_send_IPI_mask() possibly needs to read
+ * x86_cpu_to_logical_apicid for all online cpus in a sequential way.
+ * Using per cpu variable would cost one cache line per cpu.
+ */
+static u32 *x86_cpu_to_logical_apicid __read_mostly;
-static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
-static DEFINE_PER_CPU(struct cluster_mask *, cluster_masks);
-static struct cluster_mask *cluster_hotplug_mask;
+static DEFINE_PER_CPU_READ_MOSTLY(struct cpumask *, cluster_masks);
static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
@@ -29,9 +28,10 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
static void x2apic_send_IPI(int cpu, int vector)
{
- u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
+ u32 dest = x86_cpu_to_logical_apicid[cpu];
- x2apic_wrmsr_fence();
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
__x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL);
}
@@ -43,29 +43,30 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
unsigned long flags;
u32 dest;
- x2apic_wrmsr_fence();
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
local_irq_save(flags);
tmpmsk = this_cpu_cpumask_var_ptr(ipi_mask);
cpumask_copy(tmpmsk, mask);
/* If IPI should not be sent to self, clear current CPU */
if (apic_dest != APIC_DEST_ALLINC)
- cpumask_clear_cpu(smp_processor_id(), tmpmsk);
+ __cpumask_clear_cpu(smp_processor_id(), tmpmsk);
/* Collapse cpus in a cluster so a single IPI per cluster is sent */
for_each_cpu(cpu, tmpmsk) {
- struct cluster_mask *cmsk = per_cpu(cluster_masks, cpu);
+ struct cpumask *cmsk = per_cpu(cluster_masks, cpu);
dest = 0;
- for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask)
- dest |= per_cpu(x86_cpu_to_logical_apicid, clustercpu);
+ for_each_cpu_and(clustercpu, tmpmsk, cmsk)
+ dest |= x86_cpu_to_logical_apicid[clustercpu];
if (!dest)
continue;
- __x2apic_send_IPI_dest(dest, vector, apic->dest_logical);
+ __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL);
/* Remove cluster CPUs from tmpmask */
- cpumask_andnot(tmpmsk, tmpmsk, &cmsk->mask);
+ cpumask_andnot(tmpmsk, tmpmsk, cmsk);
}
local_irq_restore(flags);
@@ -82,95 +83,141 @@ x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
__x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
}
-static void x2apic_send_IPI_allbutself(int vector)
+static u32 x2apic_calc_apicid(unsigned int cpu)
{
- __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
+ return x86_cpu_to_logical_apicid[cpu];
}
-static void x2apic_send_IPI_all(int vector)
+static void init_x2apic_ldr(void)
{
- __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
-}
+ struct cpumask *cmsk = this_cpu_read(cluster_masks);
-static u32 x2apic_calc_apicid(unsigned int cpu)
-{
- return per_cpu(x86_cpu_to_logical_apicid, cpu);
+ BUG_ON(!cmsk);
+
+ cpumask_set_cpu(smp_processor_id(), cmsk);
}
-static void init_x2apic_ldr(void)
+/*
+ * As an optimisation during boot, set the cluster_mask for all present
+ * CPUs at once, to prevent each of them having to iterate over the others
+ * to find the existing cluster_mask.
+ */
+static void prefill_clustermask(struct cpumask *cmsk, unsigned int cpu, u32 cluster)
{
- struct cluster_mask *cmsk = this_cpu_read(cluster_masks);
- u32 cluster, apicid = apic_read(APIC_LDR);
- unsigned int cpu;
+ int cpu_i;
- this_cpu_write(x86_cpu_to_logical_apicid, apicid);
+ for_each_present_cpu(cpu_i) {
+ struct cpumask **cpu_cmsk = &per_cpu(cluster_masks, cpu_i);
+ u32 apicid = apic->cpu_present_to_apicid(cpu_i);
- if (cmsk)
- goto update;
-
- cluster = apicid >> 16;
- for_each_online_cpu(cpu) {
- cmsk = per_cpu(cluster_masks, cpu);
- /* Matching cluster found. Link and update it. */
- if (cmsk && cmsk->clusterid == cluster)
- goto update;
+ if (apicid == BAD_APICID || cpu_i == cpu || apic_cluster(apicid) != cluster)
+ continue;
+
+ if (WARN_ON_ONCE(*cpu_cmsk == cmsk))
+ continue;
+
+ BUG_ON(*cpu_cmsk);
+ *cpu_cmsk = cmsk;
}
- cmsk = cluster_hotplug_mask;
- cmsk->clusterid = cluster;
- cluster_hotplug_mask = NULL;
-update:
- this_cpu_write(cluster_masks, cmsk);
- cpumask_set_cpu(smp_processor_id(), &cmsk->mask);
}
-static int alloc_clustermask(unsigned int cpu, int node)
+static int alloc_clustermask(unsigned int cpu, u32 cluster, int node)
{
+ struct cpumask *cmsk = NULL;
+ unsigned int cpu_i;
+
+ /*
+ * At boot time, the CPU present mask is stable. The cluster mask is
+ * allocated for the first CPU in the cluster and propagated to all
+ * present siblings in the cluster. If the cluster mask is already set
+ * on entry to this function for a given CPU, there is nothing to do.
+ */
if (per_cpu(cluster_masks, cpu))
return 0;
+
+ if (system_state < SYSTEM_RUNNING)
+ goto alloc;
+
/*
- * If a hotplug spare mask exists, check whether it's on the right
- * node. If not, free it and allocate a new one.
+ * On post boot hotplug for a CPU which was not present at boot time,
+ * iterate over all possible CPUs (even those which are not present
+ * any more) to find any existing cluster mask.
*/
- if (cluster_hotplug_mask) {
- if (cluster_hotplug_mask->node == node)
- return 0;
- kfree(cluster_hotplug_mask);
+ for_each_possible_cpu(cpu_i) {
+ u32 apicid = apic->cpu_present_to_apicid(cpu_i);
+
+ if (apicid != BAD_APICID && apic_cluster(apicid) == cluster) {
+ cmsk = per_cpu(cluster_masks, cpu_i);
+ /*
+ * If the cluster is already initialized, just store
+ * the mask and return. There's no need to propagate.
+ */
+ if (cmsk) {
+ per_cpu(cluster_masks, cpu) = cmsk;
+ return 0;
+ }
+ }
}
-
- cluster_hotplug_mask = kzalloc_node(sizeof(*cluster_hotplug_mask),
- GFP_KERNEL, node);
- if (!cluster_hotplug_mask)
+ /*
+ * No CPU in the cluster has ever been initialized, so fall through to
+ * the boot time code which will also populate the cluster mask for any
+ * other CPU in the cluster which is (now) present.
+ */
+alloc:
+ cmsk = kzalloc_node(sizeof(*cmsk), GFP_KERNEL, node);
+ if (!cmsk)
return -ENOMEM;
- cluster_hotplug_mask->node = node;
+ per_cpu(cluster_masks, cpu) = cmsk;
+ prefill_clustermask(cmsk, cpu, cluster);
+
return 0;
}
static int x2apic_prepare_cpu(unsigned int cpu)
{
- if (alloc_clustermask(cpu, cpu_to_node(cpu)) < 0)
+ u32 phys_apicid = apic->cpu_present_to_apicid(cpu);
+ u32 cluster = apic_cluster(phys_apicid);
+ u32 logical_apicid = (cluster << 16) | (1 << (phys_apicid & 0xf));
+ int node = cpu_to_node(cpu);
+
+ x86_cpu_to_logical_apicid[cpu] = logical_apicid;
+
+ if (alloc_clustermask(cpu, cluster, node) < 0)
return -ENOMEM;
- if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL))
+
+ if (!zalloc_cpumask_var_node(&per_cpu(ipi_mask, cpu), GFP_KERNEL, node))
return -ENOMEM;
+
return 0;
}
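The logical ID computed above packs the cluster number into the high 16 bits and a one-hot CPU bit into the low 16, which is what lets __x2apic_send_IPI_mask() OR cluster mates into a single IPI destination. A stand-alone sketch with made-up physical APIC IDs:

#include <stdio.h>
#include <stdint.h>

static uint32_t x2apic_logical_id(uint32_t phys_apicid)
{
	uint32_t cluster = phys_apicid >> 4;	/* matches apic_cluster() */

	return (cluster << 16) | (1u << (phys_apicid & 0xf));
}

int main(void)
{
	/* Physical IDs 0x23 and 0x25 share cluster 2 (bits 3 and 5). */
	printf("0x%x\n", x2apic_logical_id(0x23));	/* 0x20008 */
	printf("0x%x\n", x2apic_logical_id(0x23) |
			 x2apic_logical_id(0x25));	/* 0x20028: one IPI */
	return 0;
}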
static int x2apic_dead_cpu(unsigned int dead_cpu)
{
- struct cluster_mask *cmsk = per_cpu(cluster_masks, dead_cpu);
+ struct cpumask *cmsk = per_cpu(cluster_masks, dead_cpu);
- cpumask_clear_cpu(dead_cpu, &cmsk->mask);
+ if (cmsk)
+ cpumask_clear_cpu(dead_cpu, cmsk);
free_cpumask_var(per_cpu(ipi_mask, dead_cpu));
return 0;
}
static int x2apic_cluster_probe(void)
{
+ u32 slots;
+
if (!x2apic_mode)
return 0;
+ slots = max_t(u32, L1_CACHE_BYTES/sizeof(u32), nr_cpu_ids);
+ x86_cpu_to_logical_apicid = kcalloc(slots, sizeof(u32), GFP_KERNEL);
+ if (!x86_cpu_to_logical_apicid)
+ return 0;
+
if (cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare",
x2apic_prepare_cpu, x2apic_dead_cpu) < 0) {
pr_err("Failed to register X2APIC_PREPARE\n");
+ kfree(x86_cpu_to_logical_apicid);
+ x86_cpu_to_logical_apicid = NULL;
return 0;
}
init_x2apic_ldr();
@@ -182,27 +229,17 @@ static struct apic apic_x2apic_cluster __ro_after_init = {
.name = "cluster x2apic",
.probe = x2apic_cluster_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
- .apic_id_valid = x2apic_apic_id_valid,
- .apic_id_registered = x2apic_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 1, /* logical */
+ .dest_mode_logical = true,
.disable_esr = 0,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = NULL,
.init_apic_ldr = init_x2apic_ldr,
-
- .ioapic_phys_id_map = NULL,
- .setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
- .phys_pkg_id = x2apic_phys_pkg_id,
+ .max_apic_id = UINT_MAX,
+ .x2apic_set_max_apicid = true,
.get_apic_id = x2apic_get_apic_id,
- .set_apic_id = x2apic_set_apic_id,
.calc_dest_apicid = x2apic_calc_apicid,
@@ -212,16 +249,13 @@ static struct apic apic_x2apic_cluster __ro_after_init = {
.send_IPI_allbutself = x2apic_send_IPI_allbutself,
.send_IPI_all = x2apic_send_IPI_all,
.send_IPI_self = x2apic_send_IPI_self,
-
- .inquire_remote_apic = NULL,
+ .nmi_to_offline_cpu = true,
.read = native_apic_msr_read,
.write = native_apic_msr_write,
- .eoi_write = native_apic_msr_eoi_write,
+ .eoi = native_apic_msr_eoi,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
- .wait_icr_idle = native_x2apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
};
apic_driver(apic_x2apic_cluster);
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index b5cf9e7b3830..12d4c35547a6 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -1,18 +1,21 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/threads.h>
+
#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/dmar.h>
+#include <linux/acpi.h>
-#include <asm/smp.h>
-#include <asm/ipi.h>
-#include "x2apic.h"
+#include "local.h"
int x2apic_phys;
static struct apic apic_x2apic_phys;
+u32 x2apic_max_apicid __ro_after_init = UINT_MAX;
+
+void __init x2apic_set_max_apicid(u32 apicid)
+{
+ x2apic_max_apicid = apicid;
+ if (apic->x2apic_set_max_apicid)
+ apic->max_apic_id = apicid;
+}
static int __init set_x2apic_phys_mode(char *arg)
{
@@ -42,7 +45,8 @@ static void x2apic_send_IPI(int cpu, int vector)
{
u32 dest = per_cpu(x86_cpu_to_apicid, cpu);
- x2apic_wrmsr_fence();
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
__x2apic_send_IPI_dest(dest, vector, APIC_DEST_PHYSICAL);
}
@@ -53,7 +57,8 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
unsigned long this_cpu;
unsigned long flags;
- x2apic_wrmsr_fence();
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
local_irq_save(flags);
@@ -78,37 +83,28 @@ static void
__x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
}
-static void x2apic_send_IPI_allbutself(int vector)
+static void __x2apic_send_IPI_shorthand(int vector, u32 which)
{
- __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
-}
+ unsigned long cfg = __prepare_ICR(which, vector, 0);
-static void x2apic_send_IPI_all(int vector)
-{
- __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
+ native_x2apic_icr_write(cfg, 0);
}
-static void init_x2apic_ldr(void)
+void x2apic_send_IPI_allbutself(int vector)
{
+ __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT);
}
-static int x2apic_phys_probe(void)
+void x2apic_send_IPI_all(int vector)
{
- if (x2apic_mode && (x2apic_phys || x2apic_fadt_phys()))
- return 1;
-
- return apic == &apic_x2apic_phys;
+ __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC);
}
-/* Common x2apic functions, also used by x2apic_cluster */
-int x2apic_apic_id_valid(u32 apicid)
-{
- return 1;
-}
-
-int x2apic_apic_id_registered(void)
+void x2apic_send_IPI_self(int vector)
{
- return 1;
+ apic_write(APIC_SELF_IPI, vector);
}
void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
@@ -117,24 +113,20 @@ void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
native_x2apic_icr_write(cfg, apicid);
}
-unsigned int x2apic_get_apic_id(unsigned long id)
+static int x2apic_phys_probe(void)
{
- return id;
-}
+ if (!x2apic_mode)
+ return 0;
-u32 x2apic_set_apic_id(unsigned int id)
-{
- return id;
-}
+ if (x2apic_phys || x2apic_fadt_phys())
+ return 1;
-int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
-{
- return initial_apicid >> index_msb;
+ return apic == &apic_x2apic_phys;
}
-void x2apic_send_IPI_self(int vector)
+u32 x2apic_get_apic_id(u32 id)
{
- apic_write(APIC_SELF_IPI, vector);
+ return id;
}
static struct apic apic_x2apic_phys __ro_after_init = {
@@ -142,27 +134,16 @@ static struct apic apic_x2apic_phys __ro_after_init = {
.name = "physical x2apic",
.probe = x2apic_phys_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
- .apic_id_valid = x2apic_apic_id_valid,
- .apic_id_registered = x2apic_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 0, /* physical */
+ .dest_mode_logical = false,
.disable_esr = 0,
- .dest_logical = 0,
- .check_apicid_used = NULL,
-
- .init_apic_ldr = init_x2apic_ldr,
- .ioapic_phys_id_map = NULL,
- .setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
- .phys_pkg_id = x2apic_phys_pkg_id,
+ .max_apic_id = UINT_MAX,
+ .x2apic_set_max_apicid = true,
.get_apic_id = x2apic_get_apic_id,
- .set_apic_id = x2apic_set_apic_id,
.calc_dest_apicid = apic_default_calc_apicid,
@@ -172,16 +153,13 @@ static struct apic apic_x2apic_phys __ro_after_init = {
.send_IPI_allbutself = x2apic_send_IPI_allbutself,
.send_IPI_all = x2apic_send_IPI_all,
.send_IPI_self = x2apic_send_IPI_self,
-
- .inquire_remote_apic = NULL,
+ .nmi_to_offline_cpu = true,
.read = native_apic_msr_read,
.write = native_apic_msr_write,
- .eoi_write = native_apic_msr_eoi_write,
+ .eoi = native_apic_msr_eoi,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
- .wait_icr_idle = native_x2apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
};
apic_driver(apic_x2apic_phys);
diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c
new file mode 100644
index 000000000000..dbc5678bc3b6
--- /dev/null
+++ b/arch/x86/kernel/apic/x2apic_savic.c
@@ -0,0 +1,428 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD Secure AVIC Support (SEV-SNP Guests)
+ *
+ * Copyright (C) 2024 Advanced Micro Devices, Inc.
+ *
+ * Author: Neeraj Upadhyay <Neeraj.Upadhyay@amd.com>
+ */
+
+#include <linux/cc_platform.h>
+#include <linux/cpumask.h>
+#include <linux/percpu-defs.h>
+#include <linux/align.h>
+
+#include <asm/apic.h>
+#include <asm/sev.h>
+
+#include "local.h"
+
+struct secure_avic_page {
+ u8 regs[PAGE_SIZE];
+} __aligned(PAGE_SIZE);
+
+static struct secure_avic_page __percpu *savic_page __ro_after_init;
+
+static int savic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ return x2apic_enabled() && cc_platform_has(CC_ATTR_SNP_SECURE_AVIC);
+}
+
+static inline void *get_reg_bitmap(unsigned int cpu, unsigned int offset)
+{
+ return &per_cpu_ptr(savic_page, cpu)->regs[offset];
+}
+
+static inline void update_vector(unsigned int cpu, unsigned int offset,
+ unsigned int vector, bool set)
+{
+ void *bitmap = get_reg_bitmap(cpu, offset);
+
+ if (set)
+ apic_set_vector(vector, bitmap);
+ else
+ apic_clear_vector(vector, bitmap);
+}
+
+#define SAVIC_ALLOWED_IRR 0x204
+
+/*
+ * When Secure AVIC is enabled, RDMSR/WRMSR of the APIC registers
+ * result in a #VC exception (for non-accelerated register accesses)
+ * with a VMEXIT_AVIC_NOACCEL error code. The #VC exception handler
+ * can read/write the x2APIC register in the guest APIC backing page.
+ *
+ * Since doing this would increase the latency of accessing x2APIC
+ * registers, instead of doing RDMSR/WRMSR based accesses and
+ * handling the APIC register reads/writes in the #VC exception handler,
+ * the read() and write() callbacks directly read/write the APIC register
+ * from/to the vCPU's APIC backing page.
+ */
+static u32 savic_read(u32 reg)
+{
+ void *ap = this_cpu_ptr(savic_page);
+
+ switch (reg) {
+ case APIC_LVTT:
+ case APIC_TMICT:
+ case APIC_TMCCT:
+ case APIC_TDCR:
+ case APIC_LVTTHMR:
+ case APIC_LVTPC:
+ case APIC_LVT0:
+ case APIC_LVT1:
+ case APIC_LVTERR:
+ return savic_ghcb_msr_read(reg);
+ case APIC_ID:
+ case APIC_LVR:
+ case APIC_TASKPRI:
+ case APIC_ARBPRI:
+ case APIC_PROCPRI:
+ case APIC_LDR:
+ case APIC_SPIV:
+ case APIC_ESR:
+ case APIC_EFEAT:
+ case APIC_ECTRL:
+ case APIC_SEOI:
+ case APIC_IER:
+ case APIC_EILVTn(0) ... APIC_EILVTn(3):
+ return apic_get_reg(ap, reg);
+ case APIC_ICR:
+ return (u32)apic_get_reg64(ap, reg);
+ case APIC_ISR ... APIC_ISR + 0x70:
+ case APIC_TMR ... APIC_TMR + 0x70:
+ if (WARN_ONCE(!IS_ALIGNED(reg, 16),
+ "APIC register read offset 0x%x not aligned at 16 bytes", reg))
+ return 0;
+ return apic_get_reg(ap, reg);
+ /* IRR and ALLOWED_IRR offset range */
+ case APIC_IRR ... APIC_IRR + 0x74:
+ /*
+ * Valid APIC_IRR/SAVIC_ALLOWED_IRR registers are at 16 bytes strides from
+ * their respective base offset. APIC_IRRs are in the range
+ *
+ * (0x200, 0x210, ..., 0x270)
+ *
+ * while the SAVIC_ALLOWED_IRR range starts 4 bytes later, in the range
+ *
+ * (0x204, 0x214, ..., 0x274).
+ *
+ * Filter out everything else.
+ */
+ if (WARN_ONCE(!(IS_ALIGNED(reg, 16) ||
+ IS_ALIGNED(reg - 4, 16)),
+ "Misaligned APIC_IRR/ALLOWED_IRR APIC register read offset 0x%x", reg))
+ return 0;
+ return apic_get_reg(ap, reg);
+ default:
+ pr_err("Error reading unknown Secure AVIC reg offset 0x%x\n", reg);
+ return 0;
+ }
+}
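The offset filtering above accepts only two stride patterns in the 0x200-0x274 window. A stand-alone sketch of the same classification; the sample offsets are arbitrary:

#include <stdio.h>

static const char *classify(unsigned int reg)
{
	if (reg % 16 == 0)
		return "APIC_IRR";		/* 0x200, 0x210, ..., 0x270 */
	if ((reg - 4) % 16 == 0)
		return "SAVIC_ALLOWED_IRR";	/* 0x204, 0x214, ..., 0x274 */
	return "invalid";
}

int main(void)
{
	printf("0x210: %s\n", classify(0x210));
	printf("0x214: %s\n", classify(0x214));
	printf("0x218: %s\n", classify(0x218));
	return 0;
}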
+
+#define SAVIC_NMI_REQ 0x278
+
+/*
+ * On WRMSR to APIC_SELF_IPI register by the guest, Secure AVIC hardware
+ * updates the APIC_IRR in the APIC backing page of the vCPU. In addition,
+ * hardware evaluates the new APIC_IRR update for interrupt injection to
+ * the vCPU. So, self IPIs are hardware-accelerated.
+ */
+static inline void self_ipi_reg_write(unsigned int vector)
+{
+ native_apic_msr_write(APIC_SELF_IPI, vector);
+}
+
+static void send_ipi_dest(unsigned int cpu, unsigned int vector, bool nmi)
+{
+ if (nmi)
+ apic_set_reg(per_cpu_ptr(savic_page, cpu), SAVIC_NMI_REQ, 1);
+ else
+ update_vector(cpu, APIC_IRR, vector, true);
+}
+
+static void send_ipi_allbut(unsigned int vector, bool nmi)
+{
+ unsigned int cpu, src_cpu;
+
+ guard(irqsave)();
+
+ src_cpu = raw_smp_processor_id();
+
+ for_each_cpu(cpu, cpu_online_mask) {
+ if (cpu == src_cpu)
+ continue;
+ send_ipi_dest(cpu, vector, nmi);
+ }
+}
+
+static inline void self_ipi(unsigned int vector, bool nmi)
+{
+ u32 icr_low = APIC_SELF_IPI | vector;
+
+ if (nmi)
+ icr_low |= APIC_DM_NMI;
+
+ native_x2apic_icr_write(icr_low, 0);
+}
+
+static void savic_icr_write(u32 icr_low, u32 icr_high)
+{
+ unsigned int dsh, vector;
+ u64 icr_data;
+ bool nmi;
+
+ dsh = icr_low & APIC_DEST_ALLBUT;
+ vector = icr_low & APIC_VECTOR_MASK;
+ nmi = ((icr_low & APIC_DM_FIXED_MASK) == APIC_DM_NMI);
+
+ switch (dsh) {
+ case APIC_DEST_SELF:
+ self_ipi(vector, nmi);
+ break;
+ case APIC_DEST_ALLINC:
+ self_ipi(vector, nmi);
+ fallthrough;
+ case APIC_DEST_ALLBUT:
+ send_ipi_allbut(vector, nmi);
+ break;
+ default:
+ send_ipi_dest(icr_high, vector, nmi);
+ break;
+ }
+
+ icr_data = ((u64)icr_high) << 32 | icr_low;
+ if (dsh != APIC_DEST_SELF)
+ savic_ghcb_msr_write(APIC_ICR, icr_data);
+ apic_set_reg64(this_cpu_ptr(savic_page), APIC_ICR, icr_data);
+}
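savic_icr_write() mirrors the ICR into the backing page as a single 64-bit value with the destination APIC ID in the high word. A stand-alone sketch of that composition and of extracting the fields again; the vector and destination are invented:

#include <stdio.h>
#include <stdint.h>

#define APIC_VECTOR_MASK	0xffULL
#define APIC_DEST_ALLBUT	0xc0000ULL	/* shorthand field */

int main(void)
{
	uint32_t icr_low = 0x61;	/* vector 0x61, no shorthand */
	uint32_t icr_high = 0x23;	/* destination APIC ID */
	uint64_t icr = ((uint64_t)icr_high << 32) | icr_low;

	printf("icr=0x%llx vector=0x%llx shorthand=0x%llx dest=0x%llx\n",
	       (unsigned long long)icr,
	       (unsigned long long)(icr & APIC_VECTOR_MASK),
	       (unsigned long long)(icr & APIC_DEST_ALLBUT),
	       (unsigned long long)(icr >> 32));
	return 0;
}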
+
+static void savic_write(u32 reg, u32 data)
+{
+ void *ap = this_cpu_ptr(savic_page);
+
+ switch (reg) {
+ case APIC_LVTT:
+ case APIC_TMICT:
+ case APIC_TDCR:
+ case APIC_LVT0:
+ case APIC_LVT1:
+ case APIC_LVTTHMR:
+ case APIC_LVTPC:
+ case APIC_LVTERR:
+ savic_ghcb_msr_write(reg, data);
+ break;
+ case APIC_TASKPRI:
+ case APIC_EOI:
+ case APIC_SPIV:
+ case SAVIC_NMI_REQ:
+ case APIC_ESR:
+ case APIC_ECTRL:
+ case APIC_SEOI:
+ case APIC_IER:
+ case APIC_EILVTn(0) ... APIC_EILVTn(3):
+ apic_set_reg(ap, reg, data);
+ break;
+ case APIC_ICR:
+ savic_icr_write(data, 0);
+ break;
+ case APIC_SELF_IPI:
+ self_ipi_reg_write(data);
+ break;
+ /* ALLOWED_IRR offsets are writable */
+ case SAVIC_ALLOWED_IRR ... SAVIC_ALLOWED_IRR + 0x70:
+ if (IS_ALIGNED(reg - 4, 16)) {
+ apic_set_reg(ap, reg, data);
+ break;
+ }
+ fallthrough;
+ default:
+ pr_err("Error writing unknown Secure AVIC reg offset 0x%x\n", reg);
+ }
+}
+
+static void send_ipi(u32 dest, unsigned int vector, unsigned int dsh)
+{
+ unsigned int icr_low;
+
+ icr_low = __prepare_ICR(dsh, vector, APIC_DEST_PHYSICAL);
+ savic_icr_write(icr_low, dest);
+}
+
+static void savic_send_ipi(int cpu, int vector)
+{
+ u32 dest = per_cpu(x86_cpu_to_apicid, cpu);
+
+ send_ipi(dest, vector, 0);
+}
+
+static void send_ipi_mask(const struct cpumask *mask, unsigned int vector, bool excl_self)
+{
+ unsigned int cpu, this_cpu;
+
+ guard(irqsave)();
+
+ this_cpu = raw_smp_processor_id();
+
+ for_each_cpu(cpu, mask) {
+ if (excl_self && cpu == this_cpu)
+ continue;
+ send_ipi(per_cpu(x86_cpu_to_apicid, cpu), vector, 0);
+ }
+}
+
+static void savic_send_ipi_mask(const struct cpumask *mask, int vector)
+{
+ send_ipi_mask(mask, vector, false);
+}
+
+static void savic_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
+{
+ send_ipi_mask(mask, vector, true);
+}
+
+static void savic_send_ipi_allbutself(int vector)
+{
+ send_ipi(0, vector, APIC_DEST_ALLBUT);
+}
+
+static void savic_send_ipi_all(int vector)
+{
+ send_ipi(0, vector, APIC_DEST_ALLINC);
+}
+
+static void savic_send_ipi_self(int vector)
+{
+ self_ipi_reg_write(vector);
+}
+
+static void savic_update_vector(unsigned int cpu, unsigned int vector, bool set)
+{
+ update_vector(cpu, SAVIC_ALLOWED_IRR, vector, set);
+}
+
+static void savic_eoi(void)
+{
+ unsigned int cpu;
+ int vec;
+
+ cpu = raw_smp_processor_id();
+ vec = apic_find_highest_vector(get_reg_bitmap(cpu, APIC_ISR));
+ if (WARN_ONCE(vec == -1, "EOI write while no active interrupt in APIC_ISR"))
+ return;
+
+ /* Is this a level-triggered interrupt? */
+ if (apic_test_vector(vec, get_reg_bitmap(cpu, APIC_TMR))) {
+ update_vector(cpu, APIC_ISR, vec, false);
+ /*
+ * Propagate the EOI write to the hypervisor for level-triggered
+ * interrupts. Return to the guest from GHCB protocol event takes
+ * care of re-evaluating interrupt state.
+ */
+ savic_ghcb_msr_write(APIC_EOI, 0);
+ } else {
+ /*
+ * Hardware clears APIC_ISR and re-evaluates the interrupt state
+ * to determine if there is any pending interrupt which can be
+ * delivered to CPU.
+ */
+ native_apic_msr_eoi();
+ }
+}
+
+static void savic_teardown(void)
+{
+ /* Disable Secure AVIC */
+ native_wrmsrq(MSR_AMD64_SAVIC_CONTROL, 0);
+ savic_unregister_gpa(NULL);
+}
+
+static void savic_setup(void)
+{
+ void *ap = this_cpu_ptr(savic_page);
+ enum es_result res;
+ unsigned long gpa;
+
+ /*
+ * Before Secure AVIC is enabled, APIC MSR reads are intercepted.
+ * APIC_ID MSR read returns the value from the hypervisor.
+ */
+ apic_set_reg(ap, APIC_ID, native_apic_msr_read(APIC_ID));
+
+ gpa = __pa(ap);
+
+ /*
+ * The NPT entry for a vCPU's APIC backing page must always be
+ * present when the vCPU is running in order for Secure AVIC to
+ * function. A VMEXIT_BUSY is returned on VMRUN and the vCPU cannot
+ * be resumed if the NPT entry for the APIC backing page is not
+ * present. Notify the hypervisor of the GPA of the vCPU's APIC
+ * backing page by calling savic_register_gpa(). Before executing
+ * VMRUN, the hypervisor makes use of this information to make sure
+ * the APIC backing page is mapped in NPT.
+ */
+ res = savic_register_gpa(gpa);
+ if (res != ES_OK)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL);
+
+ native_wrmsrq(MSR_AMD64_SAVIC_CONTROL,
+ gpa | MSR_AMD64_SAVIC_EN | MSR_AMD64_SAVIC_ALLOWEDNMI);
+}
+
+static int savic_probe(void)
+{
+ if (!cc_platform_has(CC_ATTR_SNP_SECURE_AVIC))
+ return 0;
+
+ if (!x2apic_mode) {
+ pr_err("Secure AVIC enabled in non x2APIC mode\n");
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL);
+ /* unreachable */
+ }
+
+ savic_page = alloc_percpu(struct secure_avic_page);
+ if (!savic_page)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL);
+
+ return 1;
+}
+
+static struct apic apic_x2apic_savic __ro_after_init = {
+
+ .name = "secure avic x2apic",
+ .probe = savic_probe,
+ .acpi_madt_oem_check = savic_acpi_madt_oem_check,
+ .setup = savic_setup,
+ .teardown = savic_teardown,
+
+ .dest_mode_logical = false,
+
+ .disable_esr = 0,
+
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+
+ .max_apic_id = UINT_MAX,
+ .x2apic_set_max_apicid = true,
+ .get_apic_id = x2apic_get_apic_id,
+
+ .calc_dest_apicid = apic_default_calc_apicid,
+
+ .send_IPI = savic_send_ipi,
+ .send_IPI_mask = savic_send_ipi_mask,
+ .send_IPI_mask_allbutself = savic_send_ipi_mask_allbutself,
+ .send_IPI_allbutself = savic_send_ipi_allbutself,
+ .send_IPI_all = savic_send_ipi_all,
+ .send_IPI_self = savic_send_ipi_self,
+
+ .nmi_to_offline_cpu = true,
+
+ .read = savic_read,
+ .write = savic_write,
+ .eoi = savic_eoi,
+ .icr_read = native_x2apic_icr_read,
+ .icr_write = savic_icr_write,
+
+ .update_vector = savic_update_vector,
+};
+
+apic_driver(apic_x2apic_savic);
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index a555da094157..15209f220e1f 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -5,66 +5,54 @@
*
* SGI UV APIC functions (note: not an Intel compatible APIC)
*
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
* Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved.
*/
+#include <linux/crash_dump.h>
+#include <linux/cpuhotplug.h>
#include <linux/cpumask.h>
-#include <linux/hardirq.h>
#include <linux/proc_fs.h>
-#include <linux/threads.h>
-#include <linux/kernel.h>
+#include <linux/memory.h>
#include <linux/export.h>
-#include <linux/string.h>
-#include <linux/ctype.h>
-#include <linux/sched.h>
-#include <linux/timer.h>
-#include <linux/slab.h>
-#include <linux/cpu.h>
-#include <linux/init.h>
-#include <linux/io.h>
#include <linux/pci.h>
-#include <linux/kdebug.h>
-#include <linux/delay.h>
-#include <linux/crash_dump.h>
-#include <linux/reboot.h>
-#include <linux/memory.h>
+#include <linux/acpi.h>
+#include <linux/efi.h>
+#include <asm/e820/api.h>
#include <asm/uv/uv_mmrs.h>
#include <asm/uv/uv_hub.h>
-#include <asm/current.h>
-#include <asm/pgtable.h>
#include <asm/uv/bios.h>
#include <asm/uv/uv.h>
#include <asm/apic.h>
-#include <asm/e820/api.h>
-#include <asm/ipi.h>
-#include <asm/smp.h>
-#include <asm/x86_init.h>
-#include <asm/nmi.h>
-DEFINE_PER_CPU(int, x2apic_extra_bits);
+#include "local.h"
static enum uv_system_type uv_system_type;
-static bool uv_hubless_system;
+static int uv_hubbed_system;
+static int uv_hubless_system;
static u64 gru_start_paddr, gru_end_paddr;
-static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr;
-static u64 gru_dist_lmask, gru_dist_umask;
static union uvh_apicid uvh_apicid;
+static int uv_node_id;
+
+/* Unpack AT/OEM/TABLE IDs to be NULL-terminated strings */
+static u8 uv_archtype[UV_AT_SIZE + 1];
+static u8 oem_id[ACPI_OEM_ID_SIZE + 1];
+static u8 oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1];
-/* Information derived from CPUID: */
+/* Information derived from CPUID and some UV MMRs */
static struct {
unsigned int apicid_shift;
unsigned int apicid_mask;
- unsigned int socketid_shift; /* aka pnode_shift for UV1/2/3 */
+ unsigned int socketid_shift; /* aka pnode_shift for UV2/3 */
unsigned int pnode_mask;
+ unsigned int nasid_shift;
unsigned int gpa_shift;
unsigned int gnode_shift;
+ unsigned int m_skt;
+ unsigned int n_skt;
} uv_cpuid;
-int uv_min_hub_revision_id;
-EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
-
-unsigned int uv_apicid_hibits;
-EXPORT_SYMBOL_GPL(uv_apicid_hibits);
+static int uv_min_hub_revision_id;
static struct apic apic_x2apic_uv_x;
static struct uv_hub_info_s uv_hub_info_node0;
@@ -97,20 +85,10 @@ static unsigned long __init uv_early_read_mmr(unsigned long addr)
static inline bool is_GRU_range(u64 start, u64 end)
{
- if (gru_dist_base) {
- u64 su = start & gru_dist_umask; /* Upper (incl pnode) bits */
- u64 sl = start & gru_dist_lmask; /* Base offset bits */
- u64 eu = end & gru_dist_umask;
- u64 el = end & gru_dist_lmask;
-
- /* Must reside completely within a single GRU range: */
- return (sl == gru_dist_base && el == gru_dist_base &&
- su >= gru_first_node_paddr &&
- su <= gru_last_node_paddr &&
- eu == su);
- } else {
- return start >= gru_start_paddr && end <= gru_end_paddr;
- }
+ if (!gru_start_paddr)
+ return false;
+
+ return start >= gru_start_paddr && end <= gru_end_paddr;
}
static bool uv_is_untracked_pat_range(u64 start, u64 end)
@@ -118,43 +96,102 @@ static bool uv_is_untracked_pat_range(u64 start, u64 end)
return is_ISA_range(start, end) || is_GRU_range(start, end);
}
-static int __init early_get_pnodeid(void)
+static void __init early_get_pnodeid(void)
{
- union uvh_node_id_u node_id;
- union uvh_rh_gam_config_mmr_u m_n_config;
int pnode;
- /* Currently, all blades have same revision number */
+ uv_cpuid.m_skt = 0;
+ if (UVH_RH10_GAM_ADDR_MAP_CONFIG) {
+ union uvh_rh10_gam_addr_map_config_u m_n_config;
+
+ m_n_config.v = uv_early_read_mmr(UVH_RH10_GAM_ADDR_MAP_CONFIG);
+ uv_cpuid.n_skt = m_n_config.s.n_skt;
+ uv_cpuid.nasid_shift = 0;
+ } else if (UVH_RH_GAM_ADDR_MAP_CONFIG) {
+ union uvh_rh_gam_addr_map_config_u m_n_config;
+
+ m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_ADDR_MAP_CONFIG);
+ uv_cpuid.n_skt = m_n_config.s.n_skt;
+ if (is_uv(UV3))
+ uv_cpuid.m_skt = m_n_config.s3.m_skt;
+ if (is_uv(UV2))
+ uv_cpuid.m_skt = m_n_config.s2.m_skt;
+ uv_cpuid.nasid_shift = 1;
+ } else {
+ unsigned long GAM_ADDR_MAP_CONFIG = 0;
+
+ WARN(GAM_ADDR_MAP_CONFIG == 0,
+ "UV: WARN: GAM_ADDR_MAP_CONFIG is not available\n");
+ uv_cpuid.n_skt = 0;
+ uv_cpuid.nasid_shift = 0;
+ }
+
+ if (is_uv(UV4|UVY))
+ uv_cpuid.gnode_shift = 2; /* min partition is 4 sockets */
+
+ uv_cpuid.pnode_mask = (1 << uv_cpuid.n_skt) - 1;
+ pnode = (uv_node_id >> uv_cpuid.nasid_shift) & uv_cpuid.pnode_mask;
+ uv_cpuid.gpa_shift = 46; /* Default unless changed */
+
+ pr_info("UV: n_skt:%d pnmsk:%x pn:%x\n",
+ uv_cpuid.n_skt, uv_cpuid.pnode_mask, pnode);
+}
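
A minimal sketch (illustrative, not part of the patch) of the
NASID-to-pnode math above, assuming a pre-UV5 hub where nasid_shift is 1:

	/* Mirrors the masking in early_get_pnodeid() */
	static unsigned int example_pnode(unsigned int node_id,
					  unsigned int n_skt,
					  unsigned int nasid_shift)
	{
		unsigned int pnode_mask = (1 << n_skt) - 1;

		/* e.g. node_id=0x6, n_skt=3, nasid_shift=1 -> pnode=0x3 */
		return (node_id >> nasid_shift) & pnode_mask;
	}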
+
+/* Running on a UV Hubbed system, determine which UV Hub Type it is */
+static int __init early_set_hub_type(void)
+{
+ union uvh_node_id_u node_id;
+
+ /*
+ * The NODE_ID MMR is always at offset 0.
+ * Contains the chip part # + revision.
+	 * The node_id field was originally 15 bits wide;
+	 * it is now 7, with the upper 8 bits masked to 0.
+ * All blades/nodes have the same part # and hub revision.
+ */
node_id.v = uv_early_read_mmr(UVH_NODE_ID);
- m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR);
- uv_min_hub_revision_id = node_id.s.revision;
+ uv_node_id = node_id.sx.node_id;
switch (node_id.s.part_number) {
- case UV2_HUB_PART_NUMBER:
- case UV2_HUB_PART_NUMBER_X:
- uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;
+
+ case UV5_HUB_PART_NUMBER:
+ uv_min_hub_revision_id = node_id.s.revision
+ + UV5_HUB_REVISION_BASE;
+ uv_hub_type_set(UV5);
+ break;
+
+ /* UV4/4A only have a revision difference */
+ case UV4_HUB_PART_NUMBER:
+ uv_min_hub_revision_id = node_id.s.revision
+ + UV4_HUB_REVISION_BASE - 1;
+ uv_hub_type_set(UV4);
+ if (uv_min_hub_revision_id == UV4A_HUB_REVISION_BASE)
+ uv_hub_type_set(UV4|UV4A);
break;
+
case UV3_HUB_PART_NUMBER:
case UV3_HUB_PART_NUMBER_X:
- uv_min_hub_revision_id += UV3_HUB_REVISION_BASE;
+ uv_min_hub_revision_id = node_id.s.revision
+ + UV3_HUB_REVISION_BASE;
+ uv_hub_type_set(UV3);
break;
- /* Update: UV4A has only a modified revision to indicate HUB fixes */
- case UV4_HUB_PART_NUMBER:
- uv_min_hub_revision_id += UV4_HUB_REVISION_BASE - 1;
- uv_cpuid.gnode_shift = 2; /* min partition is 4 sockets */
+ case UV2_HUB_PART_NUMBER:
+ case UV2_HUB_PART_NUMBER_X:
+ uv_min_hub_revision_id = node_id.s.revision
+ + UV2_HUB_REVISION_BASE - 1;
+ uv_hub_type_set(UV2);
break;
+
+ default:
+ return 0;
}
- uv_hub_info->hub_revision = uv_min_hub_revision_id;
- uv_cpuid.pnode_mask = (1 << m_n_config.s.n_skt) - 1;
- pnode = (node_id.s.node_id >> 1) & uv_cpuid.pnode_mask;
- uv_cpuid.gpa_shift = 46; /* Default unless changed */
+ pr_info("UV: part#:%x rev:%d rev_id:%d UVtype:0x%x\n",
+ node_id.s.part_number, node_id.s.revision,
+ uv_min_hub_revision_id, is_uv(~0));
- pr_info("UV: rev:%d part#:%x nodeid:%04x n_skt:%d pnmsk:%x pn:%x\n",
- node_id.s.revision, node_id.s.part_number, node_id.s.node_id,
- m_n_config.s.n_skt, uv_cpuid.pnode_mask, pnode);
- return pnode;
+ return 1;
}
static void __init uv_tsc_check_sync(void)
@@ -163,121 +200,198 @@ static void __init uv_tsc_check_sync(void)
int sync_state;
int mmr_shift;
char *state;
- bool valid;
- /* Accommodate different UV arch BIOSes */
+ /* UV5 guarantees synced TSCs; do not zero TSC_ADJUST */
+ if (!is_uv(UV2|UV3|UV4)) {
+ mark_tsc_async_resets("UV5+");
+ return;
+ }
+
+	/* UV2/3/4: UV BIOS TSC sync state available */
mmr = uv_early_read_mmr(UVH_TSC_SYNC_MMR);
mmr_shift =
- is_uv1_hub() ? 0 :
is_uv2_hub() ? UVH_TSC_SYNC_SHIFT_UV2K : UVH_TSC_SYNC_SHIFT;
- if (mmr_shift)
- sync_state = (mmr >> mmr_shift) & UVH_TSC_SYNC_MASK;
- else
- sync_state = 0;
+ sync_state = (mmr >> mmr_shift) & UVH_TSC_SYNC_MASK;
+ /* Check if TSC is valid for all sockets */
switch (sync_state) {
case UVH_TSC_SYNC_VALID:
state = "in sync";
- valid = true;
+ mark_tsc_async_resets("UV BIOS");
break;
- case UVH_TSC_SYNC_INVALID:
- state = "unstable";
- valid = false;
+ /* If BIOS state unknown, don't do anything */
+ case UVH_TSC_SYNC_UNKNOWN:
+ state = "unknown";
break;
+
+ /* Otherwise, BIOS indicates problem with TSC */
default:
- state = "unknown: assuming valid";
- valid = true;
+ state = "unstable";
+ mark_tsc_unstable("UV BIOS");
break;
}
pr_info("UV: TSC sync state from BIOS:0%d(%s)\n", sync_state, state);
-
- /* Mark flag that says TSC != 0 is valid for socket 0 */
- if (valid)
- mark_tsc_async_resets("UV BIOS");
- else
- mark_tsc_unstable("UV BIOS");
}
-/* [Copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */
+/* Selector for (4|4A|5) structs */
+#define uvxy_field(sname, field, undef) ( \
+ is_uv(UV4A) ? sname.s4a.field : \
+ is_uv(UV4) ? sname.s4.field : \
+ is_uv(UV3) ? sname.s3.field : \
+ undef)
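
Usage note: the selector returns the bitfield from whichever struct view
matches the running hub type, so callers can read revision-specific
fields through one expression, as map_mmioh_high() does further below:

	/* Reads mmioh0.s4a.base on UV4A, .s4.base on UV4, .s3.base on
	 * UV3, and the fallback value (0 here) on anything else:
	 */
	unsigned long base = uvxy_field(mmioh0, base, 0);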
-#define SMT_LEVEL 0 /* Leaf 0xb SMT level */
-#define INVALID_TYPE 0 /* Leaf 0xb sub-leaf types */
-#define SMT_TYPE 1
-#define CORE_TYPE 2
-#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff)
-#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f)
-
-static void set_x2apic_bits(void)
+static void __init early_get_apic_socketid_shift(void)
{
- unsigned int eax, ebx, ecx, edx, sub_index;
- unsigned int sid_shift;
+ unsigned int sid_shift = topology_get_domain_shift(TOPO_PKG_DOMAIN);
- cpuid(0, &eax, &ebx, &ecx, &edx);
- if (eax < 0xb) {
- pr_info("UV: CPU does not have CPUID.11\n");
- return;
- }
+ if (is_uv2_hub() || is_uv3_hub())
+ uvh_apicid.v = uv_early_read_mmr(UVH_APICID);
- cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
- if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) {
- pr_info("UV: CPUID.11 not implemented\n");
- return;
+ if (sid_shift) {
+ uv_cpuid.apicid_shift = 0;
+ uv_cpuid.apicid_mask = (~(-1 << sid_shift));
+ uv_cpuid.socketid_shift = sid_shift;
+ } else {
+ pr_info("UV: CPU does not have valid CPUID.11\n");
}
- sid_shift = BITS_SHIFT_NEXT_LEVEL(eax);
- sub_index = 1;
- do {
- cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx);
- if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) {
- sid_shift = BITS_SHIFT_NEXT_LEVEL(eax);
- break;
- }
- sub_index++;
- } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
+ pr_info("UV: apicid_shift:%d apicid_mask:0x%x\n", uv_cpuid.apicid_shift, uv_cpuid.apicid_mask);
+ pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", uv_cpuid.socketid_shift, uv_cpuid.pnode_mask);
+}
- uv_cpuid.apicid_shift = 0;
- uv_cpuid.apicid_mask = (~(-1 << sid_shift));
- uv_cpuid.socketid_shift = sid_shift;
+static void __init uv_stringify(int len, char *to, char *from)
+{
+ strscpy(to, from, len);
+
+ /* Trim trailing spaces */
+ (void)strim(to);
}
-static void __init early_get_apic_socketid_shift(void)
+/* Find UV arch type entry in UVsystab */
+static unsigned long __init early_find_archtype(struct uv_systab *st)
{
- if (is_uv2_hub() || is_uv3_hub())
- uvh_apicid.v = uv_early_read_mmr(UVH_APICID);
+ int i;
- set_x2apic_bits();
+ for (i = 0; st->entry[i].type != UV_SYSTAB_TYPE_UNUSED; i++) {
+ unsigned long ptr = st->entry[i].offset;
- pr_info("UV: apicid_shift:%d apicid_mask:0x%x\n", uv_cpuid.apicid_shift, uv_cpuid.apicid_mask);
- pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", uv_cpuid.socketid_shift, uv_cpuid.pnode_mask);
+ if (!ptr)
+ continue;
+ ptr += (unsigned long)st;
+ if (st->entry[i].type == UV_SYSTAB_TYPE_ARCH_TYPE)
+ return ptr;
+ }
+ return 0;
}
-/*
- * Add an extra bit as dictated by bios to the destination apicid of
- * interrupts potentially passing through the UV HUB. This prevents
- * a deadlock between interrupts and IO port operations.
- */
-static void __init uv_set_apicid_hibit(void)
+/* Validate UV arch type field in UVsystab */
+static int __init decode_arch_type(unsigned long ptr)
{
- union uv1h_lb_target_physical_apic_id_mask_u apicid_mask;
+ struct uv_arch_type_entry *uv_ate = (struct uv_arch_type_entry *)ptr;
+ int n = strlen(uv_ate->archtype);
- if (is_uv1_hub()) {
- apicid_mask.v = uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK);
- uv_apicid_hibits = apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK;
+ if (n > 0 && n < sizeof(uv_ate->archtype)) {
+ pr_info("UV: UVarchtype received from BIOS\n");
+ uv_stringify(sizeof(uv_archtype), uv_archtype, uv_ate->archtype);
+ return 1;
}
+ return 0;
}
-static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+/* Determine if UV arch type entry might exist in UVsystab */
+static int __init early_get_arch_type(void)
{
- int pnodeid;
- int uv_apic;
-
- if (strncmp(oem_id, "SGI", 3) != 0) {
- if (strncmp(oem_id, "NSGI", 4) == 0) {
- uv_hubless_system = true;
- pr_info("UV: OEM IDs %s/%s, HUBLESS\n",
- oem_id, oem_table_id);
- }
+ unsigned long uvst_physaddr, uvst_size, ptr;
+ struct uv_systab *st;
+ u32 rev;
+ int ret;
+
+ uvst_physaddr = get_uv_systab_phys(0);
+ if (!uvst_physaddr)
+ return 0;
+
+ st = early_memremap_ro(uvst_physaddr, sizeof(struct uv_systab));
+ if (!st) {
+ pr_err("UV: Cannot access UVsystab, remap failed\n");
+ return 0;
+ }
+
+ rev = st->revision;
+ if (rev < UV_SYSTAB_VERSION_UV5) {
+ early_memunmap(st, sizeof(struct uv_systab));
+ return 0;
+ }
+
+ uvst_size = st->size;
+ early_memunmap(st, sizeof(struct uv_systab));
+ st = early_memremap_ro(uvst_physaddr, uvst_size);
+ if (!st) {
+ pr_err("UV: Cannot access UVarchtype, remap failed\n");
+ return 0;
+ }
+
+ ptr = early_find_archtype(st);
+ if (!ptr) {
+ early_memunmap(st, uvst_size);
+ return 0;
+ }
+
+ ret = decode_arch_type(ptr);
+ early_memunmap(st, uvst_size);
+ return ret;
+}
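
The function above uses a two-pass early remap: map only the fixed-size
header to learn the table's revision and real size, unmap, then remap the
whole table. A generic sketch of the pattern (identifiers hypothetical,
not part of the patch):

	struct tbl_hdr { u32 revision; u32 size; };

	static void __init *map_whole_table(unsigned long phys)
	{
		struct tbl_hdr *hdr;
		unsigned long size;

		hdr = early_memremap_ro(phys, sizeof(*hdr));
		if (!hdr)
			return NULL;
		size = hdr->size;
		early_memunmap(hdr, sizeof(*hdr));

		/* Caller must early_memunmap(ptr, size) when done */
		return early_memremap_ro(phys, size);
	}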
+
+/* UV system found, check which APIC mode the BIOS has selected */
+static void __init early_set_apic_mode(void)
+{
+ if (x2apic_enabled())
+ uv_system_type = UV_X2APIC;
+ else
+ uv_system_type = UV_LEGACY_APIC;
+}
+
+static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id)
+{
+ /* Save OEM_ID passed from ACPI MADT */
+ uv_stringify(sizeof(oem_id), oem_id, _oem_id);
+
+ /* Check if BIOS sent us a UVarchtype */
+ if (!early_get_arch_type())
+
+		/* If not, use OEM ID for UVarchtype */
+ uv_stringify(sizeof(uv_archtype), uv_archtype, oem_id);
+
+ /* Check if not hubbed */
+ if (strncmp(uv_archtype, "SGI", 3) != 0) {
+
+ /* (Not hubbed), check if not hubless */
+ if (strncmp(uv_archtype, "NSGI", 4) != 0)
+
+ /* (Not hubless), not a UV */
+ return 0;
+
+ /* Is UV hubless system */
+ uv_hubless_system = 0x01;
+
+ /* UV5 Hubless */
+ if (strncmp(uv_archtype, "NSGI5", 5) == 0)
+ uv_hubless_system |= 0x20;
+
+ /* UV4 Hubless: CH */
+ else if (strncmp(uv_archtype, "NSGI4", 5) == 0)
+ uv_hubless_system |= 0x10;
+
+ /* UV3 Hubless: UV300/MC990X w/o hub */
+ else
+ uv_hubless_system |= 0x8;
+
+ /* Copy OEM Table ID */
+ uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id);
+
+ pr_info("UV: OEM IDs %s/%s, SystemType %d, HUBLESS ID %x\n",
+ oem_id, oem_table_id, uv_system_type, uv_hubless_system);
+
return 0;
}
@@ -286,62 +400,70 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
return 0;
}
- /* Set up early hub type field in uv_hub_info for Node 0 */
- uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0;
-
- /*
- * Determine UV arch type.
- * SGI: UV100/1000
- * SGI2: UV2000/3000
- * SGI3: UV300 (truncated to 4 chars because of different varieties)
- * SGI4: UV400 (truncated to 4 chars because of different varieties)
- */
+	/* Set hub revision if the arch type indicates a hubbed system */
uv_hub_info->hub_revision =
- !strncmp(oem_id, "SGI4", 4) ? UV4_HUB_REVISION_BASE :
- !strncmp(oem_id, "SGI3", 4) ? UV3_HUB_REVISION_BASE :
- !strcmp(oem_id, "SGI2") ? UV2_HUB_REVISION_BASE :
- !strcmp(oem_id, "SGI") ? UV1_HUB_REVISION_BASE : 0;
+ !strncmp(uv_archtype, "SGI5", 4) ? UV5_HUB_REVISION_BASE :
+ !strncmp(uv_archtype, "SGI4", 4) ? UV4_HUB_REVISION_BASE :
+ !strncmp(uv_archtype, "SGI3", 4) ? UV3_HUB_REVISION_BASE :
+ !strcmp(uv_archtype, "SGI2") ? UV2_HUB_REVISION_BASE : 0;
+
+ switch (uv_hub_info->hub_revision) {
+ case UV5_HUB_REVISION_BASE:
+ uv_hubbed_system = 0x21;
+ uv_hub_type_set(UV5);
+ break;
- if (uv_hub_info->hub_revision == 0)
- goto badbios;
+ case UV4_HUB_REVISION_BASE:
+ uv_hubbed_system = 0x11;
+ uv_hub_type_set(UV4);
+ break;
- pnodeid = early_get_pnodeid();
- early_get_apic_socketid_shift();
+ case UV3_HUB_REVISION_BASE:
+ uv_hubbed_system = 0x9;
+ uv_hub_type_set(UV3);
+ break;
+ case UV2_HUB_REVISION_BASE:
+ uv_hubbed_system = 0x5;
+ uv_hub_type_set(UV2);
+ break;
+
+ default:
+ return 0;
+ }
+
+ /* Get UV hub chip part number & revision */
+ early_set_hub_type();
+
+ /* Other UV setup functions */
+ early_set_apic_mode();
+ early_get_pnodeid();
+ early_get_apic_socketid_shift();
x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
x86_platform.nmi_init = uv_nmi_init;
+ uv_tsc_check_sync();
- if (!strcmp(oem_table_id, "UVX")) {
- /* This is the most common hardware variant: */
- uv_system_type = UV_X2APIC;
- uv_apic = 0;
-
- } else if (!strcmp(oem_table_id, "UVH")) {
- /* Only UV1 systems: */
- uv_system_type = UV_NON_UNIQUE_APIC;
- x86_platform.legacy.warm_reset = 0;
- __this_cpu_write(x2apic_extra_bits, pnodeid << uvh_apicid.s.pnode_shift);
- uv_set_apicid_hibit();
- uv_apic = 1;
-
- } else if (!strcmp(oem_table_id, "UVL")) {
- /* Only used for very small systems: */
- uv_system_type = UV_LEGACY_APIC;
- uv_apic = 0;
+ return 1;
+}
- } else {
- goto badbios;
- }
+/* Called early to probe for the correct APIC driver */
+static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id)
+{
+ /* Set up early hub info fields for Node 0 */
+ uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0;
- pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n", oem_id, oem_table_id, uv_system_type, uv_min_hub_revision_id, uv_apic);
- uv_tsc_check_sync();
+ /* If not UV, return. */
+ if (uv_set_system_type(_oem_id, _oem_table_id) == 0)
+ return 0;
+
+ /* Save for display of the OEM Table ID */
+ uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id);
- return uv_apic;
+ pr_info("UV: OEM IDs %s/%s, System/UVType %d/0x%x, HUB RevID %d\n",
+ oem_id, oem_table_id, uv_system_type, is_uv(UV_ANY),
+ uv_min_hub_revision_id);
-badbios:
- pr_err("UV: OEM_ID:%s OEM_TABLE_ID:%s\n", oem_id, oem_table_id);
- pr_err("Current BIOS not supported, update kernel and/or BIOS\n");
- BUG();
+ return 0;
}
enum uv_system_type get_uv_system_type(void)
@@ -349,17 +471,34 @@ enum uv_system_type get_uv_system_type(void)
return uv_system_type;
}
+int uv_get_hubless_system(void)
+{
+ return uv_hubless_system;
+}
+EXPORT_SYMBOL_GPL(uv_get_hubless_system);
+
+ssize_t uv_get_archtype(char *buf, int len)
+{
+ return scnprintf(buf, len, "%s/%s", uv_archtype, oem_table_id);
+}
+EXPORT_SYMBOL_GPL(uv_get_archtype);
+
int is_uv_system(void)
{
return uv_system_type != UV_NONE;
}
EXPORT_SYMBOL_GPL(is_uv_system);
-int is_uv_hubless(void)
+int is_uv_hubbed(int uvtype)
{
- return uv_hubless_system;
+ return (uv_hubbed_system & uvtype);
+}
+EXPORT_SYMBOL_GPL(is_uv_hubbed);
+
+static int is_uv_hubless(int uvtype)
+{
+ return (uv_hubless_system & uvtype);
}
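
Since uv_hubbed_system and uv_hubless_system are bitmasks (base bit 0x01
plus a per-generation bit, as set in uv_set_system_type() above), callers
can test for a whole class of system with a single AND. For example, on an
NSGI4 (UV4 hubless) system uv_hubless_system == 0x11, so:

	is_uv_hubless(0x10);	/* true:  UV4 hubless */
	is_uv_hubless(0x20);	/* false: not UV5 hubless */
	is_uv_hubless(0x01);	/* true:  any hubless UV */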
-EXPORT_SYMBOL_GPL(is_uv_hubless);
void **__uv_hub_info_list;
EXPORT_SYMBOL_GPL(__uv_hub_info_list);
@@ -374,7 +513,6 @@ unsigned long sn_rtc_cycles_per_second;
EXPORT_SYMBOL(sn_rtc_cycles_per_second);
/* The following values are used for the per node hub info struct */
-static __initdata unsigned short *_node_to_pnode;
static __initdata unsigned short _min_socket, _max_socket;
static __initdata unsigned short _min_pnode, _max_pnode, _gr_table_len;
static __initdata struct uv_gam_range_entry *uv_gre_table;
@@ -382,17 +520,12 @@ static __initdata struct uv_gam_parameters *uv_gp_table;
static __initdata unsigned short *_socket_to_node;
static __initdata unsigned short *_socket_to_pnode;
static __initdata unsigned short *_pnode_to_socket;
+static __initdata unsigned short *_node_to_socket;
static __initdata struct uv_gam_range_s *_gr_table;
#define SOCK_EMPTY ((unsigned short)~0)
-extern int uv_hub_info_version(void)
-{
- return UV_HUB_INFO_VERSION;
-}
-EXPORT_SYMBOL(uv_hub_info_version);
-
/* Default UV memory block size is 2GB */
static unsigned long mem_block_size __initdata = (2UL << 30);
@@ -451,7 +584,8 @@ static __init void build_uv_gr_table(void)
bytes = _gr_table_len * sizeof(struct uv_gam_range_s);
grt = kzalloc(bytes, GFP_KERNEL);
- BUG_ON(!grt);
+ if (WARN_ON_ONCE(!grt))
+ return;
_gr_table = grt;
for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
@@ -533,13 +667,12 @@ static __init void build_uv_gr_table(void)
}
}
-static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+static int uv_wakeup_secondary(u32 phys_apicid, unsigned long start_rip, unsigned int cpu)
{
unsigned long val;
int pnode;
pnode = uv_apicid_to_pnode(phys_apicid);
- phys_apicid |= uv_apicid_hibits;
val = (1UL << UVH_IPI_INT_SEND_SHFT) |
(phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
@@ -560,12 +693,21 @@ static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
static void uv_send_IPI_one(int cpu, int vector)
{
- unsigned long apicid;
- int pnode;
+ unsigned long apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ int pnode = uv_apicid_to_pnode(apicid);
+ unsigned long dmode, val;
+
+ if (vector == NMI_VECTOR)
+ dmode = APIC_DELIVERY_MODE_NMI;
+ else
+ dmode = APIC_DELIVERY_MODE_FIXED;
- apicid = per_cpu(x86_cpu_to_apicid, cpu);
- pnode = uv_apicid_to_pnode(apicid);
- uv_hub_send_ipi(pnode, apicid, vector);
+ val = (1UL << UVH_IPI_INT_SEND_SHFT) |
+ (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
+ (dmode << UVH_IPI_INT_DELIVERY_MODE_SHFT) |
+ (vector << UVH_IPI_INT_VECTOR_SHFT);
+
+ uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
}
static void uv_send_IPI_mask(const struct cpumask *mask, int vector)
@@ -603,56 +745,6 @@ static void uv_send_IPI_all(int vector)
uv_send_IPI_mask(cpu_online_mask, vector);
}
-static int uv_apic_id_valid(u32 apicid)
-{
- return 1;
-}
-
-static int uv_apic_id_registered(void)
-{
- return 1;
-}
-
-static void uv_init_apic_ldr(void)
-{
-}
-
-static u32 apic_uv_calc_apicid(unsigned int cpu)
-{
- return apic_default_calc_apicid(cpu) | uv_apicid_hibits;
-}
-
-static unsigned int x2apic_get_apic_id(unsigned long x)
-{
- unsigned int id;
-
- WARN_ON(preemptible() && num_online_cpus() > 1);
- id = x | __this_cpu_read(x2apic_extra_bits);
-
- return id;
-}
-
-static u32 set_apic_id(unsigned int id)
-{
- /* CHECKME: Do we need to mask out the xapic extra bits? */
- return id;
-}
-
-static unsigned int uv_read_apic_id(void)
-{
- return x2apic_get_apic_id(apic_read(APIC_ID));
-}
-
-static int uv_phys_pkg_id(int initial_apicid, int index_msb)
-{
- return uv_read_apic_id() >> index_msb;
-}
-
-static void uv_send_IPI_self(int vector)
-{
- apic_write(APIC_SELF_IPI, vector);
-}
-
static int uv_probe(void)
{
return apic == &apic_x2apic_uv_x;
@@ -663,61 +755,41 @@ static struct apic apic_x2apic_uv_x __ro_after_init = {
.name = "UV large system",
.probe = uv_probe,
.acpi_madt_oem_check = uv_acpi_madt_oem_check,
- .apic_id_valid = uv_apic_id_valid,
- .apic_id_registered = uv_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 0, /* Physical */
+ .dest_mode_logical = false,
.disable_esr = 0,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = NULL,
-
- .init_apic_ldr = uv_init_apic_ldr,
- .ioapic_phys_id_map = NULL,
- .setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
- .phys_pkg_id = uv_phys_pkg_id,
+ .max_apic_id = UINT_MAX,
.get_apic_id = x2apic_get_apic_id,
- .set_apic_id = set_apic_id,
- .calc_dest_apicid = apic_uv_calc_apicid,
+ .calc_dest_apicid = apic_default_calc_apicid,
.send_IPI = uv_send_IPI_one,
.send_IPI_mask = uv_send_IPI_mask,
.send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself,
.send_IPI_allbutself = uv_send_IPI_allbutself,
.send_IPI_all = uv_send_IPI_all,
- .send_IPI_self = uv_send_IPI_self,
+ .send_IPI_self = x2apic_send_IPI_self,
.wakeup_secondary_cpu = uv_wakeup_secondary,
- .inquire_remote_apic = NULL,
.read = native_apic_msr_read,
.write = native_apic_msr_write,
- .eoi_write = native_apic_msr_eoi_write,
+ .eoi = native_apic_msr_eoi,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
- .wait_icr_idle = native_x2apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
};
-static void set_x2apic_extra_bits(int pnode)
-{
- __this_cpu_write(x2apic_extra_bits, pnode << uvh_apicid.s.pnode_shift);
-}
-
#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH 3
-#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT
+#define DEST_SHIFT UVXH_RH_GAM_ALIAS_0_REDIRECT_CONFIG_DEST_BASE_SHFT
static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
{
- union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias;
- union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect;
+ union uvh_rh_gam_alias_2_overlay_config_u alias;
+ union uvh_rh_gam_alias_2_redirect_config_u redirect;
unsigned long m_redirect;
unsigned long m_overlay;
int i;
@@ -725,16 +797,16 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
for (i = 0; i < UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH; i++) {
switch (i) {
case 0:
- m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR;
- m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR;
+ m_redirect = UVH_RH_GAM_ALIAS_0_REDIRECT_CONFIG;
+ m_overlay = UVH_RH_GAM_ALIAS_0_OVERLAY_CONFIG;
break;
case 1:
- m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR;
- m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR;
+ m_redirect = UVH_RH_GAM_ALIAS_1_REDIRECT_CONFIG;
+ m_overlay = UVH_RH_GAM_ALIAS_1_OVERLAY_CONFIG;
break;
case 2:
- m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR;
- m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR;
+ m_redirect = UVH_RH_GAM_ALIAS_2_REDIRECT_CONFIG;
+ m_overlay = UVH_RH_GAM_ALIAS_2_OVERLAY_CONFIG;
break;
}
alias.v = uv_read_local_mmr(m_overlay);
@@ -749,6 +821,7 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
}
enum map_type {map_wb, map_uc};
+static const char * const mt[] = { "WB", "UC" };
static __init void map_high(char *id, unsigned long base, int pshift, int bshift, int max_pnode, enum map_type map_type)
{
@@ -760,65 +833,36 @@ static __init void map_high(char *id, unsigned long base, int pshift, int bshift
pr_info("UV: Map %s_HI base address NULL\n", id);
return;
}
- pr_debug("UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, paddr + bytes);
if (map_type == map_uc)
init_extra_mapping_uc(paddr, bytes);
else
init_extra_mapping_wb(paddr, bytes);
-}
-
-static __init void map_gru_distributed(unsigned long c)
-{
- union uvh_rh_gam_gru_overlay_config_mmr_u gru;
- u64 paddr;
- unsigned long bytes;
- int nid;
-
- gru.v = c;
-
- /* Only base bits 42:28 relevant in dist mode */
- gru_dist_base = gru.v & 0x000007fff0000000UL;
- if (!gru_dist_base) {
- pr_info("UV: Map GRU_DIST base address NULL\n");
- return;
- }
-
- bytes = 1UL << UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
- gru_dist_lmask = ((1UL << uv_hub_info->m_val) - 1) & ~(bytes - 1);
- gru_dist_umask = ~((1UL << uv_hub_info->m_val) - 1);
- gru_dist_base &= gru_dist_lmask; /* Clear bits above M */
-
- for_each_online_node(nid) {
- paddr = ((u64)uv_node_to_pnode(nid) << uv_hub_info->m_val) |
- gru_dist_base;
- init_extra_mapping_wb(paddr, bytes);
- gru_first_node_paddr = min(paddr, gru_first_node_paddr);
- gru_last_node_paddr = max(paddr, gru_last_node_paddr);
- }
-
- /* Save upper (63:M) bits of address only for is_GRU_range */
- gru_first_node_paddr &= gru_dist_umask;
- gru_last_node_paddr &= gru_dist_umask;
- pr_debug("UV: Map GRU_DIST base 0x%016llx 0x%016llx - 0x%016llx\n", gru_dist_base, gru_first_node_paddr, gru_last_node_paddr);
+ pr_info("UV: Map %s_HI 0x%lx - 0x%lx %s (%d segments)\n",
+ id, paddr, paddr + bytes, mt[map_type], max_pnode + 1);
}
static __init void map_gru_high(int max_pnode)
{
- union uvh_rh_gam_gru_overlay_config_mmr_u gru;
- int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
- unsigned long mask = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK;
- unsigned long base;
-
- gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
- if (!gru.s.enable) {
- pr_info("UV: GRU disabled\n");
+ union uvh_rh_gam_gru_overlay_config_u gru;
+ unsigned long mask, base;
+ int shift;
+
+ if (UVH_RH_GAM_GRU_OVERLAY_CONFIG) {
+ gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG);
+ shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT;
+ mask = UVH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_MASK;
+ } else if (UVH_RH10_GAM_GRU_OVERLAY_CONFIG) {
+ gru.v = uv_read_local_mmr(UVH_RH10_GAM_GRU_OVERLAY_CONFIG);
+ shift = UVH_RH10_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT;
+ mask = UVH_RH10_GAM_GRU_OVERLAY_CONFIG_BASE_MASK;
+ } else {
+ pr_err("UV: GRU unavailable (no MMR)\n");
return;
}
- /* Only UV3 has distributed GRU mode */
- if (is_uv3_hub() && gru.s3.mode) {
- map_gru_distributed(gru.v);
+ if (!gru.s.enable) {
+ pr_info("UV: GRU disabled (by BIOS)\n");
return;
}
@@ -830,62 +874,104 @@ static __init void map_gru_high(int max_pnode)
static __init void map_mmr_high(int max_pnode)
{
- union uvh_rh_gam_mmr_overlay_config_mmr_u mmr;
- int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT;
+ unsigned long base;
+ int shift;
+ bool enable;
+
+ if (UVH_RH10_GAM_MMR_OVERLAY_CONFIG) {
+ union uvh_rh10_gam_mmr_overlay_config_u mmr;
+
+ mmr.v = uv_read_local_mmr(UVH_RH10_GAM_MMR_OVERLAY_CONFIG);
+ enable = mmr.s.enable;
+ base = mmr.s.base;
+ shift = UVH_RH10_GAM_MMR_OVERLAY_CONFIG_BASE_SHFT;
+ } else if (UVH_RH_GAM_MMR_OVERLAY_CONFIG) {
+ union uvh_rh_gam_mmr_overlay_config_u mmr;
+
+ mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG);
+ enable = mmr.s.enable;
+ base = mmr.s.base;
+ shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_BASE_SHFT;
+ } else {
+ pr_err("UV:%s:RH_GAM_MMR_OVERLAY_CONFIG MMR undefined?\n",
+ __func__);
+ return;
+ }
- mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
- if (mmr.s.enable)
- map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc);
+ if (enable)
+ map_high("MMR", base, shift, shift, max_pnode, map_uc);
else
pr_info("UV: MMR disabled\n");
}
-/* UV3/4 have identical MMIOH overlay configs, UV4A is slightly different */
-static __init void map_mmioh_high_uv34(int index, int min_pnode, int max_pnode)
+/* Arch specific ENUM cases */
+enum mmioh_arch {
+ UV2_MMIOH = -1,
+ UVY_MMIOH0, UVY_MMIOH1,
+ UVX_MMIOH0, UVX_MMIOH1,
+};
+
+/* Calculate and Map MMIOH Regions */
+static void __init calc_mmioh_map(enum mmioh_arch index,
+ int min_pnode, int max_pnode,
+ int shift, unsigned long base, int m_io, int n_io)
{
- unsigned long overlay;
- unsigned long mmr;
- unsigned long base;
- unsigned long nasid_mask;
- unsigned long m_overlay;
- int i, n, shift, m_io, max_io;
- int nasid, lnasid, fi, li;
- char *id;
-
- if (index == 0) {
- id = "MMIOH0";
- m_overlay = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR;
- overlay = uv_read_local_mmr(m_overlay);
- base = overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK;
- mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR;
- m_io = (overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK)
- >> UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT;
- shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT;
- n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH;
- nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK;
- } else {
- id = "MMIOH1";
- m_overlay = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR;
- overlay = uv_read_local_mmr(m_overlay);
- base = overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK;
- mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR;
- m_io = (overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK)
- >> UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT;
- shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT;
- n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH;
- nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK;
+ unsigned long mmr, nasid_mask;
+ int nasid, min_nasid, max_nasid, lnasid, mapped;
+ int i, fi, li, n, max_io;
+ char id[8];
+
+ /* One (UV2) mapping */
+ if (index == UV2_MMIOH) {
+ strscpy(id, "MMIOH", sizeof(id));
+ max_io = max_pnode;
+ mapped = 0;
+ goto map_exit;
}
- pr_info("UV: %s overlay 0x%lx base:0x%lx m_io:%d\n", id, overlay, base, m_io);
- if (!(overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK)) {
- pr_info("UV: %s disabled\n", id);
+
+ /* small and large MMIOH mappings */
+ switch (index) {
+ case UVY_MMIOH0:
+ mmr = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0;
+ nasid_mask = UVYH_RH10_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK;
+ n = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0_DEPTH;
+ min_nasid = min_pnode;
+ max_nasid = max_pnode;
+ mapped = 1;
+ break;
+ case UVY_MMIOH1:
+ mmr = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1;
+ nasid_mask = UVYH_RH10_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK;
+ n = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1_DEPTH;
+ min_nasid = min_pnode;
+ max_nasid = max_pnode;
+ mapped = 1;
+ break;
+ case UVX_MMIOH0:
+ mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0;
+ nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK;
+ n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_DEPTH;
+ min_nasid = min_pnode * 2;
+ max_nasid = max_pnode * 2;
+ mapped = 1;
+ break;
+ case UVX_MMIOH1:
+ mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1;
+ nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK;
+ n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_DEPTH;
+ min_nasid = min_pnode * 2;
+ max_nasid = max_pnode * 2;
+ mapped = 1;
+ break;
+ default:
+ pr_err("UV:%s:Invalid mapping type:%d\n", __func__, index);
return;
}
- /* Convert to NASID: */
- min_pnode *= 2;
- max_pnode *= 2;
- max_io = lnasid = fi = li = -1;
+ /* enum values chosen so (index mod 2) is MMIOH 0/1 (low/high) */
+ snprintf(id, sizeof(id), "MMIOH%d", index%2);
+ max_io = lnasid = fi = li = -1;
for (i = 0; i < n; i++) {
unsigned long m_redirect = mmr + i * 8;
unsigned long redirect = uv_read_local_mmr(m_redirect);
@@ -895,9 +981,13 @@ static __init void map_mmioh_high_uv34(int index, int min_pnode, int max_pnode)
pr_info("UV: %s redirect base 0x%lx(@0x%lx) 0x%04x\n",
id, redirect, m_redirect, nasid);
- /* Invalid NASID: */
- if (nasid < min_pnode || max_pnode < nasid)
+ /* Invalid NASID check */
+ if (nasid < min_nasid || max_nasid < nasid) {
+ /* Not an error: unused table entries get "poison" values */
+ pr_debug("UV:%s:Invalid NASID(%x):%x (range:%x..%x)\n",
+ __func__, index, nasid, min_nasid, max_nasid);
nasid = -1;
+ }
if (nasid == lnasid) {
li = i;
@@ -920,7 +1010,8 @@ static __init void map_mmioh_high_uv34(int index, int min_pnode, int max_pnode)
}
addr1 = (base << shift) + f * (1ULL << m_io);
addr2 = (base << shift) + (l + 1) * (1ULL << m_io);
- pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n", id, fi, li, lnasid, addr1, addr2);
+ pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n",
+ id, fi, li, lnasid, addr1, addr2);
if (max_io < l)
max_io = l;
}
@@ -928,58 +1019,93 @@ static __init void map_mmioh_high_uv34(int index, int min_pnode, int max_pnode)
lnasid = nasid;
}
- pr_info("UV: %s base:0x%lx shift:%d M_IO:%d MAX_IO:%d\n", id, base, shift, m_io, max_io);
+map_exit:
+ pr_info("UV: %s base:0x%lx shift:%d m_io:%d max_io:%d max_pnode:0x%x\n",
+ id, base, shift, m_io, max_io, max_pnode);
- if (max_io >= 0)
+ if (max_io >= 0 && !mapped)
map_high(id, base, shift, m_io, max_io, map_uc);
}
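
A note on the redirect loop above: consecutive redirect entries carrying
the same NASID are coalesced into one [fi..li] run before being mapped,
each run spanning (li - fi + 1) << m_io bytes starting at base << shift.
For example, redirect NASIDs {0, 0, 2, 2} would produce two mapped ranges:

	MMIOH0[000..001] NASID 0x0000
	MMIOH0[002..003] NASID 0x0002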
static __init void map_mmioh_high(int min_pnode, int max_pnode)
{
- union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
- unsigned long mmr, base;
- int shift, enable, m_io, n_io;
-
- if (is_uv3_hub() || is_uv4_hub()) {
- /* Map both MMIOH regions: */
- map_mmioh_high_uv34(0, min_pnode, max_pnode);
- map_mmioh_high_uv34(1, min_pnode, max_pnode);
+ /* UVY flavor */
+ if (UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0) {
+ union uvh_rh10_gam_mmioh_overlay_config0_u mmioh0;
+ union uvh_rh10_gam_mmioh_overlay_config1_u mmioh1;
+
+ mmioh0.v = uv_read_local_mmr(UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0);
+ if (unlikely(mmioh0.s.enable == 0))
+ pr_info("UV: MMIOH0 disabled\n");
+ else
+ calc_mmioh_map(UVY_MMIOH0, min_pnode, max_pnode,
+ UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT,
+ mmioh0.s.base, mmioh0.s.m_io, mmioh0.s.n_io);
+
+ mmioh1.v = uv_read_local_mmr(UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1);
+ if (unlikely(mmioh1.s.enable == 0))
+ pr_info("UV: MMIOH1 disabled\n");
+ else
+ calc_mmioh_map(UVY_MMIOH1, min_pnode, max_pnode,
+ UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT,
+ mmioh1.s.base, mmioh1.s.m_io, mmioh1.s.n_io);
return;
}
+ /* UVX flavor */
+ if (UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0) {
+ union uvh_rh_gam_mmioh_overlay_config0_u mmioh0;
+ union uvh_rh_gam_mmioh_overlay_config1_u mmioh1;
+
+ mmioh0.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0);
+ if (unlikely(mmioh0.s.enable == 0))
+ pr_info("UV: MMIOH0 disabled\n");
+ else {
+ unsigned long base = uvxy_field(mmioh0, base, 0);
+ int m_io = uvxy_field(mmioh0, m_io, 0);
+ int n_io = uvxy_field(mmioh0, n_io, 0);
+
+ calc_mmioh_map(UVX_MMIOH0, min_pnode, max_pnode,
+ UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT,
+ base, m_io, n_io);
+ }
- if (is_uv1_hub()) {
- mmr = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR;
- shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
- mmioh.v = uv_read_local_mmr(mmr);
- enable = !!mmioh.s1.enable;
- base = mmioh.s1.base;
- m_io = mmioh.s1.m_io;
- n_io = mmioh.s1.n_io;
- } else if (is_uv2_hub()) {
- mmr = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR;
- shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
- mmioh.v = uv_read_local_mmr(mmr);
- enable = !!mmioh.s2.enable;
- base = mmioh.s2.base;
- m_io = mmioh.s2.m_io;
- n_io = mmioh.s2.n_io;
- } else {
+ mmioh1.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1);
+ if (unlikely(mmioh1.s.enable == 0))
+ pr_info("UV: MMIOH1 disabled\n");
+ else {
+ unsigned long base = uvxy_field(mmioh1, base, 0);
+ int m_io = uvxy_field(mmioh1, m_io, 0);
+ int n_io = uvxy_field(mmioh1, n_io, 0);
+
+ calc_mmioh_map(UVX_MMIOH1, min_pnode, max_pnode,
+ UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT,
+ base, m_io, n_io);
+ }
return;
}
- if (enable) {
- max_pnode &= (1 << n_io) - 1;
- pr_info("UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n", base, shift, m_io, n_io, max_pnode);
- map_high("MMIOH", base, shift, m_io, max_pnode, map_uc);
- } else {
- pr_info("UV: MMIOH disabled\n");
+ /* UV2 flavor */
+ if (UVH_RH_GAM_MMIOH_OVERLAY_CONFIG) {
+ union uvh_rh_gam_mmioh_overlay_config_u mmioh;
+
+ mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG);
+ if (unlikely(mmioh.s2.enable == 0))
+ pr_info("UV: MMIOH disabled\n");
+ else
+ calc_mmioh_map(UV2_MMIOH, min_pnode, max_pnode,
+ UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_BASE_SHFT,
+ mmioh.s2.base, mmioh.s2.m_io, mmioh.s2.n_io);
+ return;
}
}
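
Note on the flavor probes above: per the uv_mmrs.h definitions, the
UVH_*_OVERLAY_CONFIG* name constants evaluate to the register offset on
hub types that implement the register and to 0 otherwise, so the plain
if (CONST) tests select the right layout at runtime, checked newest
first: UVY (UV5+), then UVX (UV3/4), then UV2. Illustratively:

	/* e.g. on a UV2 hub: */
	/* UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0 == 0 (UVY reg absent)  */
	/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG    != 0 (UV2 reg present) */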
static __init void map_low_mmrs(void)
{
- init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
- init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
+ if (UV_GLOBAL_MMR32_BASE)
+ init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
+
+ if (UV_LOCAL_MMR_BASE)
+ init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
}
static __init void uv_rtc_init(void)
@@ -999,85 +1125,6 @@ static __init void uv_rtc_init(void)
}
}
-/*
- * percpu heartbeat timer
- */
-static void uv_heartbeat(struct timer_list *timer)
-{
- unsigned char bits = uv_scir_info->state;
-
- /* Flip heartbeat bit: */
- bits ^= SCIR_CPU_HEARTBEAT;
-
- /* Is this CPU idle? */
- if (idle_cpu(raw_smp_processor_id()))
- bits &= ~SCIR_CPU_ACTIVITY;
- else
- bits |= SCIR_CPU_ACTIVITY;
-
- /* Update system controller interface reg: */
- uv_set_scir_bits(bits);
-
- /* Enable next timer period: */
- mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL);
-}
-
-static int uv_heartbeat_enable(unsigned int cpu)
-{
- while (!uv_cpu_scir_info(cpu)->enabled) {
- struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer;
-
- uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
- timer_setup(timer, uv_heartbeat, TIMER_PINNED);
- timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
- add_timer_on(timer, cpu);
- uv_cpu_scir_info(cpu)->enabled = 1;
-
- /* Also ensure that boot CPU is enabled: */
- cpu = 0;
- }
- return 0;
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-static int uv_heartbeat_disable(unsigned int cpu)
-{
- if (uv_cpu_scir_info(cpu)->enabled) {
- uv_cpu_scir_info(cpu)->enabled = 0;
- del_timer(&uv_cpu_scir_info(cpu)->timer);
- }
- uv_set_cpu_scir_bits(cpu, 0xff);
- return 0;
-}
-
-static __init void uv_scir_register_cpu_notifier(void)
-{
- cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/x2apic-uvx:online",
- uv_heartbeat_enable, uv_heartbeat_disable);
-}
-
-#else /* !CONFIG_HOTPLUG_CPU */
-
-static __init void uv_scir_register_cpu_notifier(void)
-{
-}
-
-static __init int uv_init_heartbeat(void)
-{
- int cpu;
-
- if (is_uv_system()) {
- for_each_online_cpu(cpu)
- uv_heartbeat_enable(cpu);
- }
-
- return 0;
-}
-
-late_initcall(uv_init_heartbeat);
-
-#endif /* !CONFIG_HOTPLUG_CPU */
-
/* Direct Legacy VGA I/O traffic to designated IOH */
static int uv_set_vga_state(struct pci_dev *pdev, bool decode, unsigned int command_bits, u32 flags)
{
@@ -1108,9 +1155,6 @@ void uv_cpu_init(void)
return;
uv_hub_info->nr_online_cpus++;
-
- if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
- set_x2apic_extra_bits(uv_hub_info->pnode);
}
struct mn {
@@ -1120,37 +1164,29 @@ struct mn {
unsigned char n_lshift;
};
+/* Initialize caller's MN struct and fill in values */
static void get_mn(struct mn *mnp)
{
- union uvh_rh_gam_config_mmr_u m_n_config;
- union uv3h_gr0_gam_gr_config_u m_gr_config;
-
- /* Make sure the whole structure is well initialized: */
memset(mnp, 0, sizeof(*mnp));
-
- m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR);
- mnp->n_val = m_n_config.s.n_skt;
-
- if (is_uv4_hub()) {
+ mnp->n_val = uv_cpuid.n_skt;
+ if (is_uv(UV4|UVY)) {
mnp->m_val = 0;
mnp->n_lshift = 0;
} else if (is_uv3_hub()) {
- mnp->m_val = m_n_config.s3.m_skt;
- m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG);
+ union uvyh_gr0_gam_gr_config_u m_gr_config;
+
+ mnp->m_val = uv_cpuid.m_skt;
+ m_gr_config.v = uv_read_local_mmr(UVH_GR0_GAM_GR_CONFIG);
mnp->n_lshift = m_gr_config.s3.m_skt;
} else if (is_uv2_hub()) {
- mnp->m_val = m_n_config.s2.m_skt;
+ mnp->m_val = uv_cpuid.m_skt;
mnp->n_lshift = mnp->m_val == 40 ? 40 : 39;
- } else if (is_uv1_hub()) {
- mnp->m_val = m_n_config.s1.m_skt;
- mnp->n_lshift = mnp->m_val;
}
mnp->m_shift = mnp->m_val ? 64 - mnp->m_val : 0;
}
static void __init uv_init_hub_info(struct uv_hub_info_s *hi)
{
- union uvh_node_id_u node_id;
struct mn mn;
get_mn(&mn);
@@ -1163,18 +1199,20 @@ static void __init uv_init_hub_info(struct uv_hub_info_s *hi)
hi->m_shift = mn.m_shift;
hi->n_lshift = mn.n_lshift ? mn.n_lshift : 0;
hi->hub_revision = uv_hub_info->hub_revision;
+ hi->hub_type = uv_hub_info->hub_type;
hi->pnode_mask = uv_cpuid.pnode_mask;
+ hi->nasid_shift = uv_cpuid.nasid_shift;
hi->min_pnode = _min_pnode;
hi->min_socket = _min_socket;
+ hi->node_to_socket = _node_to_socket;
hi->pnode_to_socket = _pnode_to_socket;
hi->socket_to_node = _socket_to_node;
hi->socket_to_pnode = _socket_to_pnode;
hi->gr_table_len = _gr_table_len;
hi->gr_table = _gr_table;
- node_id.v = uv_read_local_mmr(UVH_NODE_ID);
uv_cpuid.gnode_shift = max_t(unsigned int, uv_cpuid.gnode_shift, mn.n_val);
- hi->gnode_extra = (node_id.s.node_id & ~((1 << uv_cpuid.gnode_shift) - 1)) >> 1;
+ hi->gnode_extra = (uv_node_id & ~((1 << uv_cpuid.gnode_shift) - 1)) >> 1;
if (mn.m_val)
hi->gnode_upper = (u64)hi->gnode_extra << mn.m_val;
@@ -1186,7 +1224,9 @@ static void __init uv_init_hub_info(struct uv_hub_info_s *hi)
hi->gpa_shift = uv_gp_table->gpa_shift;
hi->gpa_mask = (1UL << hi->gpa_shift) - 1;
} else {
- hi->global_mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE;
+ hi->global_mmr_base =
+ uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG) &
+ ~UV_MMR_ENABLE;
hi->global_mmr_shift = _UV_GLOBAL_MMR64_PNODE_SHIFT;
}
@@ -1197,7 +1237,11 @@ static void __init uv_init_hub_info(struct uv_hub_info_s *hi)
/* Show system specific info: */
pr_info("UV: N:%d M:%d m_shift:%d n_lshift:%d\n", hi->n_val, hi->m_val, hi->m_shift, hi->n_lshift);
pr_info("UV: gpa_mask/shift:0x%lx/%d pnode_mask:0x%x apic_pns:%d\n", hi->gpa_mask, hi->gpa_shift, hi->pnode_mask, hi->apic_pnode_shift);
- pr_info("UV: mmr_base/shift:0x%lx/%ld gru_base/shift:0x%lx/%ld\n", hi->global_mmr_base, hi->global_mmr_shift, hi->global_gru_base, hi->global_gru_shift);
+ pr_info("UV: mmr_base/shift:0x%lx/%ld\n", hi->global_mmr_base, hi->global_mmr_shift);
+ if (hi->global_gru_base)
+ pr_info("UV: gru_base/shift:0x%lx/%ld\n",
+ hi->global_gru_base, hi->global_gru_shift);
+
pr_info("UV: gnode_upper:0x%lx gnode_extra:0x%x\n", hi->gnode_upper, hi->gnode_extra);
}
@@ -1215,9 +1259,9 @@ static void __init decode_gam_params(unsigned long ptr)
static void __init decode_gam_rng_tbl(unsigned long ptr)
{
struct uv_gam_range_entry *gre = (struct uv_gam_range_entry *)ptr;
- unsigned long lgre = 0;
+ unsigned long lgre = 0, gend = 0;
int index = 0;
- int sock_min = 999999, pnode_min = 99999;
+ int sock_min = INT_MAX, pnode_min = INT_MAX;
int sock_max = -1, pnode_max = -1;
uv_gre_table = gre;
@@ -1249,6 +1293,9 @@ static void __init decode_gam_rng_tbl(unsigned long ptr)
flag, size, suffix[order],
gre->type, gre->nasid, gre->sockid, gre->pnode);
+ if (gre->type == UV_GAM_RANGE_TYPE_HOLE)
+ gend = (unsigned long)gre->limit << UV_GAM_RANGE_SHFT;
+
/* update to next range start */
lgre = gre->limit;
if (sock_min > gre->sockid)
@@ -1266,23 +1313,29 @@ static void __init decode_gam_rng_tbl(unsigned long ptr)
_max_pnode = pnode_max;
_gr_table_len = index;
- pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x) pnodes(min:%x,max:%x)\n", index, _min_socket, _max_socket, _min_pnode, _max_pnode);
+ pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x), pnodes(min:%x,max:%x), gap_end(%d)\n",
+ index, _min_socket, _max_socket, _min_pnode, _max_pnode, fls64(gend));
}
+/* Walk through UVsystab decoding the fields */
static int __init decode_uv_systab(void)
{
struct uv_systab *st;
int i;
- if (uv_hub_info->hub_revision < UV4_HUB_REVISION_BASE)
- return 0; /* No extended UVsystab required */
-
+ /* Get mapped UVsystab pointer */
st = uv_systab;
+
+ /* If UVsystab is version 1, there is no extended UVsystab */
+ if (st && st->revision == UV_SYSTAB_VERSION_1)
+ return 0;
+
if ((!st) || (st->revision < UV_SYSTAB_VERSION_UV4_LATEST)) {
int rev = st ? st->revision : 0;
- pr_err("UV: BIOS UVsystab version(%x) mismatch, expecting(%x)\n", rev, UV_SYSTAB_VERSION_UV4_LATEST);
- pr_err("UV: Cannot support UV operations, switching to generic PC\n");
+ pr_err("UV: BIOS UVsystab mismatch, (%x < %x)\n",
+ rev, UV_SYSTAB_VERSION_UV4_LATEST);
+ pr_err("UV: Does not support UV, switch to non-UV x86_64\n");
uv_system_type = UV_NONE;
return -EINVAL;
@@ -1294,7 +1347,8 @@ static int __init decode_uv_systab(void)
if (!ptr)
continue;
- ptr = ptr + (unsigned long)st;
+ /* point to payload */
+ ptr += (unsigned long)st;
switch (st->entry[i].type) {
case UV_SYSTAB_TYPE_GAM_PARAMS:
@@ -1304,84 +1358,171 @@ static int __init decode_uv_systab(void)
case UV_SYSTAB_TYPE_GAM_RNG_TBL:
decode_gam_rng_tbl(ptr);
break;
+
+ case UV_SYSTAB_TYPE_ARCH_TYPE:
+ /* already processed in early startup */
+ break;
+
+ default:
+ pr_err("UV:%s:Unrecognized UV_SYSTAB_TYPE:%d, skipped\n",
+ __func__, st->entry[i].type);
+ break;
}
}
return 0;
}
/*
- * Set up physical blade translations from UVH_NODE_PRESENT_TABLE
- * .. NB: UVH_NODE_PRESENT_TABLE is going away,
- * .. being replaced by GAM Range Table
+ * Given a bitmask 'bits' representing present blades, numbered
+ * starting at 'base', masking off unused high bits of blade number
+ * with 'mask', update the minimum and maximum blade numbers that we
+ * have found.  (Masking with 'mask' is necessary because of how the
+ * BIOS treats system partitioning when creating the table we are
+ * interpreting.)
*/
+static inline void blade_update_min_max(unsigned long bits, int base, int mask, int *min, int *max)
+{
+ int first, last;
+
+ if (!bits)
+ return;
+ first = (base + __ffs(bits)) & mask;
+ last = (base + __fls(bits)) & mask;
+
+ if (*min > first)
+ *min = first;
+ if (*max < last)
+ *max = last;
+}
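
A worked example (values illustrative): with bits 0x18 read from
NODE_PRESENT_1 (so base is 64) and an s_mask of 0x7f:

	int min = INT_MAX, max = -1;

	/* __ffs(0x18) == 3, __fls(0x18) == 4: blades 67 and 68 present */
	blade_update_min_max(0x18UL, 64, 0x7f, &min, &max);
	/* now min == 67, max == 68 */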
+
+/* Set up physical blade translations from UVH_NODE_PRESENT_TABLE */
static __init void boot_init_possible_blades(struct uv_hub_info_s *hub_info)
{
+ unsigned long np;
int i, uv_pb = 0;
+ int sock_min = INT_MAX, sock_max = -1, s_mask;
- pr_info("UV: NODE_PRESENT_DEPTH = %d\n", UVH_NODE_PRESENT_TABLE_DEPTH);
- for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) {
- unsigned long np;
+ s_mask = (1 << uv_cpuid.n_skt) - 1;
- np = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8);
- if (np)
+ if (UVH_NODE_PRESENT_TABLE) {
+ pr_info("UV: NODE_PRESENT_DEPTH = %d\n",
+ UVH_NODE_PRESENT_TABLE_DEPTH);
+ for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) {
+ np = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8);
pr_info("UV: NODE_PRESENT(%d) = 0x%016lx\n", i, np);
+ blade_update_min_max(np, i * 64, s_mask, &sock_min, &sock_max);
+ }
+ }
+ if (UVH_NODE_PRESENT_0) {
+ np = uv_read_local_mmr(UVH_NODE_PRESENT_0);
+ pr_info("UV: NODE_PRESENT_0 = 0x%016lx\n", np);
+ blade_update_min_max(np, 0, s_mask, &sock_min, &sock_max);
+ }
+ if (UVH_NODE_PRESENT_1) {
+ np = uv_read_local_mmr(UVH_NODE_PRESENT_1);
+ pr_info("UV: NODE_PRESENT_1 = 0x%016lx\n", np);
+ blade_update_min_max(np, 64, s_mask, &sock_min, &sock_max);
+ }
- uv_pb += hweight64(np);
+ /* Only update if we actually found some bits indicating blades present */
+ if (sock_max >= sock_min) {
+ _min_socket = sock_min;
+ _max_socket = sock_max;
+ uv_pb = sock_max - sock_min + 1;
}
if (uv_possible_blades != uv_pb)
uv_possible_blades = uv_pb;
+
+ pr_info("UV: number nodes/possible blades %d (%d - %d)\n",
+ uv_pb, sock_min, sock_max);
+}
+
+static int __init alloc_conv_table(int num_elem, unsigned short **table)
+{
+ int i;
+ size_t bytes;
+
+ bytes = num_elem * sizeof(*table[0]);
+ *table = kmalloc(bytes, GFP_KERNEL);
+ if (WARN_ON_ONCE(!*table))
+ return -ENOMEM;
+ for (i = 0; i < num_elem; i++)
+ ((unsigned short *)*table)[i] = SOCK_EMPTY;
+ return 0;
}
+/* Remove conversion table if it's 1:1 */
+#define FREE_1_TO_1_TABLE(tbl, min, max, max2) free_1_to_1_table(&tbl, #tbl, min, max, max2)
+
+static void __init free_1_to_1_table(unsigned short **tp, char *tname, int min, int max, int max2)
+{
+ int i;
+ unsigned short *table = *tp;
+
+ if (table == NULL)
+ return;
+ if (max != max2)
+ return;
+ for (i = 0; i < max; i++) {
+ if (i != table[i])
+ return;
+ }
+ kfree(table);
+ *tp = NULL;
+ pr_info("UV: %s is 1:1, conversion table removed\n", tname);
+}
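
Example: a conversion table is dropped only when it is a pure identity
map over the full range (table[i] == i for all i, and max == max2). The
lookup helpers are then expected to fall back to identity when the
pointer is NULL, along these lines (a sketch, not the actual uv_hub.h
helpers):

	static inline int conv(unsigned short *table, int i)
	{
		return table ? table[i] : i;	/* identity if removed */
	}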
+
+/*
+ * Build Socket Tables
+ * If there is more than one node per socket, the socket-to-node table
+ * will contain the lowest node number on that socket.
+ */
static void __init build_socket_tables(void)
{
struct uv_gam_range_entry *gre = uv_gre_table;
- int num, nump;
- int cpu, i, lnid;
+ int nums, numn, nump;
+ int i, lnid, apicid;
int minsock = _min_socket;
int maxsock = _max_socket;
int minpnode = _min_pnode;
int maxpnode = _max_pnode;
- size_t bytes;
if (!gre) {
- if (is_uv1_hub() || is_uv2_hub() || is_uv3_hub()) {
+ if (is_uv2_hub() || is_uv3_hub()) {
pr_info("UV: No UVsystab socket table, ignoring\n");
return;
}
- pr_crit("UV: Error: UVsystab address translations not available!\n");
- BUG();
+ pr_err("UV: Error: UVsystab address translations not available!\n");
+ WARN_ON_ONCE(!gre);
+ return;
}
- /* Build socket id -> node id, pnode */
- num = maxsock - minsock + 1;
- bytes = num * sizeof(_socket_to_node[0]);
- _socket_to_node = kmalloc(bytes, GFP_KERNEL);
- _socket_to_pnode = kmalloc(bytes, GFP_KERNEL);
-
+ numn = num_possible_nodes();
nump = maxpnode - minpnode + 1;
- bytes = nump * sizeof(_pnode_to_socket[0]);
- _pnode_to_socket = kmalloc(bytes, GFP_KERNEL);
- BUG_ON(!_socket_to_node || !_socket_to_pnode || !_pnode_to_socket);
-
- for (i = 0; i < num; i++)
- _socket_to_node[i] = _socket_to_pnode[i] = SOCK_EMPTY;
-
- for (i = 0; i < nump; i++)
- _pnode_to_socket[i] = SOCK_EMPTY;
+ nums = maxsock - minsock + 1;
+
+ /* Allocate and clear tables */
+ if ((alloc_conv_table(nump, &_pnode_to_socket) < 0)
+ || (alloc_conv_table(nums, &_socket_to_pnode) < 0)
+ || (alloc_conv_table(numn, &_node_to_socket) < 0)
+ || (alloc_conv_table(nums, &_socket_to_node) < 0)) {
+ kfree(_pnode_to_socket);
+ kfree(_socket_to_pnode);
+ kfree(_node_to_socket);
+ return;
+ }
/* Fill in pnode/node/addr conversion list values: */
- pr_info("UV: GAM Building socket/pnode conversion tables\n");
for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
if (gre->type == UV_GAM_RANGE_TYPE_HOLE)
continue;
i = gre->sockid - minsock;
- /* Duplicate: */
- if (_socket_to_pnode[i] != SOCK_EMPTY)
- continue;
- _socket_to_pnode[i] = gre->pnode;
+ if (_socket_to_pnode[i] == SOCK_EMPTY)
+ _socket_to_pnode[i] = gre->pnode;
i = gre->pnode - minpnode;
- _pnode_to_socket[i] = gre->sockid;
+ if (_pnode_to_socket[i] == SOCK_EMPTY)
+ _pnode_to_socket[i] = gre->sockid;
pr_info("UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n",
gre->sockid, gre->type, gre->nasid,
@@ -1390,78 +1531,129 @@ static void __init build_socket_tables(void)
}
/* Set socket -> node values: */
- lnid = -1;
- for_each_present_cpu(cpu) {
- int nid = cpu_to_node(cpu);
- int apicid, sockid;
+ lnid = NUMA_NO_NODE;
+ for (apicid = 0; apicid < ARRAY_SIZE(__apicid_to_node); apicid++) {
+ int nid = __apicid_to_node[apicid];
+ int sockid;
- if (lnid == nid)
+ if ((nid == NUMA_NO_NODE) || (lnid == nid))
continue;
lnid = nid;
- apicid = per_cpu(x86_cpu_to_apicid, cpu);
+
sockid = apicid >> uv_cpuid.socketid_shift;
- _socket_to_node[sockid - minsock] = nid;
- pr_info("UV: sid:%02x: apicid:%04x node:%2d\n",
- sockid, apicid, nid);
- }
- /* Set up physical blade to pnode translation from GAM Range Table: */
- bytes = num_possible_nodes() * sizeof(_node_to_pnode[0]);
- _node_to_pnode = kmalloc(bytes, GFP_KERNEL);
- BUG_ON(!_node_to_pnode);
+ if (_socket_to_node[sockid - minsock] == SOCK_EMPTY)
+ _socket_to_node[sockid - minsock] = nid;
- for (lnid = 0; lnid < num_possible_nodes(); lnid++) {
- unsigned short sockid;
+ if (_node_to_socket[nid] == SOCK_EMPTY)
+ _node_to_socket[nid] = sockid;
- for (sockid = minsock; sockid <= maxsock; sockid++) {
- if (lnid == _socket_to_node[sockid - minsock]) {
- _node_to_pnode[lnid] = _socket_to_pnode[sockid - minsock];
- break;
- }
- }
- if (sockid > maxsock) {
- pr_err("UV: socket for node %d not found!\n", lnid);
- BUG();
- }
+ pr_info("UV: sid:%02x: apicid:%04x socket:%02d node:%03x s2n:%03x\n",
+ sockid,
+ apicid,
+ _node_to_socket[nid],
+ nid,
+ _socket_to_node[sockid - minsock]);
}
/*
- * If socket id == pnode or socket id == node for all nodes,
+ * If e.g. socket id == pnode for all pnodes,
* system runs faster by removing corresponding conversion table.
*/
- pr_info("UV: Checking socket->node/pnode for identity maps\n");
- if (minsock == 0) {
- for (i = 0; i < num; i++)
- if (_socket_to_node[i] == SOCK_EMPTY || i != _socket_to_node[i])
- break;
- if (i >= num) {
- kfree(_socket_to_node);
- _socket_to_node = NULL;
- pr_info("UV: 1:1 socket_to_node table removed\n");
- }
- }
- if (minsock == minpnode) {
- for (i = 0; i < num; i++)
- if (_socket_to_pnode[i] != SOCK_EMPTY &&
- _socket_to_pnode[i] != i + minpnode)
- break;
- if (i >= num) {
- kfree(_socket_to_pnode);
- _socket_to_pnode = NULL;
- pr_info("UV: 1:1 socket_to_pnode table removed\n");
- }
- }
+ FREE_1_TO_1_TABLE(_socket_to_node, _min_socket, nums, numn);
+ FREE_1_TO_1_TABLE(_node_to_socket, _min_socket, nums, numn);
+ FREE_1_TO_1_TABLE(_socket_to_pnode, _min_pnode, nums, nump);
+ FREE_1_TO_1_TABLE(_pnode_to_socket, _min_pnode, nums, nump);
+}
+
+/* Check which reboot to use */
+static void check_efi_reboot(void)
+{
+ /* If EFI reboot not available, use ACPI reboot */
+ if (!efi_enabled(EFI_BOOT))
+ reboot_type = BOOT_ACPI;
+}
+
+/*
+ * User proc fs file handling now deprecated.
+ * Recommend using /sys/firmware/sgi_uv/... instead.
+ */
+static int __maybe_unused proc_hubbed_show(struct seq_file *file, void *data)
+{
+ pr_notice_once("%s: using deprecated /proc/sgi_uv/hubbed, use /sys/firmware/sgi_uv/hub_type\n",
+ current->comm);
+ seq_printf(file, "0x%x\n", uv_hubbed_system);
+ return 0;
+}
+
+static int __maybe_unused proc_hubless_show(struct seq_file *file, void *data)
+{
+ pr_notice_once("%s: using deprecated /proc/sgi_uv/hubless, use /sys/firmware/sgi_uv/hubless\n",
+ current->comm);
+ seq_printf(file, "0x%x\n", uv_hubless_system);
+ return 0;
+}
+
+static int __maybe_unused proc_archtype_show(struct seq_file *file, void *data)
+{
+ pr_notice_once("%s: using deprecated /proc/sgi_uv/archtype, use /sys/firmware/sgi_uv/archtype\n",
+ current->comm);
+ seq_printf(file, "%s/%s\n", uv_archtype, oem_table_id);
+ return 0;
+}
+
+static __init void uv_setup_proc_files(int hubless)
+{
+ struct proc_dir_entry *pde;
+
+ pde = proc_mkdir(UV_PROC_NODE, NULL);
+ proc_create_single("archtype", 0, pde, proc_archtype_show);
+ if (hubless)
+ proc_create_single("hubless", 0, pde, proc_hubless_show);
+ else
+ proc_create_single("hubbed", 0, pde, proc_hubbed_show);
+}
+
+/* Initialize UV hubless systems */
+static __init int uv_system_init_hubless(void)
+{
+ int rc;
+
+ /* Setup PCH NMI handler */
+ uv_nmi_setup_hubless();
+
+ /* Init kernel/BIOS interface */
+ rc = uv_bios_init();
+ if (rc < 0)
+ return rc;
+
+ /* Process UVsystab */
+ rc = decode_uv_systab();
+ if (rc < 0)
+ return rc;
+
+ /* Set section block size for current node memory */
+ set_block_size();
+
+ /* Create user access node */
+ if (rc >= 0)
+ uv_setup_proc_files(1);
+
+ check_efi_reboot();
+
+ return rc;
}
static void __init uv_system_init_hub(void)
{
struct uv_hub_info_s hub_info = {0};
- int bytes, cpu, nodeid;
- unsigned short min_pnode = 9999, max_pnode = 0;
- char *hub = is_uv4_hub() ? "UV400" :
+ int bytes, cpu, nodeid, bid;
+ unsigned short min_pnode = USHRT_MAX, max_pnode = 0;
+ char *hub = is_uv5_hub() ? "UV500" :
+ is_uv4_hub() ? "UV400" :
is_uv3_hub() ? "UV300" :
- is_uv2_hub() ? "UV2000/3000" :
- is_uv1_hub() ? "UV100/1000" : NULL;
+ is_uv2_hub() ? "UV2000/3000" : NULL;
+ struct uv_hub_info_s **uv_hub_info_list_blade;
if (!hub) {
pr_err("UV: Unknown/unsupported UV hub\n");
@@ -1471,20 +1663,25 @@ static void __init uv_system_init_hub(void)
map_low_mmrs();
- /* Get uv_systab for decoding: */
+ /* Get uv_systab for decoding, setup UV BIOS calls */
uv_bios_init();
/* If there's an UVsystab problem then abort UV init: */
- if (decode_uv_systab() < 0)
+ if (decode_uv_systab() < 0) {
+ pr_err("UV: Mangled UVsystab format\n");
return;
+ }
build_socket_tables();
build_uv_gr_table();
set_block_size();
uv_init_hub_info(&hub_info);
- uv_possible_blades = num_possible_nodes();
- if (!_node_to_pnode)
+ /* If UV2 or UV3 may need to get # blades from HW */
+ if (is_uv(UV2|UV3) && !uv_gre_table)
boot_init_possible_blades(&hub_info);
+ else
+ /* min/max sockets set in decode_gam_rng_tbl */
+ uv_possible_blades = (_max_socket - _min_socket) + 1;
/* uv_num_possible_blades() is really the hub count: */
pr_info("UV: Found %d hubs, %d nodes, %d CPUs\n", uv_num_possible_blades(), num_possible_nodes(), num_possible_cpus());
@@ -1493,81 +1690,98 @@ static void __init uv_system_init_hub(void)
hub_info.coherency_domain_number = sn_coherency_id;
uv_rtc_init();
+ /*
+ * __uv_hub_info_list[] is indexed by node, but there is only
+ * one hub_info structure per blade. First, allocate one
+ * structure per blade. Further down we create a per-node
+ * table (__uv_hub_info_list[]) pointing to hub_info
+ * structures for the correct blade.
+ */
+
bytes = sizeof(void *) * uv_num_possible_blades();
- __uv_hub_info_list = kzalloc(bytes, GFP_KERNEL);
- BUG_ON(!__uv_hub_info_list);
+ uv_hub_info_list_blade = kzalloc(bytes, GFP_KERNEL);
+ if (WARN_ON_ONCE(!uv_hub_info_list_blade))
+ return;
bytes = sizeof(struct uv_hub_info_s);
- for_each_node(nodeid) {
+ for_each_possible_blade(bid) {
struct uv_hub_info_s *new_hub;
- if (__uv_hub_info_list[nodeid]) {
- pr_err("UV: Node %d UV HUB already initialized!?\n", nodeid);
- BUG();
+ /* Allocate & fill new per hub info list */
+ new_hub = (bid == 0) ? &uv_hub_info_node0
+ : kzalloc_node(bytes, GFP_KERNEL, uv_blade_to_node(bid));
+ if (WARN_ON_ONCE(!new_hub)) {
+ /* do not kfree() bid 0, which is statically allocated */
+ while (--bid > 0)
+ kfree(uv_hub_info_list_blade[bid]);
+ kfree(uv_hub_info_list_blade);
+ return;
}
- /* Allocate new per hub info list */
- new_hub = (nodeid == 0) ? &uv_hub_info_node0 : kzalloc_node(bytes, GFP_KERNEL, nodeid);
- BUG_ON(!new_hub);
- __uv_hub_info_list[nodeid] = new_hub;
- new_hub = uv_hub_info_list(nodeid);
- BUG_ON(!new_hub);
+ uv_hub_info_list_blade[bid] = new_hub;
*new_hub = hub_info;
/* Use information from GAM table if available: */
- if (_node_to_pnode)
- new_hub->pnode = _node_to_pnode[nodeid];
+ if (uv_gre_table)
+ new_hub->pnode = uv_blade_to_pnode(bid);
else /* Or fill in during CPU loop: */
new_hub->pnode = 0xffff;
- new_hub->numa_blade_id = uv_node_to_blade_id(nodeid);
- new_hub->memory_nid = -1;
+ new_hub->numa_blade_id = bid;
+ new_hub->memory_nid = NUMA_NO_NODE;
new_hub->nr_possible_cpus = 0;
new_hub->nr_online_cpus = 0;
}
+ /*
+ * Now populate __uv_hub_info_list[] for each node with the
+ * pointer to the struct for the blade it resides on.
+ */
+
+ bytes = sizeof(void *) * num_possible_nodes();
+ __uv_hub_info_list = kzalloc(bytes, GFP_KERNEL);
+ if (WARN_ON_ONCE(!__uv_hub_info_list)) {
+ for_each_possible_blade(bid)
+ /* bid 0 is statically allocated */
+ if (bid != 0)
+ kfree(uv_hub_info_list_blade[bid]);
+ kfree(uv_hub_info_list_blade);
+ return;
+ }
+
+ for_each_node(nodeid)
+ __uv_hub_info_list[nodeid] = uv_hub_info_list_blade[uv_node_to_blade_id(nodeid)];
+
/* Initialize per CPU info: */
for_each_possible_cpu(cpu) {
int apicid = per_cpu(x86_cpu_to_apicid, cpu);
- int numa_node_id;
+ unsigned short bid;
unsigned short pnode;
- nodeid = cpu_to_node(cpu);
- numa_node_id = numa_cpu_node(cpu);
pnode = uv_apicid_to_pnode(apicid);
+ bid = uv_pnode_to_socket(pnode) - _min_socket;
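+ /* bid: zero-based blade index, socket number relative to _min_socket (cf. blade count above) */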
- uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list(nodeid);
+ uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list_blade[bid];
uv_cpu_info_per(cpu)->blade_cpu_id = uv_cpu_hub_info(cpu)->nr_possible_cpus++;
- if (uv_cpu_hub_info(cpu)->memory_nid == -1)
+ if (uv_cpu_hub_info(cpu)->memory_nid == NUMA_NO_NODE)
uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu);
- /* Init memoryless node: */
- if (nodeid != numa_node_id &&
- uv_hub_info_list(numa_node_id)->pnode == 0xffff)
- uv_hub_info_list(numa_node_id)->pnode = pnode;
- else if (uv_cpu_hub_info(cpu)->pnode == 0xffff)
+ if (uv_cpu_hub_info(cpu)->pnode == 0xffff)
uv_cpu_hub_info(cpu)->pnode = pnode;
-
- uv_cpu_scir_info(cpu)->offset = uv_scir_offset(apicid);
}
- for_each_node(nodeid) {
- unsigned short pnode = uv_hub_info_list(nodeid)->pnode;
+ for_each_possible_blade(bid) {
+ unsigned short pnode = uv_hub_info_list_blade[bid]->pnode;
- /* Add pnode info for pre-GAM list nodes without CPUs: */
- if (pnode == 0xffff) {
- unsigned long paddr;
+ if (pnode == 0xffff)
+ continue;
- paddr = node_start_pfn(nodeid) << PAGE_SHIFT;
- pnode = uv_gpa_to_pnode(uv_soc_phys_ram_to_gpa(paddr));
- uv_hub_info_list(nodeid)->pnode = pnode;
- }
min_pnode = min(pnode, min_pnode);
max_pnode = max(pnode, max_pnode);
- pr_info("UV: UVHUB node:%2d pn:%02x nrcpus:%d\n",
- nodeid,
- uv_hub_info_list(nodeid)->pnode,
- uv_hub_info_list(nodeid)->nr_possible_cpus);
+ pr_info("UV: HUB:%2d pn:%02x nrcpus:%d\n",
+ bid,
+ uv_hub_info_list_blade[bid]->pnode,
+ uv_hub_info_list_blade[bid]->nr_possible_cpus);
}
pr_info("UV: min_pnode:%02x max_pnode:%02x\n", min_pnode, max_pnode);
@@ -1575,35 +1789,32 @@ static void __init uv_system_init_hub(void)
map_mmr_high(max_pnode);
map_mmioh_high(min_pnode, max_pnode);
+ kfree(uv_hub_info_list_blade);
+ uv_hub_info_list_blade = NULL;
+
uv_nmi_setup();
uv_cpu_init();
- uv_scir_register_cpu_notifier();
- proc_mkdir("sgi_uv", NULL);
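+ /* hubless=0 on this hubbed path; the hubless init above passes 1 */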
+ uv_setup_proc_files(0);
/* Register Legacy VGA I/O redirection handler: */
pci_register_set_vga_state(uv_set_vga_state);
- /*
- * For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as
- * EFI is not enabled in the kdump kernel:
- */
- if (is_kdump_kernel())
- reboot_type = BOOT_ACPI;
+ check_efi_reboot();
}
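With the per-blade structures and the per-node pointer table in place, a CPU's hub info resolves through the same chain the CPU loop above uses. An illustrative sketch only, with helper names as used in this file:

	static struct uv_hub_info_s *example_cpu_hub_info(int cpu)
	{
		int apicid = per_cpu(x86_cpu_to_apicid, cpu);
		unsigned short pnode = uv_apicid_to_pnode(apicid);
		unsigned short bid = uv_pnode_to_socket(pnode) - _min_socket;

		/* Same pointer that init cached in uv_cpu_info_per(cpu)->p_uv_hub_info */
		return __uv_hub_info_list[uv_blade_to_node(bid)];
	}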
/*
- * There is a small amount of UV specific code needed to initialize a
- * UV system that does not have a "UV HUB" (referred to as "hubless").
+ * There is a different code path needed to initialize a UV system that does
+ * not have a "UV HUB" (referred to as "hubless").
*/
void __init uv_system_init(void)
{
- if (likely(!is_uv_system() && !is_uv_hubless()))
+ if (likely(!is_uv_system() && !is_uv_hubless(1)))
return;
if (is_uv_system())
uv_system_init_hub();
else
- uv_nmi_setup_hubless();
+ uv_system_init_hubless();
}
apic_driver(apic_x2apic_uv_x);
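apic_driver() is the registration macro from <asm/apic.h>: it places a pointer to the driver's struct apic in the .apicdrivers linker section, which the APIC probe code walks in link order. Its shape is roughly the following, subject to kernel-version differences:

	#define apic_driver(sym)					\
		static const struct apic *__apicdrivers_##sym __used	\
		__aligned(sizeof(struct apic *))			\
		__section(".apicdrivers") = { &sym }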