Diffstat (limited to 'arch/x86/kernel/acpi')
-rw-r--r--  arch/x86/kernel/acpi/Makefile         |   1
-rw-r--r--  arch/x86/kernel/acpi/boot.c           | 217
-rw-r--r--  arch/x86/kernel/acpi/cppc.c           | 204
-rw-r--r--  arch/x86/kernel/acpi/cstate.c         |   9
-rw-r--r--  arch/x86/kernel/acpi/madt_playdead.S  |  28
-rw-r--r--  arch/x86/kernel/acpi/madt_wakeup.c    | 292
-rw-r--r--  arch/x86/kernel/acpi/wakeup_64.S      |  25
7 files changed, 604 insertions(+), 172 deletions(-)
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index fc17b3f136fe..842a5f449404 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
obj-$(CONFIG_ACPI_APEI) += apei.o
obj-$(CONFIG_ACPI_CPPC_LIB) += cppc.o
+obj-$(CONFIG_ACPI_MADT_WAKEUP) += madt_wakeup.o madt_playdead.o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
obj-y += cstate.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 85a3ce2a3666..dae6a73be40e 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -67,13 +67,6 @@ static bool has_lapic_cpus __initdata;
static bool acpi_support_online_capable;
#endif
-#ifdef CONFIG_X86_64
-/* Physical address of the Multiprocessor Wakeup Structure mailbox */
-static u64 acpi_mp_wake_mailbox_paddr;
-/* Virtual address of the Multiprocessor Wakeup Structure mailbox */
-static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox;
-#endif
-
#ifdef CONFIG_X86_IO_APIC
/*
* Locks related to IOAPIC hotplug
@@ -164,35 +157,6 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
return 0;
}
-/**
- * acpi_register_lapic - register a local apic and generates a logic cpu number
- * @id: local apic id to register
- * @acpiid: ACPI id to register
- * @enabled: this cpu is enabled or not
- *
- * Returns the logic cpu number which maps to the local apic
- */
-static int acpi_register_lapic(int id, u32 acpiid, u8 enabled)
-{
- int cpu;
-
- if (id >= MAX_LOCAL_APIC) {
- pr_info("skipped apicid that is too big\n");
- return -EINVAL;
- }
-
- if (!enabled) {
- ++disabled_cpus;
- return -EINVAL;
- }
-
- cpu = generic_processor_info(id);
- if (cpu >= 0)
- early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid;
-
- return cpu;
-}
-
static bool __init acpi_is_processor_usable(u32 lapic_flags)
{
if (lapic_flags & ACPI_MADT_ENABLED)
@@ -254,7 +218,7 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end)
return 0;
}
- acpi_register_lapic(apic_id, processor->uid, enabled);
+ topology_register_apic(apic_id, processor->uid, enabled);
#else
pr_warn("x2apic entry ignored\n");
#endif
@@ -263,6 +227,28 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end)
}
static int __init
+acpi_check_lapic(union acpi_subtable_headers *header, const unsigned long end)
+{
+ struct acpi_madt_local_apic *processor = NULL;
+
+ processor = (struct acpi_madt_local_apic *)header;
+
+ if (BAD_MADT_ENTRY(processor, end))
+ return -EINVAL;
+
+ /* Ignore invalid ID */
+ if (processor->id == 0xff)
+ return 0;
+
+ /* Ignore processors that can not be onlined */
+ if (!acpi_is_processor_usable(processor->lapic_flags))
+ return 0;
+
+ has_lapic_cpus = true;
+ return 0;
+}
+
+static int __init
acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end)
{
struct acpi_madt_local_apic *processor = NULL;
@@ -289,11 +275,10 @@ acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end)
* to not preallocating memory for all NR_CPUS
* when we use CPU hotplug.
*/
- acpi_register_lapic(processor->id, /* APIC ID */
- processor->processor_id, /* ACPI ID */
- processor->lapic_flags & ACPI_MADT_ENABLED);
+ topology_register_apic(processor->id, /* APIC ID */
+ processor->processor_id, /* ACPI ID */
+ processor->lapic_flags & ACPI_MADT_ENABLED);
- has_lapic_cpus = true;
return 0;
}
@@ -309,9 +294,9 @@ acpi_parse_sapic(union acpi_subtable_headers *header, const unsigned long end)
acpi_table_print_madt_entry(&header->common);
- acpi_register_lapic((processor->id << 8) | processor->eid,/* APIC ID */
- processor->processor_id, /* ACPI ID */
- processor->lapic_flags & ACPI_MADT_ENABLED);
+ topology_register_apic((processor->id << 8) | processor->eid,/* APIC ID */
+ processor->processor_id, /* ACPI ID */
+ processor->lapic_flags & ACPI_MADT_ENABLED);
return 0;
}
@@ -370,60 +355,6 @@ acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long e
return 0;
}
-
-#ifdef CONFIG_X86_64
-static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
-{
- /*
- * Remap mailbox memory only for the first call to acpi_wakeup_cpu().
- *
- * Wakeup of secondary CPUs is fully serialized in the core code.
- * No need to protect acpi_mp_wake_mailbox from concurrent accesses.
- */
- if (!acpi_mp_wake_mailbox) {
- acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
- sizeof(*acpi_mp_wake_mailbox),
- MEMREMAP_WB);
- }
-
- /*
- * Mailbox memory is shared between the firmware and OS. Firmware will
- * listen on mailbox command address, and once it receives the wakeup
- * command, the CPU associated with the given apicid will be booted.
- *
- * The value of 'apic_id' and 'wakeup_vector' must be visible to the
- * firmware before the wakeup command is visible. smp_store_release()
- * ensures ordering and visibility.
- */
- acpi_mp_wake_mailbox->apic_id = apicid;
- acpi_mp_wake_mailbox->wakeup_vector = start_ip;
- smp_store_release(&acpi_mp_wake_mailbox->command,
- ACPI_MP_WAKE_COMMAND_WAKEUP);
-
- /*
- * Wait for the CPU to wake up.
- *
- * The CPU being woken up is essentially in a spin loop waiting to be
- * woken up. It should not take long for it wake up and acknowledge by
- * zeroing out ->command.
- *
- * ACPI specification doesn't provide any guidance on how long kernel
- * has to wait for a wake up acknowledgement. It also doesn't provide
- * a way to cancel a wake up request if it takes too long.
- *
- * In TDX environment, the VMM has control over how long it takes to
- * wake up secondary. It can postpone scheduling secondary vCPU
- * indefinitely. Giving up on wake up request and reporting error opens
- * possible attack vector for VMM: it can wake up a secondary CPU when
- * kernel doesn't expect it. Wait until positive result of the wake up
- * request.
- */
- while (READ_ONCE(acpi_mp_wake_mailbox->command))
- cpu_relax();
-
- return 0;
-}
-#endif /* CONFIG_X86_64 */
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
@@ -844,12 +775,10 @@ static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
return 0;
}
-int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id,
- int *pcpu)
+int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id, int *pcpu)
{
- int cpu;
+ int cpu = topology_hotplug_apic(physid, acpi_id);
- cpu = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED);
if (cpu < 0) {
pr_info("Unable to map lapic to logical cpu number\n");
return cpu;
@@ -868,15 +797,11 @@ int acpi_unmap_cpu(int cpu)
#ifdef CONFIG_ACPI_NUMA
set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
#endif
-
- per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
- set_cpu_present(cpu, false);
- num_processors--;
-
- return (0);
+ topology_hotunplug_apic(cpu);
+ return 0;
}
EXPORT_SYMBOL(acpi_unmap_cpu);
-#endif /* CONFIG_ACPI_HOTPLUG_CPU */
+#endif /* CONFIG_ACPI_HOTPLUG_CPU */
int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
{
@@ -1007,11 +932,8 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
* the resource tree during the lateinit timeframe.
*/
#define HPET_RESOURCE_NAME_SIZE 9
- hpet_res = memblock_alloc(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE,
+ hpet_res = memblock_alloc_or_panic(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE,
SMP_CACHE_BYTES);
- if (!hpet_res)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE);
hpet_res->name = (void *)&hpet_res[1];
hpet_res->flags = IORESOURCE_MEM;
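
The hunk above replaces the open-coded allocate-and-panic pattern for the HPET resource with memblock_alloc_or_panic(). As a rough sketch of the helper's intended semantics (illustrative only, not the kernel's actual definition):

static inline void *memblock_alloc_or_panic_sketch(phys_addr_t size, phys_addr_t align)
{
	void *ptr = memblock_alloc(size, align);

	if (!ptr)
		panic("Failed to allocate %pa bytes\n", &size);
	return ptr;
}

Either way the call site no longer needs its own NULL check.
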
@@ -1125,6 +1047,8 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
static int __init acpi_parse_madt_lapic_entries(void)
{
int count, x2count = 0;
+ struct acpi_subtable_proc madt_proc[2];
+ int ret;
if (!boot_cpu_has(X86_FEATURE_APIC))
return -ENODEV;
@@ -1133,10 +1057,27 @@ static int __init acpi_parse_madt_lapic_entries(void)
acpi_parse_sapic, MAX_LOCAL_APIC);
if (!count) {
- count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
- acpi_parse_lapic, MAX_LOCAL_APIC);
- x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
- acpi_parse_x2apic, MAX_LOCAL_APIC);
+ /* Check if there are valid LAPIC entries */
+ acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, acpi_check_lapic, MAX_LOCAL_APIC);
+
+ /*
+ * Enumerate the APIC IDs in the order that they appear in the
+	 * MADT, regardless of whether LAPIC or x2APIC entries are used.
+ */
+ memset(madt_proc, 0, sizeof(madt_proc));
+ madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC;
+ madt_proc[0].handler = acpi_parse_lapic;
+ madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC;
+ madt_proc[1].handler = acpi_parse_x2apic;
+ ret = acpi_table_parse_entries_array(ACPI_SIG_MADT,
+ sizeof(struct acpi_table_madt),
+ madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC);
+ if (ret < 0) {
+ pr_err("Error parsing LAPIC/X2APIC entries\n");
+ return ret;
+ }
+ count = madt_proc[0].count;
+ x2count = madt_proc[1].count;
}
if (!count && !x2count) {
pr_err("No LAPIC entries present\n");
@@ -1159,29 +1100,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
}
return 0;
}
-
-#ifdef CONFIG_X86_64
-static int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
- const unsigned long end)
-{
- struct acpi_madt_multiproc_wakeup *mp_wake;
-
- if (!IS_ENABLED(CONFIG_SMP))
- return -ENODEV;
-
- mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
- if (BAD_MADT_ENTRY(mp_wake, end))
- return -EINVAL;
-
- acpi_table_print_madt_entry(&header->common);
-
- acpi_mp_wake_mailbox_paddr = mp_wake->base_address;
-
- apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu);
-
- return 0;
-}
-#endif /* CONFIG_X86_64 */
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
@@ -1290,7 +1208,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
}
count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE,
- acpi_parse_int_src_ovr, nr_irqs);
+ acpi_parse_int_src_ovr,
+ irq_get_nr_irqs());
if (count < 0) {
pr_err("Error parsing interrupt source overrides entry\n");
/* TBD: Cleanup to allow fallback to MPS */
@@ -1310,7 +1229,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
mp_config_acpi_legacy_irqs();
count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE,
- acpi_parse_nmi_src, nr_irqs);
+ acpi_parse_nmi_src,
+ irq_get_nr_irqs());
if (count < 0) {
pr_err("Error parsing NMI SRC entry\n");
/* TBD: Cleanup to allow fallback to MPS */
@@ -1378,7 +1298,7 @@ static void __init acpi_process_madt(void)
smp_found_config = 1;
}
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_ACPI_MADT_WAKEUP
/*
* Parse MADT MP Wake entry.
*/
@@ -1897,3 +1817,14 @@ u64 x86_default_get_root_pointer(void)
{
return boot_params.acpi_rsdp_addr;
}
+
+#ifdef CONFIG_XEN_PV
+void __iomem *x86_acpi_os_ioremap(acpi_physical_address phys, acpi_size size)
+{
+ return ioremap_cache(phys, size);
+}
+
+void __iomem * (*acpi_os_ioremap)(acpi_physical_address phys, acpi_size size) =
+ x86_acpi_os_ioremap;
+EXPORT_SYMBOL_GPL(acpi_os_ioremap);
+#endif
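
The CONFIG_XEN_PV block above makes acpi_os_ioremap an overridable function pointer whose x86 default simply forwards to ioremap_cache(). A minimal caller-side sketch, assuming <linux/acpi.h> and <linux/io.h>; the address is made up for illustration and is not taken from the patch:

static int example_map_firmware_region(void)
{
	void __iomem *regs = acpi_os_ioremap(0xfed40000, PAGE_SIZE);	/* example address */

	if (!regs)
		return -ENOMEM;

	pr_info("first dword: %#x\n", readl(regs));
	iounmap(regs);
	return 0;
}
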
diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
index 8d8752b44f11..d745dd586303 100644
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -9,6 +9,17 @@
#include <asm/processor.h>
#include <asm/topology.h>
+#define CPPC_HIGHEST_PERF_PERFORMANCE 196
+#define CPPC_HIGHEST_PERF_PREFCORE 166
+
+enum amd_pref_core {
+ AMD_PREF_CORE_UNKNOWN = 0,
+ AMD_PREF_CORE_SUPPORTED,
+ AMD_PREF_CORE_UNSUPPORTED,
+};
+static enum amd_pref_core amd_pref_core_detected;
+static u64 boost_numerator;
+
/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */
bool cpc_supported_by_cpu(void)
@@ -20,7 +31,7 @@ bool cpc_supported_by_cpu(void)
(boot_cpu_data.x86_model >= 0x20 && boot_cpu_data.x86_model <= 0x2f)))
return true;
else if (boot_cpu_data.x86 == 0x17 &&
- boot_cpu_data.x86_model >= 0x70 && boot_cpu_data.x86_model <= 0x7f)
+ boot_cpu_data.x86_model >= 0x30 && boot_cpu_data.x86_model <= 0x7f)
return true;
return boot_cpu_has(X86_FEATURE_CPPC);
}
@@ -69,38 +80,37 @@ int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val)
static void amd_set_max_freq_ratio(void)
{
struct cppc_perf_caps perf_caps;
- u64 highest_perf, nominal_perf;
+ u64 numerator, nominal_perf;
u64 perf_ratio;
int rc;
rc = cppc_get_perf_caps(0, &perf_caps);
if (rc) {
- pr_debug("Could not retrieve perf counters (%d)\n", rc);
+ pr_warn("Could not retrieve perf counters (%d)\n", rc);
return;
}
- highest_perf = amd_get_highest_perf();
+ rc = amd_get_boost_ratio_numerator(0, &numerator);
+ if (rc) {
+ pr_warn("Could not retrieve highest performance (%d)\n", rc);
+ return;
+ }
nominal_perf = perf_caps.nominal_perf;
- if (!highest_perf || !nominal_perf) {
- pr_debug("Could not retrieve highest or nominal performance\n");
+ if (!nominal_perf) {
+ pr_warn("Could not retrieve nominal performance\n");
return;
}
- perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf);
/* midpoint between max_boost and max_P */
- perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1;
- if (!perf_ratio) {
- pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n");
- return;
- }
+ perf_ratio = (div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf) + SCHED_CAPACITY_SCALE) >> 1;
freq_invariance_set_perf_ratio(perf_ratio, false);
}
static DEFINE_MUTEX(freq_invariance_lock);
-void init_freq_invariance_cppc(void)
+static inline void init_freq_invariance_cppc(void)
{
static bool init_done;
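
To make the reworked midpoint computation in amd_set_max_freq_ratio() above concrete, here is a worked example with purely illustrative numbers (SCHED_CAPACITY_SCALE is 1024):

static u64 example_midpoint_ratio(void)
{
	u64 numerator = 196, nominal_perf = 166;	/* illustrative values only */
	u64 ratio = div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf);	/* 1209 */

	/* Midpoint between the boost ratio and SCHED_CAPACITY_SCALE: 1116. */
	return (ratio + SCHED_CAPACITY_SCALE) >> 1;
}
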
@@ -116,3 +126,171 @@ void init_freq_invariance_cppc(void)
init_done = true;
mutex_unlock(&freq_invariance_lock);
}
+
+void acpi_processor_init_invariance_cppc(void)
+{
+ init_freq_invariance_cppc();
+}
+
+/*
+ * Get the highest performance register value.
+ * @cpu: CPU from which to get highest performance.
+ * @highest_perf: Return address for highest performance value.
+ *
+ * Return: 0 for success, negative error code otherwise.
+ */
+int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf)
+{
+ u64 val;
+ int ret;
+
+ if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
+ ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &val);
+ if (ret)
+ goto out;
+
+ val = AMD_CPPC_HIGHEST_PERF(val);
+ } else {
+ ret = cppc_get_highest_perf(cpu, &val);
+ if (ret)
+ goto out;
+ }
+
+ WRITE_ONCE(*highest_perf, (u32)val);
+out:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(amd_get_highest_perf);
+
+/**
+ * amd_detect_prefcore: Detect if CPUs in the system support preferred cores
+ * @detected: Output variable for the result of the detection.
+ *
+ * Determine whether CPUs in the system support preferred cores. On systems
+ * that support preferred cores, different highest perf values will be found
+ * on different cores. On other systems, the highest perf value will be the
+ * same on all cores.
+ *
+ * The result of the detection will be stored in the 'detected' parameter.
+ *
+ * Return: 0 for success, negative error code otherwise
+ */
+int amd_detect_prefcore(bool *detected)
+{
+ int cpu, count = 0;
+ u64 highest_perf[2] = {0};
+
+ if (WARN_ON(!detected))
+ return -EINVAL;
+
+ switch (amd_pref_core_detected) {
+ case AMD_PREF_CORE_SUPPORTED:
+ *detected = true;
+ return 0;
+ case AMD_PREF_CORE_UNSUPPORTED:
+ *detected = false;
+ return 0;
+ default:
+ break;
+ }
+
+ for_each_present_cpu(cpu) {
+ u32 tmp;
+ int ret;
+
+ ret = amd_get_highest_perf(cpu, &tmp);
+ if (ret)
+ return ret;
+
+ if (!count || (count == 1 && tmp != highest_perf[0]))
+ highest_perf[count++] = tmp;
+
+ if (count == 2)
+ break;
+ }
+
+ *detected = (count == 2);
+ boost_numerator = highest_perf[0];
+
+ amd_pref_core_detected = *detected ? AMD_PREF_CORE_SUPPORTED :
+ AMD_PREF_CORE_UNSUPPORTED;
+
+ pr_debug("AMD CPPC preferred core is %ssupported (highest perf: 0x%llx)\n",
+ *detected ? "" : "un", highest_perf[0]);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(amd_detect_prefcore);
+
+/**
+ * amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation
+ * @cpu: CPU to get numerator for.
+ * @numerator: Output variable for numerator.
+ *
+ * Determine the numerator to use for calculating the boost ratio on
+ * a CPU. On systems that support preferred cores, this will be a hardcoded
+ * value. On other systems this will be the highest performance register value.
+ *
+ * If booting the system with amd-pstate enabled but preferred cores disabled then
+ * the correct boost numerator will be returned to match hardware capabilities
+ * even if the preferred cores scheduling hints are not enabled.
+ *
+ * Return: 0 for success, negative error code otherwise.
+ */
+int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator)
+{
+ enum x86_topology_cpu_type core_type = get_topology_cpu_type(&cpu_data(cpu));
+ bool prefcore;
+ int ret;
+ u32 tmp;
+
+ ret = amd_detect_prefcore(&prefcore);
+ if (ret)
+ return ret;
+
+ /* without preferred cores, return the highest perf register value */
+ if (!prefcore) {
+ *numerator = boost_numerator;
+ return 0;
+ }
+
+ /*
+ * For AMD CPUs with Family ID 19H and Model ID range 0x70 to 0x7f,
+ * the highest performance level is set to 196.
+ * https://bugzilla.kernel.org/show_bug.cgi?id=218759
+ */
+ if (cpu_feature_enabled(X86_FEATURE_ZEN4)) {
+ switch (boot_cpu_data.x86_model) {
+ case 0x70 ... 0x7f:
+ *numerator = CPPC_HIGHEST_PERF_PERFORMANCE;
+ return 0;
+ default:
+ break;
+ }
+ }
+
+ /* detect if running on heterogeneous design */
+ if (cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES)) {
+ switch (core_type) {
+ case TOPO_CPU_TYPE_UNKNOWN:
+ pr_warn("Undefined core type found for cpu %d\n", cpu);
+ break;
+ case TOPO_CPU_TYPE_PERFORMANCE:
+ /* use the max scale for performance cores */
+ *numerator = CPPC_HIGHEST_PERF_PERFORMANCE;
+ return 0;
+ case TOPO_CPU_TYPE_EFFICIENCY:
+ /* use the highest perf value for efficiency cores */
+ ret = amd_get_highest_perf(cpu, &tmp);
+ if (ret)
+ return ret;
+ *numerator = tmp;
+ return 0;
+ }
+ }
+
+ *numerator = CPPC_HIGHEST_PERF_PREFCORE;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(amd_get_boost_ratio_numerator);
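
The two exported helpers added above are meant for cpufreq consumers such as amd-pstate. A hedged caller sketch that assumes nothing beyond the signatures in this hunk:

static int example_query_boost(unsigned int cpu)
{
	u64 numerator;
	bool prefcore;
	int ret;

	ret = amd_detect_prefcore(&prefcore);
	if (ret)
		return ret;

	ret = amd_get_boost_ratio_numerator(cpu, &numerator);
	if (ret)
		return ret;

	pr_info("cpu%u: preferred cores %ssupported, boost numerator %llu\n",
		cpu, prefcore ? "" : "un", numerator);
	return 0;
}
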
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 401808b47af3..5854f0b8f0f1 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -13,6 +13,7 @@
#include <linux/sched.h>
#include <acpi/processor.h>
+#include <asm/cpuid.h>
#include <asm/mwait.h>
#include <asm/special_insns.h>
@@ -128,11 +129,11 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
unsigned int cstate_type; /* C-state type and not ACPI C-state type */
unsigned int num_cstate_subtype;
- cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
+ cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &edx);
/* Check whether this particular cx_type (in CST) is supported or not */
- cstate_type = ((cx->address >> MWAIT_SUBSTATE_SIZE) &
- MWAIT_CSTATE_MASK) + 1;
+ cstate_type = (((cx->address >> MWAIT_SUBSTATE_SIZE) &
+ MWAIT_CSTATE_MASK) + 1) & MWAIT_CSTATE_MASK;
edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE);
num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;
@@ -172,7 +173,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
struct cpuinfo_x86 *c = &cpu_data(cpu);
long retval;
- if (!cpu_cstate_entry || c->cpuid_level < CPUID_MWAIT_LEAF)
+ if (!cpu_cstate_entry || c->cpuid_level < CPUID_LEAF_MWAIT)
return -1;
if (reg->bit_offset != NATIVE_CSTATE_BEYOND_HALT)
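
The extra '& MWAIT_CSTATE_MASK' above keeps the derived C-state type within 0..15. Assuming the usual mwait.h definitions (MWAIT_SUBSTATE_SIZE = 4, MWAIT_CSTATE_MASK = MWAIT_SUBSTATE_MASK = 0xf), a worked sketch of the computation:

static unsigned int example_substates(u32 mwait_hint, u32 edx)
{
	/* Illustrative: a hint of 0x20 yields C-state type (2 + 1) & 0xf = 3. */
	unsigned int cstate_type = (((mwait_hint >> MWAIT_SUBSTATE_SIZE) &
				     MWAIT_CSTATE_MASK) + 1) & MWAIT_CSTATE_MASK;

	/*
	 * Without the final mask, a hint in the 0xf0 range would give
	 * cstate_type 16 and an undefined 64-bit shift of EDX below;
	 * the mask wraps such values to 0 instead.
	 */
	return (edx >> (cstate_type * MWAIT_SUBSTATE_SIZE)) & MWAIT_SUBSTATE_MASK;
}
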
diff --git a/arch/x86/kernel/acpi/madt_playdead.S b/arch/x86/kernel/acpi/madt_playdead.S
new file mode 100644
index 000000000000..4e498d28cdc8
--- /dev/null
+++ b/arch/x86/kernel/acpi/madt_playdead.S
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/nospec-branch.h>
+#include <asm/page_types.h>
+#include <asm/processor-flags.h>
+
+ .text
+ .align PAGE_SIZE
+
+/*
+ * asm_acpi_mp_play_dead() - Hand over control of the CPU to the BIOS
+ *
+ * rdi: Address of the ACPI MADT MPWK ResetVector
+ * rsi: PGD of the identity mapping
+ */
+SYM_FUNC_START(asm_acpi_mp_play_dead)
+ /* Turn off global entries. Following CR3 write will flush them. */
+ movq %cr4, %rdx
+ andq $~(X86_CR4_PGE), %rdx
+ movq %rdx, %cr4
+
+ /* Switch to identity mapping */
+ movq %rsi, %cr3
+
+ /* Jump to reset vector */
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%rdi
+SYM_FUNC_END(asm_acpi_mp_play_dead)
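
madt_wakeup.c below calls this routine from C. A plausible declaration, inferred from the register comments above and the call sites; the exact header location and parameter types are an assumption:

/* Assumed prototype: rdi = reset vector physical address, rsi = identity-map PGD. */
void asm_acpi_mp_play_dead(u64 reset_vector, u64 pgd_pa);
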
diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c
new file mode 100644
index 000000000000..d5ef6215583b
--- /dev/null
+++ b/arch/x86/kernel/acpi/madt_wakeup.c
@@ -0,0 +1,292 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/acpi.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/kexec.h>
+#include <linux/memblock.h>
+#include <linux/pgtable.h>
+#include <linux/sched/hotplug.h>
+#include <asm/apic.h>
+#include <asm/barrier.h>
+#include <asm/init.h>
+#include <asm/intel_pt.h>
+#include <asm/nmi.h>
+#include <asm/processor.h>
+#include <asm/reboot.h>
+
+/* Physical address of the Multiprocessor Wakeup Structure mailbox */
+static u64 acpi_mp_wake_mailbox_paddr __ro_after_init;
+
+/* Virtual address of the Multiprocessor Wakeup Structure mailbox */
+static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox;
+
+static u64 acpi_mp_pgd __ro_after_init;
+static u64 acpi_mp_reset_vector_paddr __ro_after_init;
+
+static void acpi_mp_stop_this_cpu(void)
+{
+ asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
+}
+
+static void acpi_mp_play_dead(void)
+{
+ play_dead_common();
+ asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
+}
+
+static void acpi_mp_cpu_die(unsigned int cpu)
+{
+ u32 apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ unsigned long timeout;
+
+ /*
+ * Use TEST mailbox command to prove that BIOS got control over
+ * the CPU before declaring it dead.
+ *
+ * BIOS has to clear 'command' field of the mailbox.
+ */
+ acpi_mp_wake_mailbox->apic_id = apicid;
+ smp_store_release(&acpi_mp_wake_mailbox->command,
+ ACPI_MP_WAKE_COMMAND_TEST);
+
+ /* Don't wait longer than a second. */
+ timeout = USEC_PER_SEC;
+ while (READ_ONCE(acpi_mp_wake_mailbox->command) && --timeout)
+ udelay(1);
+
+ if (!timeout)
+ pr_err("Failed to hand over CPU %d to BIOS\n", cpu);
+}
+
+/* The argument is required to match type of x86_mapping_info::alloc_pgt_page */
+static void __init *alloc_pgt_page(void *dummy)
+{
+ return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+}
+
+static void __init free_pgt_page(void *pgt, void *dummy)
+{
+ return memblock_free(pgt, PAGE_SIZE);
+}
+
+/*
+ * Make sure asm_acpi_mp_play_dead() is present in the identity mapping at
+ * the same place as in the kernel page tables. asm_acpi_mp_play_dead() switches
+ * to the identity mapping and the function has to be present at the same spot in
+ * the virtual address space before and after switching page tables.
+ */
+static int __init init_transition_pgtable(pgd_t *pgd)
+{
+ pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
+ unsigned long vaddr, paddr;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ vaddr = (unsigned long)asm_acpi_mp_play_dead;
+ pgd += pgd_index(vaddr);
+ if (!pgd_present(*pgd)) {
+ p4d = (p4d_t *)alloc_pgt_page(NULL);
+ if (!p4d)
+ return -ENOMEM;
+ set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
+ }
+ p4d = p4d_offset(pgd, vaddr);
+ if (!p4d_present(*p4d)) {
+ pud = (pud_t *)alloc_pgt_page(NULL);
+ if (!pud)
+ return -ENOMEM;
+ set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
+ }
+ pud = pud_offset(p4d, vaddr);
+ if (!pud_present(*pud)) {
+ pmd = (pmd_t *)alloc_pgt_page(NULL);
+ if (!pmd)
+ return -ENOMEM;
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ }
+ pmd = pmd_offset(pud, vaddr);
+ if (!pmd_present(*pmd)) {
+ pte = (pte_t *)alloc_pgt_page(NULL);
+ if (!pte)
+ return -ENOMEM;
+ set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
+ }
+ pte = pte_offset_kernel(pmd, vaddr);
+
+ paddr = __pa(vaddr);
+ set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
+
+ return 0;
+}
+
+static int __init acpi_mp_setup_reset(u64 reset_vector)
+{
+ struct x86_mapping_info info = {
+ .alloc_pgt_page = alloc_pgt_page,
+ .free_pgt_page = free_pgt_page,
+ .page_flag = __PAGE_KERNEL_LARGE_EXEC,
+ .kernpg_flag = _KERNPG_TABLE_NOENC,
+ };
+ pgd_t *pgd;
+
+ pgd = alloc_pgt_page(NULL);
+ if (!pgd)
+ return -ENOMEM;
+
+ for (int i = 0; i < nr_pfn_mapped; i++) {
+ unsigned long mstart, mend;
+
+ mstart = pfn_mapped[i].start << PAGE_SHIFT;
+ mend = pfn_mapped[i].end << PAGE_SHIFT;
+ if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
+ kernel_ident_mapping_free(&info, pgd);
+ return -ENOMEM;
+ }
+ }
+
+ if (kernel_ident_mapping_init(&info, pgd,
+ PAGE_ALIGN_DOWN(reset_vector),
+ PAGE_ALIGN(reset_vector + 1))) {
+ kernel_ident_mapping_free(&info, pgd);
+ return -ENOMEM;
+ }
+
+ if (init_transition_pgtable(pgd)) {
+ kernel_ident_mapping_free(&info, pgd);
+ return -ENOMEM;
+ }
+
+ smp_ops.play_dead = acpi_mp_play_dead;
+ smp_ops.stop_this_cpu = acpi_mp_stop_this_cpu;
+ smp_ops.cpu_die = acpi_mp_cpu_die;
+
+ acpi_mp_reset_vector_paddr = reset_vector;
+ acpi_mp_pgd = __pa(pgd);
+
+ return 0;
+}
+
+static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
+{
+ if (!acpi_mp_wake_mailbox_paddr) {
+ pr_warn_once("No MADT mailbox: cannot bringup secondary CPUs. Booting with kexec?\n");
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * Remap mailbox memory only for the first call to acpi_wakeup_cpu().
+ *
+ * Wakeup of secondary CPUs is fully serialized in the core code.
+ * No need to protect acpi_mp_wake_mailbox from concurrent accesses.
+ */
+ if (!acpi_mp_wake_mailbox) {
+ acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
+ sizeof(*acpi_mp_wake_mailbox),
+ MEMREMAP_WB);
+ }
+
+ /*
+ * Mailbox memory is shared between the firmware and OS. Firmware will
+ * listen on mailbox command address, and once it receives the wakeup
+ * command, the CPU associated with the given apicid will be booted.
+ *
+ * The value of 'apic_id' and 'wakeup_vector' must be visible to the
+ * firmware before the wakeup command is visible. smp_store_release()
+ * ensures ordering and visibility.
+ */
+ acpi_mp_wake_mailbox->apic_id = apicid;
+ acpi_mp_wake_mailbox->wakeup_vector = start_ip;
+ smp_store_release(&acpi_mp_wake_mailbox->command,
+ ACPI_MP_WAKE_COMMAND_WAKEUP);
+
+ /*
+ * Wait for the CPU to wake up.
+ *
+ * The CPU being woken up is essentially in a spin loop waiting to be
+	 * woken up. It should not take long for it to wake up and acknowledge by
+ * zeroing out ->command.
+ *
+ * ACPI specification doesn't provide any guidance on how long kernel
+ * has to wait for a wake up acknowledgment. It also doesn't provide
+ * a way to cancel a wake up request if it takes too long.
+ *
+ * In TDX environment, the VMM has control over how long it takes to
+ * wake up secondary. It can postpone scheduling secondary vCPU
+ * indefinitely. Giving up on wake up request and reporting error opens
+ * possible attack vector for VMM: it can wake up a secondary CPU when
+ * kernel doesn't expect it. Wait until positive result of the wake up
+ * request.
+ */
+ while (READ_ONCE(acpi_mp_wake_mailbox->command))
+ cpu_relax();
+
+ return 0;
+}
+
+static void acpi_mp_disable_offlining(struct acpi_madt_multiproc_wakeup *mp_wake)
+{
+ cpu_hotplug_disable_offlining();
+
+ /*
+	 * ACPI MADT doesn't allow offlining a CPU after it was onlined. This
+ * limits kexec: the second kernel won't be able to use more than one CPU.
+ *
+	 * To prevent a kexec kernel from onlining secondary CPUs, invalidate the
+	 * mailbox address in the ACPI MADT wakeup structure, which prevents a
+	 * kexec kernel from using it.
+ *
+ * This is safe as the booting kernel has the mailbox address cached
+ * already and acpi_wakeup_cpu() uses the cached value to bring up the
+ * secondary CPUs.
+ *
+ * Note: This is a Linux specific convention and not covered by the
+ * ACPI specification.
+ */
+ mp_wake->mailbox_address = 0;
+}
+
+int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
+ const unsigned long end)
+{
+ struct acpi_madt_multiproc_wakeup *mp_wake;
+
+ mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
+
+ /*
+ * Cannot use the standard BAD_MADT_ENTRY() to sanity check the @mp_wake
+ * entry. 'sizeof (struct acpi_madt_multiproc_wakeup)' can be larger
+ * than the actual size of the MP wakeup entry in ACPI table because the
+ * 'reset_vector' is only available in the V1 MP wakeup structure.
+ */
+ if (!mp_wake)
+ return -EINVAL;
+ if (end - (unsigned long)mp_wake < ACPI_MADT_MP_WAKEUP_SIZE_V0)
+ return -EINVAL;
+ if (mp_wake->header.length < ACPI_MADT_MP_WAKEUP_SIZE_V0)
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(&header->common);
+
+ acpi_mp_wake_mailbox_paddr = mp_wake->mailbox_address;
+
+ if (mp_wake->version >= ACPI_MADT_MP_WAKEUP_VERSION_V1 &&
+ mp_wake->header.length >= ACPI_MADT_MP_WAKEUP_SIZE_V1) {
+ if (acpi_mp_setup_reset(mp_wake->reset_vector)) {
+ pr_warn("Failed to setup MADT reset vector\n");
+ acpi_mp_disable_offlining(mp_wake);
+ }
+ } else {
+ /*
+ * CPU offlining requires version 1 of the ACPI MADT wakeup
+ * structure.
+ */
+ acpi_mp_disable_offlining(mp_wake);
+ }
+
+ apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu);
+
+ return 0;
+}
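
For reference, the mailbox poked by acpi_wakeup_cpu() and acpi_mp_cpu_die() is the 4 KiB structure defined by the ACPI multiprocessor wakeup mechanism. The layout below is a sketch from the ACPI specification (field names and command values are approximations, not ACPICA's definitions), and the firmware-side loop is purely hypothetical; it only illustrates the handshake in which the OS fills apic_id and wakeup_vector, then writes command, and the firmware acknowledges by clearing command:

/* Sketch of the 4 KiB mailbox layout; names and sizes are approximate. */
struct mp_wakeup_mailbox_sketch {
	u16 command;
	u16 reserved;
	u32 apic_id;
	u64 wakeup_vector;
	u8  reserved_for_os[2032];
	u8  reserved_for_firmware[2048];
};

/* Purely hypothetical firmware-side handling; command values are assumed. */
static void hypothetical_firmware_poll(volatile struct mp_wakeup_mailbox_sketch *mb)
{
	if (mb->command == 1) {			/* WAKEUP (assumed value) */
		/* Start the AP identified by mb->apic_id at mb->wakeup_vector... */
		mb->command = 0;		/* ack: the OS spins until this is zero */
	} else if (mb->command == 2) {		/* TEST (assumed value) */
		mb->command = 0;		/* prove the firmware owns the CPU again */
	}
}
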
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index d5d8a352eafa..b200a193beeb 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -17,7 +17,7 @@
* Hooray, we are in Long 64-bit mode (but still running in low memory)
*/
SYM_FUNC_START(wakeup_long64)
- movq saved_magic, %rax
+ movq saved_magic(%rip), %rax
movq $0x123456789abcdef0, %rdx
cmpq %rdx, %rax
je 2f
@@ -33,14 +33,14 @@ SYM_FUNC_START(wakeup_long64)
movw %ax, %es
movw %ax, %fs
movw %ax, %gs
- movq saved_rsp, %rsp
+ movq saved_rsp(%rip), %rsp
- movq saved_rbx, %rbx
- movq saved_rdi, %rdi
- movq saved_rsi, %rsi
- movq saved_rbp, %rbp
+ movq saved_rbx(%rip), %rbx
+ movq saved_rdi(%rip), %rdi
+ movq saved_rsi(%rip), %rsi
+ movq saved_rbp(%rip), %rbp
- movq saved_rip, %rax
+ movq saved_rip(%rip), %rax
ANNOTATE_RETPOLINE_SAFE
jmp *%rax
SYM_FUNC_END(wakeup_long64)
@@ -72,11 +72,11 @@ SYM_FUNC_START(do_suspend_lowlevel)
movq $.Lresume_point, saved_rip(%rip)
- movq %rsp, saved_rsp
- movq %rbp, saved_rbp
- movq %rbx, saved_rbx
- movq %rdi, saved_rdi
- movq %rsi, saved_rsi
+ movq %rsp, saved_rsp(%rip)
+ movq %rbp, saved_rbp(%rip)
+ movq %rbx, saved_rbx(%rip)
+ movq %rdi, saved_rdi(%rip)
+ movq %rsi, saved_rsi(%rip)
addq $8, %rsp
movl $3, %edi
@@ -87,6 +87,7 @@ SYM_FUNC_START(do_suspend_lowlevel)
.align 4
.Lresume_point:
+ ANNOTATE_NOENDBR
/* We don't restore %rax, it must be 0 anyway */
movq $saved_context, %rax
movq saved_context_cr4(%rax), %rbx