summaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
authorKirill A. Shutemov <kirill.shutemov@linux.intel.com>2024-06-14 12:59:03 +0300
committerBorislav Petkov (AMD) <bp@alien8.de>2024-06-17 17:46:25 +0200
commit1ceebe2e46720de02af4cf626dc847ecc4a263fd (patch)
tree2c6ae79b7a52723c4a680951b3ec9e246c157a6e /arch/x86/kernel
parentd88e7b3e35cff2c318042990d70828f64c3ae296 (diff)
x86/acpi: Add support for CPU offlining for ACPI MADT wakeup method
MADT Multiprocessor Wakeup structure version 1 brings support for CPU offlining: BIOS provides a reset vector where the CPU has to jump to for offlining itself. The new TEST mailbox command can be used to test whether the CPU offlined itself which means the BIOS has control over the CPU and can online it again via the ACPI MADT wakeup method. Add CPU offlining support for the ACPI MADT wakeup method by implementing custom cpu_die(), play_dead() and stop_this_cpu() SMP operations. CPU offlining makes it possible to hand over secondary CPUs over kexec, not limiting the second kernel to a single CPU. The change conforms to the approved ACPI spec change proposal. See the Link. Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com> Reviewed-by: Thomas Gleixner <tglx@linutronix.de> Acked-by: Kai Huang <kai.huang@intel.com> Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> Tested-by: Tao Liu <ltao@redhat.com> Link: https://lore.kernel.org/all/13356251.uLZWGnKmhe@kreacher Link: https://lore.kernel.org/r/20240614095904.1345461-19-kirill.shutemov@linux.intel.com
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/acpi/Makefile2
-rw-r--r--arch/x86/kernel/acpi/madt_playdead.S28
-rw-r--r--arch/x86/kernel/acpi/madt_wakeup.c184
3 files changed, 211 insertions, 3 deletions
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 2feba7257665..842a5f449404 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -4,7 +4,7 @@ obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
obj-$(CONFIG_ACPI_APEI) += apei.o
obj-$(CONFIG_ACPI_CPPC_LIB) += cppc.o
-obj-$(CONFIG_ACPI_MADT_WAKEUP) += madt_wakeup.o
+obj-$(CONFIG_ACPI_MADT_WAKEUP) += madt_wakeup.o madt_playdead.o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
obj-y += cstate.o
diff --git a/arch/x86/kernel/acpi/madt_playdead.S b/arch/x86/kernel/acpi/madt_playdead.S
new file mode 100644
index 000000000000..4e498d28cdc8
--- /dev/null
+++ b/arch/x86/kernel/acpi/madt_playdead.S
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/nospec-branch.h>
+#include <asm/page_types.h>
+#include <asm/processor-flags.h>
+
+ .text
+ .align PAGE_SIZE
+
+/*
+ * asm_acpi_mp_play_dead() - Hand over control of the CPU to the BIOS
+ *
+ * rdi: Address of the ACPI MADT MPWK ResetVector
+ * rsi: PGD of the identity mapping
+ */
+SYM_FUNC_START(asm_acpi_mp_play_dead)
+ /* Turn off global entries. Following CR3 write will flush them. */
+ movq %cr4, %rdx
+ andq $~(X86_CR4_PGE), %rdx
+ movq %rdx, %cr4
+
+ /* Switch to identity mapping */
+ movq %rsi, %cr3
+
+ /* Jump to reset vector */
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%rdi
+SYM_FUNC_END(asm_acpi_mp_play_dead)
diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c
index 30820f9de5af..6cfe762be28b 100644
--- a/arch/x86/kernel/acpi/madt_wakeup.c
+++ b/arch/x86/kernel/acpi/madt_wakeup.c
@@ -1,10 +1,19 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <linux/acpi.h>
#include <linux/cpu.h>
+#include <linux/delay.h>
#include <linux/io.h>
+#include <linux/kexec.h>
+#include <linux/memblock.h>
+#include <linux/pgtable.h>
+#include <linux/sched/hotplug.h>
#include <asm/apic.h>
#include <asm/barrier.h>
+#include <asm/init.h>
+#include <asm/intel_pt.h>
+#include <asm/nmi.h>
#include <asm/processor.h>
+#include <asm/reboot.h>
/* Physical address of the Multiprocessor Wakeup Structure mailbox */
static u64 acpi_mp_wake_mailbox_paddr __ro_after_init;
@@ -12,6 +21,154 @@ static u64 acpi_mp_wake_mailbox_paddr __ro_after_init;
/* Virtual address of the Multiprocessor Wakeup Structure mailbox */
static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox __ro_after_init;
+static u64 acpi_mp_pgd __ro_after_init;
+static u64 acpi_mp_reset_vector_paddr __ro_after_init;
+
+static void acpi_mp_stop_this_cpu(void)
+{
+ asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
+}
+
+static void acpi_mp_play_dead(void)
+{
+ play_dead_common();
+ asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
+}
+
+static void acpi_mp_cpu_die(unsigned int cpu)
+{
+ u32 apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ unsigned long timeout;
+
+ /*
+ * Use TEST mailbox command to prove that BIOS got control over
+ * the CPU before declaring it dead.
+ *
+ * BIOS has to clear 'command' field of the mailbox.
+ */
+ acpi_mp_wake_mailbox->apic_id = apicid;
+ smp_store_release(&acpi_mp_wake_mailbox->command,
+ ACPI_MP_WAKE_COMMAND_TEST);
+
+ /* Don't wait longer than a second. */
+ timeout = USEC_PER_SEC;
+ while (READ_ONCE(acpi_mp_wake_mailbox->command) && --timeout)
+ udelay(1);
+
+ if (!timeout)
+ pr_err("Failed to hand over CPU %d to BIOS\n", cpu);
+}
+
+/* The argument is required to match type of x86_mapping_info::alloc_pgt_page */
+static void __init *alloc_pgt_page(void *dummy)
+{
+ return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+}
+
+static void __init free_pgt_page(void *pgt, void *dummy)
+{
+ return memblock_free(pgt, PAGE_SIZE);
+}
+
+/*
+ * Make sure asm_acpi_mp_play_dead() is present in the identity mapping at
+ * the same place as in the kernel page tables. asm_acpi_mp_play_dead() switches
+ * to the identity mapping and the function has be present at the same spot in
+ * the virtual address space before and after switching page tables.
+ */
+static int __init init_transition_pgtable(pgd_t *pgd)
+{
+ pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
+ unsigned long vaddr, paddr;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ vaddr = (unsigned long)asm_acpi_mp_play_dead;
+ pgd += pgd_index(vaddr);
+ if (!pgd_present(*pgd)) {
+ p4d = (p4d_t *)alloc_pgt_page(NULL);
+ if (!p4d)
+ return -ENOMEM;
+ set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
+ }
+ p4d = p4d_offset(pgd, vaddr);
+ if (!p4d_present(*p4d)) {
+ pud = (pud_t *)alloc_pgt_page(NULL);
+ if (!pud)
+ return -ENOMEM;
+ set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
+ }
+ pud = pud_offset(p4d, vaddr);
+ if (!pud_present(*pud)) {
+ pmd = (pmd_t *)alloc_pgt_page(NULL);
+ if (!pmd)
+ return -ENOMEM;
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ }
+ pmd = pmd_offset(pud, vaddr);
+ if (!pmd_present(*pmd)) {
+ pte = (pte_t *)alloc_pgt_page(NULL);
+ if (!pte)
+ return -ENOMEM;
+ set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
+ }
+ pte = pte_offset_kernel(pmd, vaddr);
+
+ paddr = __pa(vaddr);
+ set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
+
+ return 0;
+}
+
+static int __init acpi_mp_setup_reset(u64 reset_vector)
+{
+ struct x86_mapping_info info = {
+ .alloc_pgt_page = alloc_pgt_page,
+ .free_pgt_page = free_pgt_page,
+ .page_flag = __PAGE_KERNEL_LARGE_EXEC,
+ .kernpg_flag = _KERNPG_TABLE_NOENC,
+ };
+ pgd_t *pgd;
+
+ pgd = alloc_pgt_page(NULL);
+ if (!pgd)
+ return -ENOMEM;
+
+ for (int i = 0; i < nr_pfn_mapped; i++) {
+ unsigned long mstart, mend;
+
+ mstart = pfn_mapped[i].start << PAGE_SHIFT;
+ mend = pfn_mapped[i].end << PAGE_SHIFT;
+ if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
+ kernel_ident_mapping_free(&info, pgd);
+ return -ENOMEM;
+ }
+ }
+
+ if (kernel_ident_mapping_init(&info, pgd,
+ PAGE_ALIGN_DOWN(reset_vector),
+ PAGE_ALIGN(reset_vector + 1))) {
+ kernel_ident_mapping_free(&info, pgd);
+ return -ENOMEM;
+ }
+
+ if (init_transition_pgtable(pgd)) {
+ kernel_ident_mapping_free(&info, pgd);
+ return -ENOMEM;
+ }
+
+ smp_ops.play_dead = acpi_mp_play_dead;
+ smp_ops.stop_this_cpu = acpi_mp_stop_this_cpu;
+ smp_ops.cpu_die = acpi_mp_cpu_die;
+
+ acpi_mp_reset_vector_paddr = reset_vector;
+ acpi_mp_pgd = __pa(pgd);
+
+ return 0;
+}
+
static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
{
if (!acpi_mp_wake_mailbox_paddr) {
@@ -97,14 +254,37 @@ int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
struct acpi_madt_multiproc_wakeup *mp_wake;
mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
- if (BAD_MADT_ENTRY(mp_wake, end))
+
+ /*
+ * Cannot use the standard BAD_MADT_ENTRY() to sanity check the @mp_wake
+ * entry. 'sizeof (struct acpi_madt_multiproc_wakeup)' can be larger
+ * than the actual size of the MP wakeup entry in ACPI table because the
+ * 'reset_vector' is only available in the V1 MP wakeup structure.
+ */
+ if (!mp_wake)
+ return -EINVAL;
+ if (end - (unsigned long)mp_wake < ACPI_MADT_MP_WAKEUP_SIZE_V0)
+ return -EINVAL;
+ if (mp_wake->header.length < ACPI_MADT_MP_WAKEUP_SIZE_V0)
return -EINVAL;
acpi_table_print_madt_entry(&header->common);
acpi_mp_wake_mailbox_paddr = mp_wake->mailbox_address;
- acpi_mp_disable_offlining(mp_wake);
+ if (mp_wake->version >= ACPI_MADT_MP_WAKEUP_VERSION_V1 &&
+ mp_wake->header.length >= ACPI_MADT_MP_WAKEUP_SIZE_V1) {
+ if (acpi_mp_setup_reset(mp_wake->reset_vector)) {
+ pr_warn("Failed to setup MADT reset vector\n");
+ acpi_mp_disable_offlining(mp_wake);
+ }
+ } else {
+ /*
+ * CPU offlining requires version 1 of the ACPI MADT wakeup
+ * structure.
+ */
+ acpi_mp_disable_offlining(mp_wake);
+ }
apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu);