summaryrefslogtreecommitdiff
path: root/arch/x86/kernel/head64.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel/head64.c')
-rw-r--r--arch/x86/kernel/head64.c251
1 files changed, 128 insertions, 123 deletions
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 46c3c73e7f43..fd28b53dbac5 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -1,10 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* prepare to run common code
*
* Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
*/
-#define DISABLE_BRANCH_PROFILING
+/* cpu_feature_enabled() cannot be used this early */
+#define USE_EARLY_PGTABLE_L5
+
#include <linux/init.h>
#include <linux/linkage.h>
#include <linux/types.h>
@@ -14,13 +17,16 @@
#include <linux/start_kernel.h>
#include <linux/io.h>
#include <linux/memblock.h>
+#include <linux/cc_platform.h>
+#include <linux/pgtable.h>
+#include <asm/asm.h>
+#include <asm/page_64.h>
#include <asm/processor.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/setup.h>
#include <asm/desc.h>
-#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
@@ -29,138 +35,58 @@
#include <asm/bootparam_utils.h>
#include <asm/microcode.h>
#include <asm/kasan.h>
+#include <asm/fixmap.h>
+#include <asm/realmode.h>
+#include <asm/extable.h>
+#include <asm/trapnr.h>
+#include <asm/sev.h>
+#include <asm/tdx.h>
+#include <asm/init.h>
/*
* Manage page tables very early on.
*/
-extern pgd_t early_top_pgt[PTRS_PER_PGD];
extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
-static unsigned int __initdata next_early_pgt;
+unsigned int __initdata next_early_pgt;
+SYM_PIC_ALIAS(next_early_pgt);
pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
-#define __head __section(.head.text)
-
-static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
-{
- return ptr - (void *)_text + (void *)physaddr;
-}
-
-void __head __startup_64(unsigned long physaddr)
-{
- unsigned long load_delta, *p;
- pgdval_t *pgd;
- p4dval_t *p4d;
- pudval_t *pud;
- pmdval_t *pmd, pmd_entry;
- int i;
-
- /* Is the address too large? */
- if (physaddr >> MAX_PHYSMEM_BITS)
- for (;;);
-
- /*
- * Compute the delta between the address I am compiled to run at
- * and the address I am actually running at.
- */
- load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);
-
- /* Is the address not 2M aligned? */
- if (load_delta & ~PMD_PAGE_MASK)
- for (;;);
-
- /* Fixup the physical addresses in the page table */
-
- pgd = fixup_pointer(&early_top_pgt, physaddr);
- pgd[pgd_index(__START_KERNEL_map)] += load_delta;
-
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
- p4d = fixup_pointer(&level4_kernel_pgt, physaddr);
- p4d[511] += load_delta;
- }
-
- pud = fixup_pointer(&level3_kernel_pgt, physaddr);
- pud[510] += load_delta;
- pud[511] += load_delta;
-
- pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
- pmd[506] += load_delta;
-
- /*
- * Set up the identity mapping for the switchover. These
- * entries should *NOT* have the global bit set! This also
- * creates a bunch of nonsense entries but that is fine --
- * it avoids problems around wraparound.
- */
-
- pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
- pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
-
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
- p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
-
- i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
- pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE;
- pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE;
-
- i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D;
- p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
- p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
- } else {
- i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
- pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
- pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
- }
-
- i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD;
- pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE;
- pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE;
-
- pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
- pmd_entry += physaddr;
-
- for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
- int idx = i + (physaddr >> PMD_SHIFT) % PTRS_PER_PMD;
- pmd[idx] = pmd_entry + i * PMD_SIZE;
- }
-
- /*
- * Fixup the kernel text+data virtual addresses. Note that
- * we might write invalid pmds, when the kernel is relocated
- * cleanup_highmap() fixes this up along with the mappings
- * beyond _end.
- */
-
- pmd = fixup_pointer(level2_kernel_pgt, physaddr);
- for (i = 0; i < PTRS_PER_PMD; i++) {
- if (pmd[i] & _PAGE_PRESENT)
- pmd[i] += load_delta;
- }
-
- /* Fixup phys_base */
- p = fixup_pointer(&phys_base, physaddr);
- *p += load_delta;
-}
+unsigned int __pgtable_l5_enabled __ro_after_init;
+SYM_PIC_ALIAS(__pgtable_l5_enabled);
+unsigned int pgdir_shift __ro_after_init = 39;
+EXPORT_SYMBOL(pgdir_shift);
+SYM_PIC_ALIAS(pgdir_shift);
+unsigned int ptrs_per_p4d __ro_after_init = 1;
+EXPORT_SYMBOL(ptrs_per_p4d);
+SYM_PIC_ALIAS(ptrs_per_p4d);
+
+unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4;
+EXPORT_SYMBOL(page_offset_base);
+unsigned long vmalloc_base __ro_after_init = __VMALLOC_BASE_L4;
+EXPORT_SYMBOL(vmalloc_base);
+unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4;
+EXPORT_SYMBOL(vmemmap_base);
/* Wipe all early page tables except for the kernel symbol map */
static void __init reset_early_page_tables(void)
{
memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
next_early_pgt = 0;
- write_cr3(__pa_nodebug(early_top_pgt));
+ write_cr3(__sme_pa_nodebug(early_top_pgt));
}
/* Create a new PMD entry */
-int __init early_make_pgtable(unsigned long address)
+bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd)
{
unsigned long physaddr = address - __PAGE_OFFSET;
pgdval_t pgd, *pgd_p;
p4dval_t p4d, *p4d_p;
pudval_t pud, *pud_p;
- pmdval_t pmd, *pmd_p;
+ pmdval_t *pmd_p;
/* Invalid address or early pgt is done ? */
if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt))
- return -1;
+ return false;
again:
pgd_p = &early_top_pgt[pgd_index(address)].pgd;
@@ -171,7 +97,7 @@ again:
* critical -- __PAGE_OFFSET would point us back into the dynamic
* range and we might end up looping forever...
*/
- if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+ if (!pgtable_l5_enabled())
p4d_p = pgd_p;
else if (pgd)
p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
@@ -215,18 +141,45 @@ again:
memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
*pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
}
- pmd = (physaddr & PMD_MASK) + early_pmd_flags;
pmd_p[pmd_index(address)] = pmd;
- return 0;
+ return true;
+}
+
+static bool __init early_make_pgtable(unsigned long address)
+{
+ unsigned long physaddr = address - __PAGE_OFFSET;
+ pmdval_t pmd;
+
+ pmd = (physaddr & PMD_MASK) + early_pmd_flags;
+
+ return __early_make_pgtable(address, pmd);
+}
+
+void __init do_early_exception(struct pt_regs *regs, int trapnr)
+{
+ if (trapnr == X86_TRAP_PF &&
+ early_make_pgtable(native_read_cr2()))
+ return;
+
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT) &&
+ trapnr == X86_TRAP_VC && handle_vc_boot_ghcb(regs))
+ return;
+
+ if (trapnr == X86_TRAP_VE && tdx_early_handle_ve(regs))
+ return;
+
+ early_fixup_exception(regs, trapnr);
}
/* Don't add a printk in there. printk relies on the PDA which is not initialized
yet. */
-static void __init clear_bss(void)
+void __init clear_bss(void)
{
memset(__bss_start, 0,
(unsigned long) __bss_stop - (unsigned long) __bss_start);
+ memset(__brk_base, 0,
+ (unsigned long) __brk_limit - (unsigned long) __brk_base);
}
static unsigned long get_cmd_line_ptr(void)
@@ -243,19 +196,31 @@ static void __init copy_bootdata(char *real_mode_data)
char * command_line;
unsigned long cmd_line_ptr;
- memcpy(&boot_params, real_mode_data, sizeof boot_params);
+ /*
+ * If SME is active, this will create decrypted mappings of the
+ * boot data in advance of the copy operations.
+ */
+ sme_map_bootdata(real_mode_data);
+
+ memcpy(&boot_params, real_mode_data, sizeof(boot_params));
sanitize_boot_params(&boot_params);
cmd_line_ptr = get_cmd_line_ptr();
if (cmd_line_ptr) {
command_line = __va(cmd_line_ptr);
memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
}
+
+ /*
+ * The old boot data is no longer needed and won't be reserved,
+ * freeing up that memory for use by the system. If SME is active,
+ * we need to remove the mappings that were created so that the
+ * memory doesn't remain mapped as decrypted.
+ */
+ sme_unmap_bootdata(real_mode_data);
}
-asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
+asmlinkage __visible void __init __noreturn x86_64_start_kernel(char * real_mode_data)
{
- int i;
-
/*
* Build-time sanity checks on the kernel image and module
* area mappings. (these are purely build-time and produce no code)
@@ -266,7 +231,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0);
BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0);
BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
- BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
+ MAYBE_BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
(__START_KERNEL & PGDIR_MASK)));
BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
@@ -275,15 +240,43 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
/* Kill off the identity-map trampoline */
reset_early_page_tables();
+ if (pgtable_l5_enabled()) {
+ page_offset_base = __PAGE_OFFSET_BASE_L5;
+ vmalloc_base = __VMALLOC_BASE_L5;
+ vmemmap_base = __VMEMMAP_BASE_L5;
+ }
+
clear_bss();
+ /*
+ * This needs to happen *before* kasan_early_init() because latter maps stuff
+ * into that page.
+ */
clear_page(init_top_pgt);
+ /*
+ * SME support may update early_pmd_flags to include the memory
+ * encryption mask, so it needs to be called before anything
+ * that may generate a page fault.
+ */
+ sme_early_init();
+
kasan_early_init();
- for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
- set_intr_gate(i, early_idt_handler_array[i]);
- load_idt((const struct desc_ptr *)&idt_descr);
+ /*
+ * Flush global TLB entries which could be left over from the trampoline page
+ * table.
+ *
+ * This needs to happen *after* kasan_early_init() as KASAN-enabled .configs
+ * instrument native_write_cr4() so KASAN must be initialized for that
+ * instrumentation to work.
+ */
+ __native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4));
+
+ idt_setup_early_handler();
+
+ /* Needed before cc_platform_has() can be used for TDX */
+ tdx_early_init();
copy_bootdata(__va(real_mode_data));
@@ -298,7 +291,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
x86_64_start_reservations(real_mode_data);
}
-void __init x86_64_start_reservations(char *real_mode_data)
+void __init __noreturn x86_64_start_reservations(char *real_mode_data)
{
/* version is always not zero if it is copied */
if (!boot_params.hdr.version)
@@ -316,3 +309,15 @@ void __init x86_64_start_reservations(char *real_mode_data)
start_kernel();
}
+
+void early_setup_idt(void)
+{
+ void *handler = NULL;
+
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
+ setup_ghcb();
+ handler = vc_boot_ghcb;
+ }
+
+ __pi_startup_64_load_idt(handler);
+}