summaryrefslogtreecommitdiff
path: root/arch/x86/kernel/relocate_kernel_64.S
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel/relocate_kernel_64.S')
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S503
1 files changed, 396 insertions, 107 deletions
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 56cab1bb25f5..ea604f4d0b52 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -5,12 +5,15 @@
*/
#include <linux/linkage.h>
+#include <linux/stringify.h>
+#include <asm/alternative.h>
#include <asm/page_types.h>
#include <asm/kexec.h>
#include <asm/processor-flags.h>
#include <asm/pgtable_types.h>
#include <asm/nospec-branch.h>
#include <asm/unwind_hints.h>
+#include <asm/asm-offsets.h>
/*
* Must be relocatable PIC code callable as a C function, in particular
@@ -21,33 +24,47 @@
#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
/*
- * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
- * ~ control_page + PAGE_SIZE are used as data storage and stack for
- * jumping back
+ * The .text..relocate_kernel and .data..relocate_kernel sections are copied
+ * into the control page, and the remainder of the page is used as the stack.
*/
-#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
+ .section .data..relocate_kernel,"a";
/* Minimal CPU state */
-#define RSP DATA(0x0)
-#define CR0 DATA(0x8)
-#define CR3 DATA(0x10)
-#define CR4 DATA(0x18)
-
-/* other data */
-#define CP_PA_TABLE_PAGE DATA(0x20)
-#define CP_PA_SWAP_PAGE DATA(0x28)
-#define CP_PA_BACKUP_PAGES_MAP DATA(0x30)
-
- .text
- .align PAGE_SIZE
+SYM_DATA_LOCAL(saved_rsp, .quad 0)
+SYM_DATA_LOCAL(saved_cr0, .quad 0)
+SYM_DATA_LOCAL(saved_cr3, .quad 0)
+SYM_DATA_LOCAL(saved_cr4, .quad 0)
+ /* other data */
+SYM_DATA(kexec_va_control_page, .quad 0)
+SYM_DATA(kexec_pa_table_page, .quad 0)
+SYM_DATA(kexec_pa_swap_page, .quad 0)
+SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0)
+SYM_DATA(kexec_debug_8250_mmio32, .quad 0)
+SYM_DATA(kexec_debug_8250_port, .word 0)
+
+ .balign 16
+SYM_DATA_START_LOCAL(kexec_debug_gdt)
+ .word kexec_debug_gdt_end - kexec_debug_gdt - 1
+ .long 0
+ .word 0
+ .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
+ .quad 0x00af9a000000ffff /* __KERNEL_CS */
+ .quad 0x00cf92000000ffff /* __KERNEL_DS */
+SYM_DATA_END_LABEL(kexec_debug_gdt, SYM_L_LOCAL, kexec_debug_gdt_end)
+
+ .balign 8
+SYM_DATA_START(kexec_debug_idt)
+ .skip 0x100, 0x00
+SYM_DATA_END(kexec_debug_idt)
+
+ .section .text..relocate_kernel,"ax";
.code64
-SYM_CODE_START_NOALIGN(relocate_range)
SYM_CODE_START_NOALIGN(relocate_kernel)
UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR
/*
* %rdi indirection_page
- * %rsi page_list
+ * %rsi pa_control_page
* %rdx start address
* %rcx preserve_context
* %r8 host_mem_enc_active
@@ -62,63 +79,87 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
pushq %r15
pushf
- movq PTR(VA_CONTROL_PAGE)(%rsi), %r11
- movq %rsp, RSP(%r11)
- movq %cr0, %rax
- movq %rax, CR0(%r11)
- movq %cr3, %rax
- movq %rax, CR3(%r11)
- movq %cr4, %rax
- movq %rax, CR4(%r11)
-
- /* Save CR4. Required to enable the right paging mode later. */
- movq %rax, %r13
+ /* Invalidate GDT/IDT, zero out flags */
+ pushq $0
+ pushq $0
- /* zero out flags, and disable interrupts */
- pushq $0
+ lidt (%rsp)
+ lgdt (%rsp)
+ addq $8, %rsp
popfq
- /* Save SME active flag */
- movq %r8, %r12
+ /* Switch to the identity mapped page tables */
+ movq %cr3, %rax
+ movq kexec_pa_table_page(%rip), %r9
+ movq %r9, %cr3
- /*
- * get physical address of control page now
- * this is impossible after page table switch
- */
- movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
+ /* Leave CR4 in %r13 to enable the right paging mode later. */
+ movq %cr4, %r13
- /* get physical address of page table now too */
- movq PTR(PA_TABLE_PAGE)(%rsi), %r9
+ /* Disable global pages immediately to ensure this mapping is RWX */
+ movq %r13, %r12
+ andq $~(X86_CR4_PGE), %r12
+ movq %r12, %cr4
- /* get physical address of swap page now */
- movq PTR(PA_SWAP_PAGE)(%rsi), %r10
+ /* Save %rsp and CRs. */
+ movq %r13, saved_cr4(%rip)
+ movq %rsp, saved_rsp(%rip)
+ movq %rax, saved_cr3(%rip)
+ movq %cr0, %rax
+ movq %rax, saved_cr0(%rip)
- /* save some information for jumping back */
- movq %r9, CP_PA_TABLE_PAGE(%r11)
- movq %r10, CP_PA_SWAP_PAGE(%r11)
- movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
+ /* save indirection list for jumping back */
+ movq %rdi, pa_backup_pages_map(%rip)
- /* Switch to the identity mapped page tables */
- movq %r9, %cr3
+ /* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */
+ movq %rcx, %r11
/* setup a new stack at the end of the physical control page */
- lea PAGE_SIZE(%r8), %rsp
+ lea PAGE_SIZE(%rsi), %rsp
/* jump to identity mapped page */
- addq $(identity_mapped - relocate_kernel), %r8
- pushq %r8
- ANNOTATE_UNRET_SAFE
- ret
- int3
+0: addq $identity_mapped - 0b, %rsi
+ subq $__relocate_kernel_start - 0b, %rsi
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%rsi
SYM_CODE_END(relocate_kernel)
SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
UNWIND_HINT_END_OF_STACK
- /* set return address to 0 if not preserving context */
- pushq $0
+ /*
+ * %rdi indirection page
+ * %rdx start address
+ * %r8 host_mem_enc_active
+ * %r9 page table page
+ * %r11 preserve_context
+ * %r13 original CR4 when relocate_kernel() was invoked
+ */
+
/* store the start address on the stack */
pushq %rdx
+ /* Create a GDTR (16 bits limit, 64 bits addr) on stack */
+ leaq kexec_debug_gdt(%rip), %rax
+ pushq %rax
+ pushw (%rax)
+
+ /* Load the GDT, put the stack back */
+ lgdt (%rsp)
+ addq $10, %rsp
+
+ /* Test that we can load segments */
+ movq %ds, %rax
+ movq %rax, %ds
+
+ /* Now an IDTR on the stack to load the IDT the kernel created */
+ leaq kexec_debug_idt(%rip), %rsi
+ pushq %rsi
+ pushw $0xff
+ lidt (%rsp)
+ addq $10, %rsp
+
+ //int3
+
/*
* Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP
* below.
@@ -145,16 +186,15 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
* Set cr4 to a known state:
* - physical address extension enabled
* - 5-level paging, if it was enabled before
+ * - Machine check exception on TDX guest, if it was enabled before.
+ * Clearing MCE might not be allowed in TDX guests, depending on setup.
+ *
+ * Use R13 that contains the original CR4 value, read in relocate_kernel().
+ * PAE is always set in the original CR4.
*/
- movl $X86_CR4_PAE, %eax
- testq $X86_CR4_LA57, %r13
- jz 1f
- orl $X86_CR4_LA57, %eax
-1:
- movq %rax, %cr4
-
- jmp 1f
-1:
+ andl $(X86_CR4_PAE | X86_CR4_LA57), %r13d
+ ALTERNATIVE "", __stringify(orl $X86_CR4_MCE, %r13d), X86_FEATURE_TDX_GUEST
+ movq %r13, %cr4
/* Flush the TLB (needed?) */
movq %r9, %cr3
@@ -164,12 +204,11 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
* entries that will conflict with the now unencrypted memory
* used by kexec. Flush the caches before copying the kernel.
*/
- testq %r12, %r12
- jz 1f
+ testq %r8, %r8
+ jz .Lsme_off
wbinvd
-1:
+.Lsme_off:
- movq %rcx, %r11
call swap_pages
/*
@@ -181,13 +220,14 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
movq %cr3, %rax
movq %rax, %cr3
+ testq %r11, %r11 /* preserve_context */
+ jnz .Lrelocate
+
/*
* set all of the registers to known values
* leave %rsp alone
*/
- testq %r11, %r11
- jnz 1f
xorl %eax, %eax
xorl %ebx, %ebx
xorl %ecx, %ecx
@@ -208,22 +248,36 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
ret
int3
-1:
+.Lrelocate:
popq %rdx
+
+ /* Use the swap page for the callee's stack */
+ movq kexec_pa_swap_page(%rip), %r10
leaq PAGE_SIZE(%r10), %rsp
+
+ /* push the existing entry point onto the callee's stack */
+ pushq %rdx
+
ANNOTATE_RETPOLINE_SAFE
call *%rdx
/* get the re-entry point of the peer system */
- movq 0(%rsp), %rbp
- leaq relocate_kernel(%rip), %r8
- movq CP_PA_SWAP_PAGE(%r8), %r10
- movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
- movq CP_PA_TABLE_PAGE(%r8), %rax
+ popq %rbp
+ movq kexec_pa_swap_page(%rip), %r10
+ movq pa_backup_pages_map(%rip), %rdi
+ movq kexec_pa_table_page(%rip), %rax
movq %rax, %cr3
+
+ /* Find start (and end) of this physical mapping of control page */
+ leaq (%rip), %r8
+ ANNOTATE_NOENDBR
+ andq $PAGE_MASK, %r8
lea PAGE_SIZE(%r8), %rsp
+ movl $1, %r11d /* Ensure preserve_context flag is set */
call swap_pages
- movq $virtual_mapped, %rax
+ movq kexec_va_control_page(%rip), %rax
+0: addq $virtual_mapped - 0b, %rax
+ subq $__relocate_kernel_start - 0b, %rax
pushq %rax
ANNOTATE_UNRET_SAFE
ret
@@ -233,13 +287,21 @@ SYM_CODE_END(identity_mapped)
SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR // RET target, above
- movq RSP(%r8), %rsp
- movq CR4(%r8), %rax
+ movq saved_rsp(%rip), %rsp
+ movq saved_cr4(%rip), %rax
movq %rax, %cr4
- movq CR3(%r8), %rax
- movq CR0(%r8), %r8
+ movq saved_cr3(%rip), %rax
+ movq saved_cr0(%rip), %r8
movq %rax, %cr3
movq %r8, %cr0
+
+#ifdef CONFIG_KEXEC_JUMP
+ /* Saved in save_processor_state. */
+ movq $saved_context, %rax
+ lgdt saved_context_gdt_desc(%rax)
+#endif
+
+ /* relocate_kernel() returns the re-entry point for next time */
movq %rbp, %rax
popf
@@ -257,61 +319,288 @@ SYM_CODE_END(virtual_mapped)
/* Do the copies */
SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
UNWIND_HINT_END_OF_STACK
- movq %rdi, %rcx /* Put the page_list in %rcx */
+ /*
+ * %rdi indirection page
+ * %r11 preserve_context
+ */
+ movq %rdi, %rcx /* Put the indirection_page in %rcx */
xorl %edi, %edi
xorl %esi, %esi
- jmp 1f
+ jmp .Lstart /* Should start with an indirection record */
-0: /* top, read another word for the indirection page */
+.Lloop: /* top, read another word for the indirection page */
movq (%rbx), %rcx
addq $8, %rbx
-1:
+.Lstart:
testb $0x1, %cl /* is it a destination page? */
- jz 2f
+ jz .Lnotdest
movq %rcx, %rdi
andq $0xfffffffffffff000, %rdi
- jmp 0b
-2:
+ jmp .Lloop
+.Lnotdest:
testb $0x2, %cl /* is it an indirection page? */
- jz 2f
+ jz .Lnotind
movq %rcx, %rbx
andq $0xfffffffffffff000, %rbx
- jmp 0b
-2:
+ jmp .Lloop
+.Lnotind:
testb $0x4, %cl /* is it the done indicator? */
- jz 2f
- jmp 3f
-2:
+ jz .Lnotdone
+ jmp .Ldone
+.Lnotdone:
testb $0x8, %cl /* is it the source indicator? */
- jz 0b /* Ignore it otherwise */
+ jz .Lloop /* Ignore it otherwise */
movq %rcx, %rsi /* For ever source page do a copy */
andq $0xfffffffffffff000, %rsi
- movq %rdi, %rdx
- movq %rsi, %rax
+ movq %rdi, %rdx /* Save destination page to %rdx */
+ movq %rsi, %rax /* Save source page to %rax */
+
+ testq %r11, %r11 /* Only actually swap for ::preserve_context */
+ jz .Lnoswap
- movq %r10, %rdi
+ /* copy source page to swap page */
+ movq kexec_pa_swap_page(%rip), %rdi
movl $512, %ecx
- rep ; movsq
+ rep movsq
+ /* copy destination page to source page */
movq %rax, %rdi
movq %rdx, %rsi
movl $512, %ecx
- rep ; movsq
+ rep movsq
+ /* copy swap page to destination page */
movq %rdx, %rdi
- movq %r10, %rsi
+ movq kexec_pa_swap_page(%rip), %rsi
+.Lnoswap:
movl $512, %ecx
- rep ; movsq
+ rep movsq
lea PAGE_SIZE(%rax), %rsi
- jmp 0b
-3:
+ jmp .Lloop
+.Ldone:
ANNOTATE_UNRET_SAFE
ret
int3
SYM_CODE_END(swap_pages)
- .skip KEXEC_CONTROL_CODE_MAX_SIZE - (. - relocate_kernel), 0xcc
-SYM_CODE_END(relocate_range);
+/*
+ * Generic 'print character' routine
+ * - %al: Character to be printed (may clobber %rax)
+ * - %rdx: MMIO address or port.
+ */
+#define XMTRDY 0x20
+
+#define TXR 0 /* Transmit register (WRITE) */
+#define LSR 5 /* Line Status */
+
+SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ addw $LSR, %dx
+ xchg %al, %ah
+.Lxmtrdy_loop:
+ inb %dx, %al
+ testb $XMTRDY, %al
+ jnz .Lready
+ pause
+ jmp .Lxmtrdy_loop
+
+.Lready:
+ subw $LSR, %dx
+ xchg %al, %ah
+ outb %al, %dx
+pr_char_null:
+ ANNOTATE_NOENDBR
+
+ ANNOTATE_UNRET_SAFE
+ ret
+SYM_CODE_END(pr_char_8250)
+
+SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250_mmio32)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+.Lxmtrdy_loop_mmio:
+ movb (LSR*4)(%rdx), %ah
+ testb $XMTRDY, %ah
+ jnz .Lready_mmio
+ pause
+ jmp .Lxmtrdy_loop_mmio
+
+.Lready_mmio:
+ movb %al, (%rdx)
+ ANNOTATE_UNRET_SAFE
+ ret
+SYM_CODE_END(pr_char_8250_mmio32)
+
+/*
+ * Load pr_char function pointer into %rsi and load %rdx with whatever
+ * that function wants to see there (typically port/MMIO address).
+ */
+.macro pr_setup
+ leaq pr_char_8250(%rip), %rsi
+ movw kexec_debug_8250_port(%rip), %dx
+ testw %dx, %dx
+ jnz 1f
+
+ leaq pr_char_8250_mmio32(%rip), %rsi
+ movq kexec_debug_8250_mmio32(%rip), %rdx
+ testq %rdx, %rdx
+ jnz 1f
+
+ leaq pr_char_null(%rip), %rsi
+1:
+.endm
+
+/* Print the nybble in %bl, clobber %rax */
+SYM_CODE_START_LOCAL_NOALIGN(pr_nybble)
+ UNWIND_HINT_FUNC
+ movb %bl, %al
+ nop
+ andb $0x0f, %al
+ addb $0x30, %al
+ cmpb $0x3a, %al
+ jb 1f
+ addb $('a' - '0' - 10), %al
+ ANNOTATE_RETPOLINE_SAFE
+1: jmp *%rsi
+SYM_CODE_END(pr_nybble)
+
+SYM_CODE_START_LOCAL_NOALIGN(pr_qword)
+ UNWIND_HINT_FUNC
+ movq $16, %rcx
+1: rolq $4, %rbx
+ call pr_nybble
+ loop 1b
+ movb $'\n', %al
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%rsi
+SYM_CODE_END(pr_qword)
+
+.macro print_reg a, b, c, d, r
+ movb $\a, %al
+ ANNOTATE_RETPOLINE_SAFE
+ call *%rsi
+ movb $\b, %al
+ ANNOTATE_RETPOLINE_SAFE
+ call *%rsi
+ movb $\c, %al
+ ANNOTATE_RETPOLINE_SAFE
+ call *%rsi
+ movb $\d, %al
+ ANNOTATE_RETPOLINE_SAFE
+ call *%rsi
+ movq \r, %rbx
+ call pr_qword
+.endm
+
+SYM_CODE_START_NOALIGN(kexec_debug_exc_vectors)
+ /* Each of these is 6 bytes. */
+.macro vec_err exc
+ UNWIND_HINT_ENTRY
+ . = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE)
+ nop
+ nop
+ pushq $\exc
+ jmp exc_handler
+.endm
+
+.macro vec_noerr exc
+ UNWIND_HINT_ENTRY
+ . = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE)
+ pushq $0
+ pushq $\exc
+ jmp exc_handler
+.endm
+
+ ANNOTATE_NOENDBR
+ vec_noerr 0 // #DE
+ vec_noerr 1 // #DB
+ vec_noerr 2 // #NMI
+ vec_noerr 3 // #BP
+ vec_noerr 4 // #OF
+ vec_noerr 5 // #BR
+ vec_noerr 6 // #UD
+ vec_noerr 7 // #NM
+ vec_err 8 // #DF
+ vec_noerr 9
+ vec_err 10 // #TS
+ vec_err 11 // #NP
+ vec_err 12 // #SS
+ vec_err 13 // #GP
+ vec_err 14 // #PF
+ vec_noerr 15
+SYM_CODE_END(kexec_debug_exc_vectors)
+
+SYM_CODE_START_LOCAL_NOALIGN(exc_handler)
+ /* No need for RET mitigations during kexec */
+ VALIDATE_UNRET_END
+
+ pushq %rax
+ pushq %rbx
+ pushq %rcx
+ pushq %rdx
+ pushq %rsi
+
+ /* Stack frame */
+#define EXC_SS 0x58 /* Architectural... */
+#define EXC_RSP 0x50
+#define EXC_EFLAGS 0x48
+#define EXC_CS 0x40
+#define EXC_RIP 0x38
+#define EXC_ERRORCODE 0x30 /* Either architectural or zero pushed by handler */
+#define EXC_EXCEPTION 0x28 /* Pushed by handler entry point */
+#define EXC_RAX 0x20 /* Pushed just above in exc_handler */
+#define EXC_RBX 0x18
+#define EXC_RCX 0x10
+#define EXC_RDX 0x08
+#define EXC_RSI 0x00
+
+ /* Set up %rdx/%rsi for debug output */
+ pr_setup
+
+ /* rip and exception info */
+ print_reg 'E', 'x', 'c', ':', EXC_EXCEPTION(%rsp)
+ print_reg 'E', 'r', 'r', ':', EXC_ERRORCODE(%rsp)
+ print_reg 'r', 'i', 'p', ':', EXC_RIP(%rsp)
+ print_reg 'r', 's', 'p', ':', EXC_RSP(%rsp)
+
+ /* We spilled these to the stack */
+ print_reg 'r', 'a', 'x', ':', EXC_RAX(%rsp)
+ print_reg 'r', 'b', 'x', ':', EXC_RBX(%rsp)
+ print_reg 'r', 'c', 'x', ':', EXC_RCX(%rsp)
+ print_reg 'r', 'd', 'x', ':', EXC_RDX(%rsp)
+ print_reg 'r', 's', 'i', ':', EXC_RSI(%rsp)
+
+ /* Other registers untouched */
+ print_reg 'r', 'd', 'i', ':', %rdi
+ print_reg 'r', '8', ' ', ':', %r8
+ print_reg 'r', '9', ' ', ':', %r9
+ print_reg 'r', '1', '0', ':', %r10
+ print_reg 'r', '1', '1', ':', %r11
+ print_reg 'r', '1', '2', ':', %r12
+ print_reg 'r', '1', '3', ':', %r13
+ print_reg 'r', '1', '4', ':', %r14
+ print_reg 'r', '1', '5', ':', %r15
+ print_reg 'c', 'r', '2', ':', %cr2
+
+ /* Only return from INT3 */
+ cmpq $3, EXC_EXCEPTION(%rsp)
+ jne .Ldie
+
+ popq %rsi
+ popq %rdx
+ popq %rcx
+ popq %rbx
+ popq %rax
+
+ addq $16, %rsp
+ iretq
+
+.Ldie:
+ hlt
+ jmp .Ldie
+
+SYM_CODE_END(exc_handler)