diff options
Diffstat (limited to 'arch/x86/kernel/head_64.S')
| -rw-r--r-- | arch/x86/kernel/head_64.S | 258 |
1 files changed, 118 insertions, 140 deletions
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ea6995920b7a..21816b48537c 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -9,7 +9,7 @@ * Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com> */ - +#include <linux/export.h> #include <linux/linkage.h> #include <linux/threads.h> #include <linux/init.h> @@ -22,26 +22,18 @@ #include <asm/percpu.h> #include <asm/nops.h> #include "../entry/calling.h" -#include <asm/export.h> #include <asm/nospec-branch.h> #include <asm/apicdef.h> #include <asm/fixmap.h> #include <asm/smp.h> +#include <asm/thread_info.h> /* * We are not able to switch in one step to the final KERNEL ADDRESS SPACE * because we need identity-mapped pages. */ -#define l4_index(x) (((x) >> 39) & 511) -#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) - -L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE_L4) -L4_START_KERNEL = l4_index(__START_KERNEL_map) - -L3_START_KERNEL = pud_index(__START_KERNEL_map) - .text - __HEAD + __INIT .code64 SYM_CODE_START_NOALIGN(startup_64) UNWIND_HINT_END_OF_STACK @@ -67,18 +59,19 @@ SYM_CODE_START_NOALIGN(startup_64) mov %rsi, %r15 /* Set up the stack for verify_cpu() */ - leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp - - leaq _text(%rip), %rdi + leaq __top_init_kernel_stack(%rip), %rsp - /* Setup GSBASE to allow stack canary access for C code */ + /* + * Set up GSBASE. + * Note that on SMP the boot CPU uses the init data section until + * the per-CPU areas are set up. + */ movl $MSR_GS_BASE, %ecx - leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx - movl %edx, %eax - shrq $32, %rdx + xorl %eax, %eax + xorl %edx, %edx wrmsr - call startup_64_setup_env + call __pi_startup_64_setup_gdt_idt /* Now switch to __KERNEL_CS so IRET works reliably */ pushq $__KERNEL_CS @@ -87,6 +80,7 @@ SYM_CODE_START_NOALIGN(startup_64) lretq .Lon_kernel_cs: + ANNOTATE_NOENDBR UNWIND_HINT_END_OF_STACK #ifdef CONFIG_AMD_MEM_ENCRYPT @@ -97,27 +91,59 @@ SYM_CODE_START_NOALIGN(startup_64) * subsequent code. Pass the boot_params pointer as the first argument. */ movq %r15, %rdi - call sme_enable + call __pi_sme_enable #endif /* Sanitize CPU configuration */ call verify_cpu /* + * Derive the kernel's physical-to-virtual offset from the physical and + * virtual addresses of common_startup_64(). + */ + leaq common_startup_64(%rip), %rdi + subq .Lcommon_startup_64(%rip), %rdi + + /* * Perform pagetable fixups. Additionally, if SME is active, encrypt * the kernel and retrieve the modifier (SME encryption mask if SME * is active) to be added to the initial pgdir entry that will be * programmed into CR3. */ - leaq _text(%rip), %rdi movq %r15, %rsi - call __startup_64 + call __pi___startup_64 /* Form the CR3 value being sure to include the CR3 modifier */ - addq $(early_top_pgt - __START_KERNEL_map), %rax - jmp 1f + leaq early_top_pgt(%rip), %rcx + addq %rcx, %rax + +#ifdef CONFIG_AMD_MEM_ENCRYPT + mov %rax, %rdi + + /* + * For SEV guests: Verify that the C-bit is correct. A malicious + * hypervisor could lie about the C-bit position to perform a ROP + * attack on the guest by writing to the unencrypted stack and wait for + * the next RET instruction. + */ + call sev_verify_cbit +#endif + + /* + * Switch to early_top_pgt which still has the identity mappings + * present. + */ + movq %rax, %cr3 + + /* Branch to the common startup code at its kernel virtual address */ + ANNOTATE_RETPOLINE_SAFE + jmp *.Lcommon_startup_64(%rip) SYM_CODE_END(startup_64) + __INITRODATA +SYM_DATA_LOCAL(.Lcommon_startup_64, .quad common_startup_64) + + .text SYM_CODE_START(secondary_startup_64) UNWIND_HINT_END_OF_STACK ANNOTATE_NOENDBR @@ -150,22 +176,39 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) ANNOTATE_NOENDBR /* Clear %R15 which holds the boot_params pointer on the boot CPU */ - xorq %r15, %r15 + xorl %r15d, %r15d + + /* Derive the runtime physical address of init_top_pgt[] */ + movq phys_base(%rip), %rax + addq $(init_top_pgt - __START_KERNEL_map), %rax /* * Retrieve the modifier (SME encryption mask if SME is active) to be * added to the initial pgdir entry that will be programmed into CR3. */ #ifdef CONFIG_AMD_MEM_ENCRYPT - movq sme_me_mask, %rax -#else - xorq %rax, %rax + addq sme_me_mask(%rip), %rax #endif + /* + * Switch to the init_top_pgt here, away from the trampoline_pgd and + * unmap the identity mapped ranges. + */ + movq %rax, %cr3 - /* Form the CR3 value being sure to include the CR3 modifier */ - addq $(init_top_pgt - __START_KERNEL_map), %rax -1: +SYM_INNER_LABEL(common_startup_64, SYM_L_LOCAL) + UNWIND_HINT_END_OF_STACK + ANNOTATE_NOENDBR + /* + * Create a mask of CR4 bits to preserve. Omit PGE in order to flush + * global 1:1 translations from the TLBs. + * + * From the SDM: + * "If CR4.PGE is changing from 0 to 1, there were no global TLB + * entries before the execution; if CR4.PGE is changing from 1 to 0, + * there will be no global TLB entries after the execution." + */ + movl $(X86_CR4_PAE | X86_CR4_LA57), %edx #ifdef CONFIG_X86_MCE /* * Preserve CR4.MCE if the kernel will enable #MC support. @@ -174,61 +217,20 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) * configured will crash the system regardless of the CR4.MCE value set * here. */ - movq %cr4, %rcx - andl $X86_CR4_MCE, %ecx -#else - movl $0, %ecx + orl $X86_CR4_MCE, %edx #endif + movq %cr4, %rcx + andl %edx, %ecx - /* Enable PAE mode, PGE and LA57 */ - orl $(X86_CR4_PAE | X86_CR4_PGE), %ecx -#ifdef CONFIG_X86_5LEVEL - testl $1, __pgtable_l5_enabled(%rip) - jz 1f - orl $X86_CR4_LA57, %ecx -1: -#endif + /* Even if ignored in long mode, set PSE uniformly on all logical CPUs. */ + btsl $X86_CR4_PSE_BIT, %ecx movq %rcx, %cr4 - /* Setup early boot stage 4-/5-level pagetables. */ - addq phys_base(%rip), %rax - - /* - * For SEV guests: Verify that the C-bit is correct. A malicious - * hypervisor could lie about the C-bit position to perform a ROP - * attack on the guest by writing to the unencrypted stack and wait for - * the next RET instruction. - */ - movq %rax, %rdi - call sev_verify_cbit - /* - * Switch to new page-table - * - * For the boot CPU this switches to early_top_pgt which still has the - * indentity mappings present. The secondary CPUs will switch to the - * init_top_pgt here, away from the trampoline_pgd and unmap the - * indentity mapped ranges. - */ - movq %rax, %cr3 - - /* - * Do a global TLB flush after the CR3 switch to make sure the TLB - * entries from the identity mapping are flushed. + * Set CR4.PGE to re-enable global translations. */ - movq %cr4, %rcx - movq %rcx, %rax - xorq $X86_CR4_PGE, %rcx + btsl $X86_CR4_PGE_BIT, %ecx movq %rcx, %cr4 - movq %rax, %cr4 - - /* Ensure I am executing from virtual addresses */ - movq $1f, %rax - ANNOTATE_RETPOLINE_SAFE - jmp *%rax -1: - UNWIND_HINT_END_OF_STACK - ANNOTATE_NOENDBR // above #ifdef CONFIG_SMP /* @@ -256,6 +258,22 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) testl $X2APIC_ENABLE, %eax jnz .Lread_apicid_msr +#ifdef CONFIG_X86_X2APIC + /* + * If system is in X2APIC mode then MMIO base might not be + * mapped causing the MMIO read below to fault. Faults can't + * be handled at that point. + */ + cmpl $0, x2apic_mode(%rip) + jz .Lread_apicid_mmio + + /* Force the AP into X2APIC mode. */ + orl $X2APIC_ENABLE, %eax + wrmsr + jmp .Lread_apicid_msr +#endif + +.Lread_apicid_mmio: /* Read the APIC ID from the fix-mapped MMIO space. */ movq apic_mmio_base(%rip), %rcx addq $APIC_ID, %rcx @@ -269,7 +287,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) .Llookup_AP: /* EAX contains the APIC ID of the current CPU */ - xorq %rcx, %rcx + xorl %ecx, %ecx leaq cpuid_to_apicid(%rip), %rbx .Lfind_cpunr: @@ -304,7 +322,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) * * RDX contains the per-cpu offset */ - movq pcpu_hot + X86_current_task(%rdx), %rax + movq current_task(%rdx), %rax movq TASK_threadsp(%rax), %rsp /* @@ -344,17 +362,12 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) movl %eax,%fs movl %eax,%gs - /* Set up %gs. - * - * The base of %gs always points to fixed_percpu_data. If the - * stack protector canary is enabled, it is located at %gs:40. + /* + * Set up GSBASE. * Note that, on SMP, the boot cpu uses init data section until * the per cpu areas are set up. */ movl $MSR_GS_BASE,%ecx -#ifndef CONFIG_SMP - leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx -#endif movl %edx, %eax shrq $32, %rdx wrmsr @@ -400,39 +413,10 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) movq %r15, %rdi .Ljump_to_C_code: - /* - * Jump to run C code and to be on a real kernel address. - * Since we are running on identity-mapped space we have to jump - * to the full 64bit address, this is only possible as indirect - * jump. In addition we need to ensure %cs is set so we make this - * a far return. - * - * Note: do not change to far jump indirect with 64bit offset. - * - * AMD does not support far jump indirect with 64bit offset. - * AMD64 Architecture Programmer's Manual, Volume 3: states only - * JMP FAR mem16:16 FF /5 Far jump indirect, - * with the target specified by a far pointer in memory. - * JMP FAR mem16:32 FF /5 Far jump indirect, - * with the target specified by a far pointer in memory. - * - * Intel64 does support 64bit offset. - * Software Developer Manual Vol 2: states: - * FF /5 JMP m16:16 Jump far, absolute indirect, - * address given in m16:16 - * FF /5 JMP m16:32 Jump far, absolute indirect, - * address given in m16:32. - * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect, - * address given in m16:64. - */ - pushq $.Lafter_lret # put return address on stack for unwinder xorl %ebp, %ebp # clear frame pointer - movq initial_code(%rip), %rax - pushq $__KERNEL_CS # set correct cs - pushq %rax # target address in negative space - lretq -.Lafter_lret: - ANNOTATE_NOENDBR + ANNOTATE_RETPOLINE_SAFE + callq *initial_code(%rip) + ud2 SYM_CODE_END(secondary_startup_64) #include "verify_cpu.S" @@ -449,7 +433,7 @@ SYM_CODE_START(soft_restart_cpu) UNWIND_HINT_END_OF_STACK /* Find the idle task stack */ - movq PER_CPU_VAR(pcpu_hot) + X86_current_task, %rcx + movq PER_CPU_VAR(current_task), %rcx movq TASK_threadsp(%rcx), %rsp jmp .Ljump_to_C_code @@ -578,7 +562,7 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb) /* Call C handler */ movq %rsp, %rdi movq ORIG_RAX(%rsp), %rsi - call do_vc_no_ghcb + call __pi_do_vc_no_ghcb /* Unwind pt_regs */ POP_REGS @@ -589,12 +573,10 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb) /* Pure iret required here - don't use INTERRUPT_RETURN */ iretq SYM_CODE_END(vc_no_ghcb) +SYM_PIC_ALIAS(vc_no_ghcb); #endif -#define SYM_DATA_START_PAGE_ALIGNED(name) \ - SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE) - -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* * Each PGD needs to be 8k long and 8k aligned. We do not * ever go out to userspace with these, so we do not @@ -615,25 +597,20 @@ SYM_CODE_END(vc_no_ghcb) #define PTI_USER_PGD_FILL 0 #endif -/* Automate the creation of 1 to 1 mapping pmd entries */ -#define PMDS(START, PERM, COUNT) \ - i = 0 ; \ - .rept (COUNT) ; \ - .quad (START) + (i << PMD_SHIFT) + (PERM) ; \ - i = i + 1 ; \ - .endr - __INITDATA .balign 4 SYM_DATA_START_PTI_ALIGNED(early_top_pgt) - .fill 512,8,0 + .fill 511,8,0 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC .fill PTI_USER_PGD_FILL,8,0 SYM_DATA_END(early_top_pgt) +SYM_PIC_ALIAS(early_top_pgt) SYM_DATA_START_PAGE_ALIGNED(early_dynamic_pgts) .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 SYM_DATA_END(early_dynamic_pgts) +SYM_PIC_ALIAS(early_dynamic_pgts); SYM_DATA(early_recursion_flag, .long 0) @@ -672,12 +649,11 @@ SYM_DATA_START_PTI_ALIGNED(init_top_pgt) SYM_DATA_END(init_top_pgt) #endif -#ifdef CONFIG_X86_5LEVEL SYM_DATA_START_PAGE_ALIGNED(level4_kernel_pgt) .fill 511,8,0 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC SYM_DATA_END(level4_kernel_pgt) -#endif +SYM_PIC_ALIAS(level4_kernel_pgt) SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 @@ -685,6 +661,7 @@ SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt) .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC SYM_DATA_END(level3_kernel_pgt) +SYM_PIC_ALIAS(level3_kernel_pgt) SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt) /* @@ -702,6 +679,7 @@ SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt) */ PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE) SYM_DATA_END(level2_kernel_pgt) +SYM_PIC_ALIAS(level2_kernel_pgt) SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt) .fill (512 - 4 - FIXMAP_PMD_NUM),8,0 @@ -714,6 +692,7 @@ SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt) /* 6 MB reserved space + a 2MB hole */ .fill 4,8,0 SYM_DATA_END(level2_fixmap_pgt) +SYM_PIC_ALIAS(level2_fixmap_pgt) SYM_DATA_START_PAGE_ALIGNED(level1_fixmap_pgt) .rept (FIXMAP_PMD_NUM) @@ -721,8 +700,6 @@ SYM_DATA_START_PAGE_ALIGNED(level1_fixmap_pgt) .endr SYM_DATA_END(level1_fixmap_pgt) -#undef PMDS - .data .align 16 @@ -731,9 +708,10 @@ SYM_DATA(smpboot_control, .long 0) .align 16 /* This must match the first entry in level2_kernel_pgt */ SYM_DATA(phys_base, .quad 0x0) +SYM_PIC_ALIAS(phys_base); EXPORT_SYMBOL(phys_base) -#include "../../x86/xen/xen-head.S" +#include "../xen/xen-head.S" __PAGE_ALIGNED_BSS SYM_DATA_START_PAGE_ALIGNED(empty_zero_page) |
