diff options
Diffstat (limited to 'arch/x86/entry/entry_64.S')
| -rw-r--r-- | arch/x86/entry/entry_64.S | 325 |
1 files changed, 180 insertions, 145 deletions
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 15739a2c0983..f9983a1907bf 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -8,7 +8,7 @@ * * entry.S contains the system-call and fault low-level handling routines. * - * Some of this is documented in Documentation/x86/entry_64.rst + * Some of this is documented in Documentation/arch/x86/entry_64.rst * * A note on terminology: * - iret frame: Architecture defined interrupt frame from SS to RIP @@ -18,6 +18,8 @@ * - SYM_FUNC_START/END:Define functions in the symbol table. * - idtentry: Define exception entry points. */ +#include <linux/export.h> +#include <linux/kvm_types.h> #include <linux/linkage.h> #include <asm/segment.h> #include <asm/cache.h> @@ -34,7 +36,6 @@ #include <asm/asm.h> #include <asm/smap.h> #include <asm/pgtable_types.h> -#include <asm/export.h> #include <asm/frame.h> #include <asm/trapnr.h> #include <asm/nospec-branch.h> @@ -92,7 +93,7 @@ SYM_CODE_START(entry_SYSCALL_64) /* tss.sp2 is scratch space. */ movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2) SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp - movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL) ANNOTATE_NOENDBR @@ -116,6 +117,7 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) /* clobbers %rax, make sure it is after saving the syscall nr */ IBRS_ENTER UNTRAIN_RET + CLEAR_BRANCH_HISTORY call do_syscall_64 /* returns with IRQs disabled */ @@ -126,70 +128,8 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) * In the Xen PV case we must use iret anyway. */ - ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \ - X86_FEATURE_XENPV - - movq RCX(%rsp), %rcx - movq RIP(%rsp), %r11 - - cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */ - jne swapgs_restore_regs_and_return_to_usermode - - /* - * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP - * in kernel space. This essentially lets the user take over - * the kernel, since userspace controls RSP. - * - * If width of "canonical tail" ever becomes variable, this will need - * to be updated to remain correct on both old and new CPUs. - * - * Change top bits to match most significant bit (47th or 56th bit - * depending on paging mode) in the address. - */ -#ifdef CONFIG_X86_5LEVEL - ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \ - "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57 -#else - shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx - sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx -#endif - - /* If this changed %rcx, it was not canonical */ - cmpq %rcx, %r11 - jne swapgs_restore_regs_and_return_to_usermode - - cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */ - jne swapgs_restore_regs_and_return_to_usermode - - movq R11(%rsp), %r11 - cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */ - jne swapgs_restore_regs_and_return_to_usermode - - /* - * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot - * restore RF properly. If the slowpath sets it for whatever reason, we - * need to restore it correctly. - * - * SYSRET can restore TF, but unlike IRET, restoring TF results in a - * trap from userspace immediately after SYSRET. This would cause an - * infinite loop whenever #DB happens with register state that satisfies - * the opportunistic SYSRET conditions. For example, single-stepping - * this user code: - * - * movq $stuck_here, %rcx - * pushfq - * popq %r11 - * stuck_here: - * - * would never get past 'stuck_here'. - */ - testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 - jnz swapgs_restore_regs_and_return_to_usermode - - /* nothing to check for RSP */ - - cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */ - jne swapgs_restore_regs_and_return_to_usermode + ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \ + "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV /* * We win! This label is here just for ease of understanding @@ -205,7 +145,7 @@ syscall_return_via_sysret: */ movq %rsp, %rdi movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp - UNWIND_HINT_EMPTY + UNWIND_HINT_END_OF_STACK pushq RSP-RDI(%rdi) /* RSP */ pushq (%rdi) /* RDI */ @@ -223,6 +163,7 @@ syscall_return_via_sysret: SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL) ANNOTATE_NOENDBR swapgs + CLEAR_CPU_BUFFERS sysretq SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR @@ -235,6 +176,7 @@ SYM_CODE_END(entry_SYSCALL_64) */ .pushsection .text, "ax" SYM_FUNC_START(__switch_to_asm) + ANNOTATE_NOENDBR /* * Save callee-saved registers * This must match the order in inactive_task_frame @@ -252,7 +194,7 @@ SYM_FUNC_START(__switch_to_asm) #ifdef CONFIG_STACKPROTECTOR movq TASK_stack_canary(%rsi), %rbx - movq %rbx, PER_CPU_VAR(fixed_percpu_data) + FIXED_stack_canary + movq %rbx, PER_CPU_VAR(__stack_chk_guard) #endif /* @@ -284,36 +226,39 @@ SYM_FUNC_END(__switch_to_asm) * r12: kernel thread arg */ .pushsection .text, "ax" - __FUNC_ALIGN -SYM_CODE_START_NOALIGN(ret_from_fork) - UNWIND_HINT_EMPTY +SYM_CODE_START(ret_from_fork_asm) + /* + * This is the start of the kernel stack; even through there's a + * register set at the top, the regset isn't necessarily coherent + * (consider kthreads) and one cannot unwind further. + * + * This ensures stack unwinds of kernel threads terminate in a known + * good state. + */ + UNWIND_HINT_END_OF_STACK ANNOTATE_NOENDBR // copy_thread CALL_DEPTH_ACCOUNT - movq %rax, %rdi - call schedule_tail /* rdi: 'prev' task parameter */ - - testq %rbx, %rbx /* from kernel_thread? */ - jnz 1f /* kernel threads are uncommon */ -2: - UNWIND_HINT_REGS - movq %rsp, %rdi - call syscall_exit_to_user_mode /* returns with IRQs disabled */ - jmp swapgs_restore_regs_and_return_to_usermode + movq %rax, %rdi /* prev */ + movq %rsp, %rsi /* regs */ + movq %rbx, %rdx /* fn */ + movq %r12, %rcx /* fn_arg */ + call ret_from_fork -1: - /* kernel thread */ - UNWIND_HINT_EMPTY - movq %r12, %rdi - CALL_NOSPEC rbx /* - * A kernel thread is allowed to return here after successfully - * calling kernel_execve(). Exit to userspace to complete the execve() - * syscall. + * Set the stack state to what is expected for the target function + * -- at this point the register set should be a valid user set + * and unwind should work normally. */ - movq $0, RAX(%rsp) - jmp 2b -SYM_CODE_END(ret_from_fork) + UNWIND_HINT_REGS + +#ifdef CONFIG_X86_FRED + ALTERNATIVE "jmp swapgs_restore_regs_and_return_to_usermode", \ + "jmp asm_fred_exit_user", X86_FEATURE_FRED +#else + jmp swapgs_restore_regs_and_return_to_usermode +#endif +SYM_CODE_END(ret_from_fork_asm) .popsection .macro DEBUG_ENTRY_ASSERT_IRQS_OFF @@ -365,10 +310,9 @@ SYM_CODE_END(xen_error_entry) movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ .endif - call \cfunc - /* For some configurations \cfunc ends up being a noreturn. */ - REACHABLE + ANNOTATE_REACHABLE + call \cfunc jmp error_return .endm @@ -385,7 +329,14 @@ SYM_CODE_END(xen_error_entry) */ .macro idtentry vector asmsym cfunc has_error_code:req SYM_CODE_START(\asmsym) - UNWIND_HINT_IRET_REGS offset=\has_error_code*8 + + .if \vector == X86_TRAP_BP + /* #BP advances %rip to the next instruction */ + UNWIND_HINT_IRET_ENTRY offset=\has_error_code*8 signal=0 + .else + UNWIND_HINT_IRET_ENTRY offset=\has_error_code*8 + .endif + ENDBR ASM_CLAC cld @@ -428,14 +379,6 @@ SYM_CODE_END(\asmsym) idtentry \vector asm_\cfunc \cfunc has_error_code=1 .endm -/* - * System vectors which invoke their handlers directly and are not - * going through the regular common device interrupt handling code. - */ -.macro idtentry_sysvec vector cfunc - idtentry \vector asm_\cfunc \cfunc has_error_code=0 -.endm - /** * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB * @vector: Vector number @@ -454,7 +397,7 @@ SYM_CODE_END(\asmsym) */ .macro idtentry_mce_db vector asmsym cfunc SYM_CODE_START(\asmsym) - UNWIND_HINT_IRET_REGS + UNWIND_HINT_IRET_ENTRY ENDBR ASM_CLAC cld @@ -511,7 +454,7 @@ SYM_CODE_END(\asmsym) */ .macro idtentry_vc vector asmsym cfunc SYM_CODE_START(\asmsym) - UNWIND_HINT_IRET_REGS + UNWIND_HINT_IRET_ENTRY ENDBR ASM_CLAC cld @@ -575,7 +518,7 @@ SYM_CODE_END(\asmsym) */ .macro idtentry_df vector asmsym cfunc SYM_CODE_START(\asmsym) - UNWIND_HINT_IRET_REGS offset=8 + UNWIND_HINT_IRET_ENTRY offset=8 ENDBR ASM_CLAC cld @@ -587,10 +530,10 @@ SYM_CODE_START(\asmsym) movq %rsp, %rdi /* pt_regs pointer into first argument */ movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ - call \cfunc /* For some configurations \cfunc ends up being a noreturn. */ - REACHABLE + ANNOTATE_REACHABLE + call \cfunc jmp paranoid_exit @@ -617,17 +560,28 @@ __irqentry_text_end: SYM_CODE_START_LOCAL(common_interrupt_return) SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) IBRS_EXIT -#ifdef CONFIG_DEBUG_ENTRY - /* Assert that pt_regs indicates user mode. */ - testb $3, CS(%rsp) - jnz 1f - ud2 -1: -#endif #ifdef CONFIG_XEN_PV ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV #endif +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION + ALTERNATIVE "", "jmp .Lpti_restore_regs_and_return_to_usermode", X86_FEATURE_PTI +#endif + + STACKLEAK_ERASE + POP_REGS + add $8, %rsp /* orig_ax */ + UNWIND_HINT_IRET_REGS +.Lswapgs_and_iret: + swapgs + CLEAR_CPU_BUFFERS + /* Assert that the IRET frame indicates user mode. */ + testb $3, 8(%rsp) + jnz .Lnative_iret + ud2 + +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION +.Lpti_restore_regs_and_return_to_usermode: POP_REGS pop_rdi=0 /* @@ -636,7 +590,7 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) */ movq %rsp, %rdi movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp - UNWIND_HINT_EMPTY + UNWIND_HINT_END_OF_STACK /* Copy the IRET frame to the trampoline stack. */ pushq 6*8(%rdi) /* SS */ @@ -654,13 +608,14 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) */ STACKLEAK_ERASE_NOCLOBBER - SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi + push %rax + SWITCH_TO_USER_CR3 scratch_reg=%rdi scratch_reg2=%rax + pop %rax /* Restore RDI. */ popq %rdi - swapgs - jmp .Lnative_iret - + jmp .Lswapgs_and_iret +#endif SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL) #ifdef CONFIG_DEBUG_ENTRY @@ -770,6 +725,8 @@ native_irq_return_ldt: */ popq %rax /* Restore user RAX */ + CLEAR_CPU_BUFFERS + /* * RSP now points to an ordinary IRET frame, except that the page * is read-only and RSP[31:16] are preloaded with the userspace @@ -782,11 +739,12 @@ _ASM_NOKPROBE(common_interrupt_return) /* * Reload gs selector with exception handling - * edi: new selector + * di: new selector * * Is in entry.text as it shouldn't be instrumented. */ SYM_FUNC_START(asm_load_gs_index) + ANNOTATE_NOENDBR FRAME_BEGIN swapgs .Lgs_change: @@ -862,7 +820,7 @@ SYM_CODE_END(exc_xen_hypervisor_callback) */ __FUNC_ALIGN SYM_CODE_START_NOALIGN(xen_failsafe_callback) - UNWIND_HINT_EMPTY + UNWIND_HINT_UNDEFINED ENDBR movl %ds, %ecx cmpw %cx, 0x10(%rsp) @@ -1015,14 +973,14 @@ SYM_CODE_START_LOCAL(paranoid_exit) IBRS_EXIT save_reg=%r15 /* - * The order of operations is important. RESTORE_CR3 requires + * The order of operations is important. PARANOID_RESTORE_CR3 requires * kernel GSBASE. * * NB to anyone to try to optimize this code: this code does * not execute at all for exceptions from user mode. Those - * exceptions go through error_exit instead. + * exceptions go through error_return instead. */ - RESTORE_CR3 scratch_reg=%rax save_reg=%r14 + PARANOID_RESTORE_CR3 scratch_reg=%rax save_reg=%r14 /* Handle the three GSBASE cases */ ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE @@ -1100,7 +1058,7 @@ SYM_CODE_START(error_entry) FENCE_SWAPGS_KERNEL_ENTRY CALL_DEPTH_ACCOUNT leaq 8(%rsp), %rax /* return pt_regs pointer */ - ANNOTATE_UNRET_END + VALIDATE_UNRET_END RET .Lbstep_iret: @@ -1143,10 +1101,10 @@ SYM_CODE_END(error_return) * * Registers: * %r14: Used to save/restore the CR3 of the interrupted context - * when PAGE_TABLE_ISOLATION is in use. Do not clobber. + * when MITIGATION_PAGE_TABLE_ISOLATION is in use. Do not clobber. */ SYM_CODE_START(asm_exc_nmi) - UNWIND_HINT_IRET_REGS + UNWIND_HINT_IRET_ENTRY ENDBR /* @@ -1159,8 +1117,8 @@ SYM_CODE_START(asm_exc_nmi) * anyway. * * To handle this case we do the following: - * Check the a special location on the stack that contains - * a variable that is set when NMIs are executing. + * Check a special location on the stack that contains a + * variable that is set when NMIs are executing. * The interrupted task's stack is also checked to see if it * is an NMI stack. * If the variable is not set and the stack is not the NMI @@ -1211,7 +1169,7 @@ SYM_CODE_START(asm_exc_nmi) FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx movq %rsp, %rdx - movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp UNWIND_HINT_IRET_REGS base=%rdx offset=8 pushq 5*8(%rdx) /* pt_regs->ss */ pushq 4*8(%rdx) /* pt_regs->rsp */ @@ -1233,7 +1191,6 @@ SYM_CODE_START(asm_exc_nmi) */ movq %rsp, %rdi - movq $-1, %rsi call exc_nmi /* @@ -1291,8 +1248,8 @@ SYM_CODE_START(asm_exc_nmi) * end_repeat_nmi, then we are a nested NMI. We must not * modify the "iret" frame because it's being written by * the outer NMI. That's okay; the outer NMI handler is - * about to about to call exc_nmi() anyway, so we can just - * resume the outer NMI. + * about to call exc_nmi() anyway, so we can just resume + * the outer NMI. */ movq $repeat_nmi, %rdx @@ -1447,14 +1404,12 @@ end_repeat_nmi: UNWIND_HINT_REGS movq %rsp, %rdi - movq $-1, %rsi call exc_nmi /* Always restore stashed SPEC_CTRL value (see paranoid_entry) */ IBRS_EXIT save_reg=%r15 - /* Always restore stashed CR3 value (see paranoid_entry) */ - RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 + PARANOID_RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 /* * The above invocation of paranoid_entry stored the GSBASE @@ -1499,6 +1454,12 @@ nmi_restore: movq $0, 5*8(%rsp) /* clear "NMI executing" */ /* + * Skip CLEAR_CPU_BUFFERS here, since it only helps in rare cases like + * NMI in kernel after user state is restored. For an unprivileged user + * these conditions are hard to meet. + */ + + /* * iretq reads the "iret" frame and exits the NMI stack in a * single instruction. We are returning to kernel mode, so this * cannot result in a fault. Similarly, we don't need to worry @@ -1507,18 +1468,17 @@ nmi_restore: iretq SYM_CODE_END(asm_exc_nmi) -#ifndef CONFIG_IA32_EMULATION /* * This handles SYSCALL from 32-bit code. There is no way to program * MSRs to fully disable 32-bit SYSCALL. */ -SYM_CODE_START(ignore_sysret) - UNWIND_HINT_EMPTY +SYM_CODE_START(entry_SYSCALL32_ignore) + UNWIND_HINT_END_OF_STACK ENDBR mov $-ENOSYS, %eax + CLEAR_CPU_BUFFERS sysretl -SYM_CODE_END(ignore_sysret) -#endif +SYM_CODE_END(entry_SYSCALL32_ignore) .pushsection .text, "ax" __FUNC_ALIGN @@ -1527,10 +1487,85 @@ SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead) /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp - movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rax + movq PER_CPU_VAR(cpu_current_top_of_stack), %rax leaq -PTREGS_SIZE(%rax), %rsp UNWIND_HINT_REGS call make_task_dead SYM_CODE_END(rewind_stack_and_make_dead) .popsection + +/* + * This sequence executes branches in order to remove user branch information + * from the branch history tracker in the Branch Predictor, therefore removing + * user influence on subsequent BTB lookups. + * + * It should be used on parts prior to Alder Lake. Newer parts should use the + * BHI_DIS_S hardware control instead. If a pre-Alder Lake part is being + * virtualized on newer hardware the VMM should protect against BHI attacks by + * setting BHI_DIS_S for the guests. + * + * CALLs/RETs are necessary to prevent Loop Stream Detector(LSD) from engaging + * and not clearing the branch history. The call tree looks like: + * + * call 1 + * call 2 + * call 2 + * call 2 + * call 2 + * call 2 + * ret + * ret + * ret + * ret + * ret + * ret + * + * This means that the stack is non-constant and ORC can't unwind it with %rsp + * alone. Therefore we unconditionally set up the frame pointer, which allows + * ORC to unwind properly. + * + * The alignment is for performance and not for safety, and may be safely + * refactored in the future if needed. The .skips are for safety, to ensure + * that all RETs are in the second half of a cacheline to mitigate Indirect + * Target Selection, rather than taking the slowpath via its_return_thunk. + */ +SYM_FUNC_START(clear_bhb_loop) + ANNOTATE_NOENDBR + push %rbp + mov %rsp, %rbp + movl $5, %ecx + ANNOTATE_INTRA_FUNCTION_CALL + call 1f + jmp 5f + .align 64, 0xcc + /* + * Shift instructions so that the RET is in the upper half of the + * cacheline and don't take the slowpath to its_return_thunk. + */ + .skip 32 - (.Lret1 - 1f), 0xcc + ANNOTATE_INTRA_FUNCTION_CALL +1: call 2f +.Lret1: RET + .align 64, 0xcc + /* + * As above shift instructions for RET at .Lret2 as well. + * + * This should be ideally be: .skip 32 - (.Lret2 - 2f), 0xcc + * but some Clang versions (e.g. 18) don't like this. + */ + .skip 32 - 18, 0xcc +2: movl $5, %eax +3: jmp 4f + nop +4: sub $1, %eax + jnz 3b + sub $1, %ecx + jnz 1b +.Lret2: RET +5: lfence + pop %rbp + RET +SYM_FUNC_END(clear_bhb_loop) +EXPORT_SYMBOL_FOR_KVM(clear_bhb_loop) +STACK_FRAME_NON_STANDARD(clear_bhb_loop) |
