summaryrefslogtreecommitdiff
path: root/arch/x86/entry/entry_32.S
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/entry/entry_32.S')
-rw-r--r--arch/x86/entry/entry_32.S1526
1 files changed, 836 insertions, 690 deletions
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 48ef7bb32c42..92c0b4a94e0a 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 1991,1992 Linus Torvalds
*
@@ -19,7 +20,7 @@
* 1C(%esp) - %ds
* 20(%esp) - %es
* 24(%esp) - %fs
- * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS
+ * 28(%esp) - unused -- was %gs on old stackprotector kernels
* 2C(%esp) - orig_eax
* 30(%esp) - %eip
* 34(%esp) - %cs
@@ -39,123 +40,184 @@
#include <asm/processor-flags.h>
#include <asm/irq_vectors.h>
#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/frame.h>
+#include <asm/trapnr.h>
+#include <asm/nospec-branch.h>
+
+#include "calling.h"
.section .entry.text, "ax"
-/*
- * We use macros for low-level operations which need to be overridden
- * for paravirtualization. The following will never clobber any registers:
- * INTERRUPT_RETURN (aka. "iret")
- * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
- * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
- *
- * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
- * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
- * Allowing a register to be clobbered can shrink the paravirt replacement
- * enough to patch inline, increasing performance.
- */
+#define PTI_SWITCH_MASK (1 << PAGE_SHIFT)
-#ifdef CONFIG_PREEMPT
-# define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
-#else
-# define preempt_stop(clobbers)
-# define resume_kernel restore_all
-#endif
+/* Unconditionally switch to user cr3 */
+.macro SWITCH_TO_USER_CR3 scratch_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
-.macro TRACE_IRQS_IRET
-#ifdef CONFIG_TRACE_IRQFLAGS
- testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off?
- jz 1f
- TRACE_IRQS_ON
-1:
+ movl %cr3, \scratch_reg
+ orl $PTI_SWITCH_MASK, \scratch_reg
+ movl \scratch_reg, %cr3
+.Lend_\@:
+.endm
+
+.macro BUG_IF_WRONG_CR3 no_user_check=0
+#ifdef CONFIG_DEBUG_ENTRY
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ .if \no_user_check == 0
+ /* coming from usermode? */
+ testl $USER_SEGMENT_RPL_MASK, PT_CS(%esp)
+ jz .Lend_\@
+ .endif
+ /* On user-cr3? */
+ movl %cr3, %eax
+ testl $PTI_SWITCH_MASK, %eax
+ jnz .Lend_\@
+ /* From userspace with kernel cr3 - BUG */
+ ud2
+.Lend_\@:
#endif
.endm
/*
- * User gs save/restore
- *
- * %gs is used for userland TLS and kernel only uses it for stack
- * canary which is required to be at %gs:20 by gcc. Read the comment
- * at the top of stackprotector.h for more info.
- *
- * Local labels 98 and 99 are used.
+ * Switch to kernel cr3 if not already loaded and return current cr3 in
+ * \scratch_reg
*/
-#ifdef CONFIG_X86_32_LAZY_GS
-
- /* unfortunately push/pop can't be no-op */
-.macro PUSH_GS
- pushl $0
-.endm
-.macro POP_GS pop=0
- addl $(4 + \pop), %esp
-.endm
-.macro POP_GS_EX
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ movl %cr3, \scratch_reg
+ /* Test if we are already on kernel CR3 */
+ testl $PTI_SWITCH_MASK, \scratch_reg
+ jz .Lend_\@
+ andl $(~PTI_SWITCH_MASK), \scratch_reg
+ movl \scratch_reg, %cr3
+ /* Return original CR3 in \scratch_reg */
+ orl $PTI_SWITCH_MASK, \scratch_reg
+.Lend_\@:
.endm
- /* all the rest are no-op */
-.macro PTGS_TO_GS
-.endm
-.macro PTGS_TO_GS_EX
-.endm
-.macro GS_TO_REG reg
-.endm
-.macro REG_TO_PTGS reg
-.endm
-.macro SET_KERNEL_GS reg
-.endm
+#define CS_FROM_ENTRY_STACK (1 << 31)
+#define CS_FROM_USER_CR3 (1 << 30)
+#define CS_FROM_KERNEL (1 << 29)
+#define CS_FROM_ESPFIX (1 << 28)
-#else /* CONFIG_X86_32_LAZY_GS */
+.macro FIXUP_FRAME
+ /*
+ * The high bits of the CS dword (__csh) are used for CS_FROM_*.
+ * Clear them in case hardware didn't do this for us.
+ */
+ andl $0x0000ffff, 4*4(%esp)
-.macro PUSH_GS
- pushl %gs
-.endm
+#ifdef CONFIG_VM86
+ testl $X86_EFLAGS_VM, 5*4(%esp)
+ jnz .Lfrom_usermode_no_fixup_\@
+#endif
+ testl $USER_SEGMENT_RPL_MASK, 4*4(%esp)
+ jnz .Lfrom_usermode_no_fixup_\@
-.macro POP_GS pop=0
-98: popl %gs
- .if \pop <> 0
- add $\pop, %esp
- .endif
-.endm
-.macro POP_GS_EX
-.pushsection .fixup, "ax"
-99: movl $0, (%esp)
- jmp 98b
-.popsection
- _ASM_EXTABLE(98b, 99b)
-.endm
+ orl $CS_FROM_KERNEL, 4*4(%esp)
-.macro PTGS_TO_GS
-98: mov PT_GS(%esp), %gs
-.endm
-.macro PTGS_TO_GS_EX
-.pushsection .fixup, "ax"
-99: movl $0, PT_GS(%esp)
- jmp 98b
-.popsection
- _ASM_EXTABLE(98b, 99b)
-.endm
+ /*
+ * When we're here from kernel mode; the (exception) stack looks like:
+ *
+ * 6*4(%esp) - <previous context>
+ * 5*4(%esp) - flags
+ * 4*4(%esp) - cs
+ * 3*4(%esp) - ip
+ * 2*4(%esp) - orig_eax
+ * 1*4(%esp) - gs / function
+ * 0*4(%esp) - fs
+ *
+ * Lets build a 5 entry IRET frame after that, such that struct pt_regs
+ * is complete and in particular regs->sp is correct. This gives us
+ * the original 6 entries as gap:
+ *
+ * 14*4(%esp) - <previous context>
+ * 13*4(%esp) - gap / flags
+ * 12*4(%esp) - gap / cs
+ * 11*4(%esp) - gap / ip
+ * 10*4(%esp) - gap / orig_eax
+ * 9*4(%esp) - gap / gs / function
+ * 8*4(%esp) - gap / fs
+ * 7*4(%esp) - ss
+ * 6*4(%esp) - sp
+ * 5*4(%esp) - flags
+ * 4*4(%esp) - cs
+ * 3*4(%esp) - ip
+ * 2*4(%esp) - orig_eax
+ * 1*4(%esp) - gs / function
+ * 0*4(%esp) - fs
+ */
-.macro GS_TO_REG reg
- movl %gs, \reg
-.endm
-.macro REG_TO_PTGS reg
- movl \reg, PT_GS(%esp)
-.endm
-.macro SET_KERNEL_GS reg
- movl $(__KERNEL_STACK_CANARY), \reg
- movl \reg, %gs
+ pushl %ss # ss
+ pushl %esp # sp (points at ss)
+ addl $7*4, (%esp) # point sp back at the previous context
+ pushl 7*4(%esp) # flags
+ pushl 7*4(%esp) # cs
+ pushl 7*4(%esp) # ip
+ pushl 7*4(%esp) # orig_eax
+ pushl 7*4(%esp) # gs / function
+ pushl 7*4(%esp) # fs
+.Lfrom_usermode_no_fixup_\@:
.endm
-#endif /* CONFIG_X86_32_LAZY_GS */
+.macro IRET_FRAME
+ /*
+ * We're called with %ds, %es, %fs, and %gs from the interrupted
+ * frame, so we shouldn't use them. Also, we may be in ESPFIX
+ * mode and therefore have a nonzero SS base and an offset ESP,
+ * so any attempt to access the stack needs to use SS. (except for
+ * accesses through %esp, which automatically use SS.)
+ */
+ testl $CS_FROM_KERNEL, 1*4(%esp)
+ jz .Lfinished_frame_\@
+
+ /*
+ * Reconstruct the 3 entry IRET frame right after the (modified)
+ * regs->sp without lowering %esp in between, such that an NMI in the
+ * middle doesn't scribble our stack.
+ */
+ pushl %eax
+ pushl %ecx
+ movl 5*4(%esp), %eax # (modified) regs->sp
-.macro SAVE_ALL pt_regs_ax=%eax
+ movl 4*4(%esp), %ecx # flags
+ movl %ecx, %ss:-1*4(%eax)
+
+ movl 3*4(%esp), %ecx # cs
+ andl $0x0000ffff, %ecx
+ movl %ecx, %ss:-2*4(%eax)
+
+ movl 2*4(%esp), %ecx # ip
+ movl %ecx, %ss:-3*4(%eax)
+
+ movl 1*4(%esp), %ecx # eax
+ movl %ecx, %ss:-4*4(%eax)
+
+ popl %ecx
+ lea -4*4(%eax), %esp
+ popl %eax
+.Lfinished_frame_\@:
+.endm
+
+.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 unwind_espfix=0
cld
- PUSH_GS
+.if \skip_gs == 0
+ pushl $0
+.endif
pushl %fs
+
+ pushl %eax
+ movl $(__KERNEL_PERCPU), %eax
+ movl %eax, %fs
+.if \unwind_espfix > 0
+ UNWIND_ESPFIX_STACK
+.endif
+ popl %eax
+
+ FIXUP_FRAME
pushl %es
pushl %ds
pushl \pt_regs_ax
@@ -168,25 +230,27 @@
movl $(__USER_DS), %edx
movl %edx, %ds
movl %edx, %es
- movl $(__KERNEL_PERCPU), %edx
- movl %edx, %fs
- SET_KERNEL_GS %edx
+ /* Switch to kernel stack if necessary */
+.if \switch_stacks > 0
+ SWITCH_TO_KERNEL_STACK
+.endif
.endm
-/*
- * This is a sneaky trick to help the unwinder find pt_regs on the stack. The
- * frame pointer is replaced with an encoded pointer to pt_regs. The encoding
- * is just setting the LSB, which makes it an invalid stack address and is also
- * a signal to the unwinder that it's a pt_regs pointer in disguise.
- *
- * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the
- * original rbp.
- */
-.macro ENCODE_FRAME_POINTER
-#ifdef CONFIG_FRAME_POINTER
- mov %esp, %ebp
- orl $0x1, %ebp
-#endif
+.macro SAVE_ALL_NMI cr3_reg:req unwind_espfix=0
+ SAVE_ALL unwind_espfix=\unwind_espfix
+
+ BUG_IF_WRONG_CR3
+
+ /*
+ * Now switch the CR3 when PTI is enabled.
+ *
+ * We can enter with either user or kernel cr3, the code will
+ * store the old cr3 in \cr3_reg and switches to the kernel cr3
+ * if necessary.
+ */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=\cr3_reg
+
+.Lend_\@:
.endm
.macro RESTORE_INT_REGS
@@ -204,26 +268,408 @@
1: popl %ds
2: popl %es
3: popl %fs
- POP_GS \pop
-.pushsection .fixup, "ax"
-4: movl $0, (%esp)
- jmp 1b
-5: movl $0, (%esp)
- jmp 2b
-6: movl $0, (%esp)
- jmp 3b
-.popsection
- _ASM_EXTABLE(1b, 4b)
- _ASM_EXTABLE(2b, 5b)
- _ASM_EXTABLE(3b, 6b)
- POP_GS_EX
+4: addl $(4 + \pop), %esp /* pop the unused "gs" slot */
+ IRET_FRAME
+
+ /*
+ * There is no _ASM_EXTABLE_TYPE_REG() for ASM, however since this is
+ * ASM the registers are known and we can trivially hard-code them.
+ */
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_POP_ZERO|EX_REG_DS)
+ _ASM_EXTABLE_TYPE(2b, 3b, EX_TYPE_POP_ZERO|EX_REG_ES)
+ _ASM_EXTABLE_TYPE(3b, 4b, EX_TYPE_POP_ZERO|EX_REG_FS)
+.endm
+
+.macro RESTORE_ALL_NMI cr3_reg:req pop=0
+ /*
+ * Now switch the CR3 when PTI is enabled.
+ *
+ * We enter with kernel cr3 and switch the cr3 to the value
+ * stored on \cr3_reg, which is either a user or a kernel cr3.
+ */
+ ALTERNATIVE "jmp .Lswitched_\@", "", X86_FEATURE_PTI
+
+ testl $PTI_SWITCH_MASK, \cr3_reg
+ jz .Lswitched_\@
+
+ /* User cr3 in \cr3_reg - write it to hardware cr3 */
+ movl \cr3_reg, %cr3
+
+.Lswitched_\@:
+
+ BUG_IF_WRONG_CR3
+
+ RESTORE_REGS pop=\pop
.endm
+.macro CHECK_AND_APPLY_ESPFIX
+#ifdef CONFIG_X86_ESPFIX32
+#define GDT_ESPFIX_OFFSET (GDT_ENTRY_ESPFIX_SS * 8)
+#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page + GDT_ESPFIX_OFFSET)
+
+ ALTERNATIVE "jmp .Lend_\@", "", X86_BUG_ESPFIX
+
+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
+ /*
+ * Warning: PT_OLDSS(%esp) contains the wrong/random values if we
+ * are returning to the kernel.
+ * See comments in process.c:copy_thread() for details.
+ */
+ movb PT_OLDSS(%esp), %ah
+ movb PT_CS(%esp), %al
+ andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
+ cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
+ jne .Lend_\@ # returning to user-space with LDT SS
+
+ /*
+ * Setup and switch to ESPFIX stack
+ *
+ * We're returning to userspace with a 16 bit stack. The CPU will not
+ * restore the high word of ESP for us on executing iret... This is an
+ * "official" bug of all the x86-compatible CPUs, which we can work
+ * around to make dosemu and wine happy. We do this by preloading the
+ * high word of ESP with the high word of the userspace ESP while
+ * compensating for the offset by changing to the ESPFIX segment with
+ * a base address that matches for the difference.
+ */
+ mov %esp, %edx /* load kernel esp */
+ mov PT_OLDESP(%esp), %eax /* load userspace esp */
+ mov %dx, %ax /* eax: new kernel esp */
+ sub %eax, %edx /* offset (low word is 0) */
+ shr $16, %edx
+ mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
+ mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
+ pushl $__ESPFIX_SS
+ pushl %eax /* new kernel esp */
+ /*
+ * Disable interrupts, but do not irqtrace this section: we
+ * will soon execute iret and the tracer was already set to
+ * the irqstate after the IRET:
+ */
+ cli
+ lss (%esp), %esp /* switch to espfix segment */
+.Lend_\@:
+#endif /* CONFIG_X86_ESPFIX32 */
+.endm
+
+/*
+ * Called with pt_regs fully populated and kernel segments loaded,
+ * so we can access PER_CPU and use the integer registers.
+ *
+ * We need to be very careful here with the %esp switch, because an NMI
+ * can happen everywhere. If the NMI handler finds itself on the
+ * entry-stack, it will overwrite the task-stack and everything we
+ * copied there. So allocate the stack-frame on the task-stack and
+ * switch to it before we do any copying.
+ */
+
+.macro SWITCH_TO_KERNEL_STACK
+
+ BUG_IF_WRONG_CR3
+
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
+
+ /*
+ * %eax now contains the entry cr3 and we carry it forward in
+ * that register for the time this macro runs
+ */
+
+ /* Are we on the entry stack? Bail out if not! */
+ movl PER_CPU_VAR(cpu_entry_area), %ecx
+ addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
+ subl %esp, %ecx /* ecx = (end of entry_stack) - esp */
+ cmpl $SIZEOF_entry_stack, %ecx
+ jae .Lend_\@
+
+ /* Load stack pointer into %esi and %edi */
+ movl %esp, %esi
+ movl %esi, %edi
+
+ /* Move %edi to the top of the entry stack */
+ andl $(MASK_entry_stack), %edi
+ addl $(SIZEOF_entry_stack), %edi
+
+ /* Load top of task-stack into %edi */
+ movl TSS_entry2task_stack(%edi), %edi
+
+ /* Special case - entry from kernel mode via entry stack */
+#ifdef CONFIG_VM86
+ movl PT_EFLAGS(%esp), %ecx # mix EFLAGS and CS
+ movb PT_CS(%esp), %cl
+ andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %ecx
+#else
+ movl PT_CS(%esp), %ecx
+ andl $SEGMENT_RPL_MASK, %ecx
+#endif
+ cmpl $USER_RPL, %ecx
+ jb .Lentry_from_kernel_\@
+
+ /* Bytes to copy */
+ movl $PTREGS_SIZE, %ecx
+
+#ifdef CONFIG_VM86
+ testl $X86_EFLAGS_VM, PT_EFLAGS(%esi)
+ jz .Lcopy_pt_regs_\@
+
+ /*
+ * Stack-frame contains 4 additional segment registers when
+ * coming from VM86 mode
+ */
+ addl $(4 * 4), %ecx
+
+#endif
+.Lcopy_pt_regs_\@:
+
+ /* Allocate frame on task-stack */
+ subl %ecx, %edi
+
+ /* Switch to task-stack */
+ movl %edi, %esp
+
+ /*
+ * We are now on the task-stack and can safely copy over the
+ * stack-frame
+ */
+ shrl $2, %ecx
+ cld
+ rep movsl
+
+ jmp .Lend_\@
+
+.Lentry_from_kernel_\@:
+
+ /*
+ * This handles the case when we enter the kernel from
+ * kernel-mode and %esp points to the entry-stack. When this
+ * happens we need to switch to the task-stack to run C code,
+ * but switch back to the entry-stack again when we approach
+ * iret and return to the interrupted code-path. This usually
+ * happens when we hit an exception while restoring user-space
+ * segment registers on the way back to user-space or when the
+ * sysenter handler runs with eflags.tf set.
+ *
+ * When we switch to the task-stack here, we can't trust the
+ * contents of the entry-stack anymore, as the exception handler
+ * might be scheduled out or moved to another CPU. Therefore we
+ * copy the complete entry-stack to the task-stack and set a
+ * marker in the iret-frame (bit 31 of the CS dword) to detect
+ * what we've done on the iret path.
+ *
+ * On the iret path we copy everything back and switch to the
+ * entry-stack, so that the interrupted kernel code-path
+ * continues on the same stack it was interrupted with.
+ *
+ * Be aware that an NMI can happen anytime in this code.
+ *
+ * %esi: Entry-Stack pointer (same as %esp)
+ * %edi: Top of the task stack
+ * %eax: CR3 on kernel entry
+ */
+
+ /* Calculate number of bytes on the entry stack in %ecx */
+ movl %esi, %ecx
+
+ /* %ecx to the top of entry-stack */
+ andl $(MASK_entry_stack), %ecx
+ addl $(SIZEOF_entry_stack), %ecx
+
+ /* Number of bytes on the entry stack to %ecx */
+ sub %esi, %ecx
+
+ /* Mark stackframe as coming from entry stack */
+ orl $CS_FROM_ENTRY_STACK, PT_CS(%esp)
+
+ /*
+ * Test the cr3 used to enter the kernel and add a marker
+ * so that we can switch back to it before iret.
+ */
+ testl $PTI_SWITCH_MASK, %eax
+ jz .Lcopy_pt_regs_\@
+ orl $CS_FROM_USER_CR3, PT_CS(%esp)
+
+ /*
+ * %esi and %edi are unchanged, %ecx contains the number of
+ * bytes to copy. The code at .Lcopy_pt_regs_\@ will allocate
+ * the stack-frame on task-stack and copy everything over
+ */
+ jmp .Lcopy_pt_regs_\@
+
+.Lend_\@:
+.endm
+
+/*
+ * Switch back from the kernel stack to the entry stack.
+ *
+ * The %esp register must point to pt_regs on the task stack. It will
+ * first calculate the size of the stack-frame to copy, depending on
+ * whether we return to VM86 mode or not. With that it uses 'rep movsl'
+ * to copy the contents of the stack over to the entry stack.
+ *
+ * We must be very careful here, as we can't trust the contents of the
+ * task-stack once we switched to the entry-stack. When an NMI happens
+ * while on the entry-stack, the NMI handler will switch back to the top
+ * of the task stack, overwriting our stack-frame we are about to copy.
+ * Therefore we switch the stack only after everything is copied over.
+ */
+.macro SWITCH_TO_ENTRY_STACK
+
+ /* Bytes to copy */
+ movl $PTREGS_SIZE, %ecx
+
+#ifdef CONFIG_VM86
+ testl $(X86_EFLAGS_VM), PT_EFLAGS(%esp)
+ jz .Lcopy_pt_regs_\@
+
+ /* Additional 4 registers to copy when returning to VM86 mode */
+ addl $(4 * 4), %ecx
+
+.Lcopy_pt_regs_\@:
+#endif
+
+ /* Initialize source and destination for movsl */
+ movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
+ subl %ecx, %edi
+ movl %esp, %esi
+
+ /* Save future stack pointer in %ebx */
+ movl %edi, %ebx
+
+ /* Copy over the stack-frame */
+ shrl $2, %ecx
+ cld
+ rep movsl
+
+ /*
+ * Switch to entry-stack - needs to happen after everything is
+ * copied because the NMI handler will overwrite the task-stack
+ * when on entry-stack
+ */
+ movl %ebx, %esp
+
+.Lend_\@:
+.endm
+
+/*
+ * This macro handles the case when we return to kernel-mode on the iret
+ * path and have to switch back to the entry stack and/or user-cr3
+ *
+ * See the comments below the .Lentry_from_kernel_\@ label in the
+ * SWITCH_TO_KERNEL_STACK macro for more details.
+ */
+.macro PARANOID_EXIT_TO_KERNEL_MODE
+
+ /*
+ * Test if we entered the kernel with the entry-stack. Most
+ * likely we did not, because this code only runs on the
+ * return-to-kernel path.
+ */
+ testl $CS_FROM_ENTRY_STACK, PT_CS(%esp)
+ jz .Lend_\@
+
+ /* Unlikely slow-path */
+
+ /* Clear marker from stack-frame */
+ andl $(~CS_FROM_ENTRY_STACK), PT_CS(%esp)
+
+ /* Copy the remaining task-stack contents to entry-stack */
+ movl %esp, %esi
+ movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
+
+ /* Bytes on the task-stack to ecx */
+ movl PER_CPU_VAR(cpu_tss_rw + TSS_sp1), %ecx
+ subl %esi, %ecx
+
+ /* Allocate stack-frame on entry-stack */
+ subl %ecx, %edi
+
+ /*
+ * Save future stack-pointer, we must not switch until the
+ * copy is done, otherwise the NMI handler could destroy the
+ * contents of the task-stack we are about to copy.
+ */
+ movl %edi, %ebx
+
+ /* Do the copy */
+ shrl $2, %ecx
+ cld
+ rep movsl
+
+ /* Safe to switch to entry-stack now */
+ movl %ebx, %esp
+
+ /*
+ * We came from entry-stack and need to check if we also need to
+ * switch back to user cr3.
+ */
+ testl $CS_FROM_USER_CR3, PT_CS(%esp)
+ jz .Lend_\@
+
+ /* Clear marker from stack-frame */
+ andl $(~CS_FROM_USER_CR3), PT_CS(%esp)
+
+ SWITCH_TO_USER_CR3 scratch_reg=%eax
+
+.Lend_\@:
+.endm
+
+/**
+ * idtentry - Macro to generate entry stubs for simple IDT entries
+ * @vector: Vector number
+ * @asmsym: ASM symbol for the entry point
+ * @cfunc: C function to be called
+ * @has_error_code: Hardware pushed error code on stack
+ */
+.macro idtentry vector asmsym cfunc has_error_code:req
+SYM_CODE_START(\asmsym)
+ ASM_CLAC
+ cld
+
+ .if \has_error_code == 0
+ pushl $0 /* Clear the error code */
+ .endif
+
+ /* Push the C-function address into the GS slot */
+ pushl $\cfunc
+ /* Invoke the common exception entry */
+ jmp handle_exception
+SYM_CODE_END(\asmsym)
+.endm
+
+.macro idtentry_irq vector cfunc
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
+SYM_CODE_START_LOCAL(asm_\cfunc)
+ ASM_CLAC
+ SAVE_ALL switch_stacks=1
+ ENCODE_FRAME_POINTER
+ movl %esp, %eax
+ movl PT_ORIG_EAX(%esp), %edx /* get the vector from stack */
+ movl $-1, PT_ORIG_EAX(%esp) /* no syscall to restart */
+ call \cfunc
+ jmp handle_exception_return
+SYM_CODE_END(asm_\cfunc)
+.endm
+
+/*
+ * Include the defines which emit the idt entries which are shared
+ * shared between 32 and 64 bit and emit the __irqentry_text_* markers
+ * so the stacktrace boundary checks work.
+ */
+ .align 16
+ .globl __irqentry_text_start
+__irqentry_text_start:
+
+#include <asm/idtentry.h>
+
+ .align 16
+ .globl __irqentry_text_end
+__irqentry_text_end:
+
/*
* %eax: prev task
* %edx: next task
*/
-ENTRY(__switch_to_asm)
+.pushsection .text, "ax"
+SYM_CODE_START(__switch_to_asm)
/*
* Save callee-saved registers
* This must match the order in struct inactive_task_frame
@@ -232,16 +678,33 @@ ENTRY(__switch_to_asm)
pushl %ebx
pushl %edi
pushl %esi
+ /*
+ * Flags are saved to prevent AC leakage. This could go
+ * away if objtool would have 32bit support to verify
+ * the STAC/CLAC correctness.
+ */
+ pushfl
/* switch stack */
movl %esp, TASK_threadsp(%eax)
movl TASK_threadsp(%edx), %esp
-#ifdef CONFIG_CC_STACKPROTECTOR
+#ifdef CONFIG_STACKPROTECTOR
movl TASK_stack_canary(%edx), %ebx
- movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset
+ movl %ebx, PER_CPU_VAR(__stack_chk_guard)
#endif
+ /*
+ * When switching from a shallower to a deeper call stack
+ * the RSB may either underflow or use entries populated
+ * with userspace addresses. On CPUs where those concerns
+ * exist, overwrite the RSB with entries which capture
+ * speculative execution to prevent attack.
+ */
+ FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
+
+ /* Restore flags or the incoming task to restore AC state. */
+ popfl
/* restore callee-saved registers */
popl %esi
popl %edi
@@ -249,25 +712,9 @@ ENTRY(__switch_to_asm)
popl %ebp
jmp __switch_to
-END(__switch_to_asm)
-
-/*
- * The unwinder expects the last frame on the stack to always be at the same
- * offset from the end of the page, which allows it to validate the stack.
- * Calling schedule_tail() directly would break that convention because its an
- * asmlinkage function so its argument has to be pushed on the stack. This
- * wrapper creates a proper "end of stack" frame header before the call.
- */
-ENTRY(schedule_tail_wrapper)
- FRAME_BEGIN
-
- pushl %eax
- call schedule_tail
- popl %eax
+SYM_CODE_END(__switch_to_asm)
+.popsection
- FRAME_END
- ret
-ENDPROC(schedule_tail_wrapper)
/*
* A newly forked process directly context switches into this address.
*
@@ -275,78 +722,26 @@ ENDPROC(schedule_tail_wrapper)
* ebx: kernel thread func (NULL for user thread)
* edi: kernel thread arg
*/
-ENTRY(ret_from_fork)
- call schedule_tail_wrapper
+.pushsection .text, "ax"
+SYM_CODE_START(ret_from_fork_asm)
+ movl %esp, %edx /* regs */
- testl %ebx, %ebx
- jnz 1f /* kernel threads are uncommon */
+ /* return address for the stack unwinder */
+ pushl $.Lsyscall_32_done
-2:
- /* When we fork, we trace the syscall return in the child, too. */
- movl %esp, %eax
- call syscall_return_slowpath
- jmp restore_all
-
- /* kernel thread */
-1: movl %edi, %eax
- call *%ebx
- /*
- * A kernel thread is allowed to return here after successfully
- * calling do_execve(). Exit to userspace to complete the execve()
- * syscall.
- */
- movl $0, PT_EAX(%esp)
- jmp 2b
-END(ret_from_fork)
-
-/*
- * Return to user mode is not as complex as all this looks,
- * but we want the default path for a system call return to
- * go as quickly as possible which is why some of this is
- * less clear than it otherwise should be.
- */
-
- # userspace resumption stub bypassing syscall exit tracing
- ALIGN
-ret_from_exception:
- preempt_stop(CLBR_ANY)
-ret_from_intr:
-#ifdef CONFIG_VM86
- movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
- movb PT_CS(%esp), %al
- andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
-#else
- /*
- * We can be coming here from child spawned by kernel_thread().
- */
- movl PT_CS(%esp), %eax
- andl $SEGMENT_RPL_MASK, %eax
-#endif
- cmpl $USER_RPL, %eax
- jb resume_kernel # not returning to v8086 or userspace
+ FRAME_BEGIN
+ /* prev already in EAX */
+ movl %ebx, %ecx /* fn */
+ pushl %edi /* fn_arg */
+ call ret_from_fork
+ addl $4, %esp
+ FRAME_END
-ENTRY(resume_userspace)
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
- movl %esp, %eax
- call prepare_exit_to_usermode
- jmp restore_all
-END(ret_from_exception)
-
-#ifdef CONFIG_PREEMPT
-ENTRY(resume_kernel)
- DISABLE_INTERRUPTS(CLBR_ANY)
-.Lneed_resched:
- cmpl $0, PER_CPU_VAR(__preempt_count)
- jnz restore_all
- testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
- jz restore_all
- call preempt_schedule_irq
- jmp .Lneed_resched
-END(resume_kernel)
-#endif
+ RET
+SYM_CODE_END(ret_from_fork_asm)
+.popsection
-GLOBAL(__begin_SYSENTER_singlestep_region)
+SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
/*
* All code from here through __end_SYSENTER_singlestep_region is subject
* to being single-stepped if a user program sets TF and executes SYSENTER.
@@ -356,16 +751,6 @@ GLOBAL(__begin_SYSENTER_singlestep_region)
* will ignore all of the single-step traps generated in this range.
*/
-#ifdef CONFIG_XEN
-/*
- * Xen doesn't set %esp to be precisely what the normal SYSENTER
- * entry point expects, so fix it up before using the normal path.
- */
-ENTRY(xen_sysenter_target)
- addl $5*4, %esp /* remove xen-provided frame */
- jmp .Lsysenter_past_esp
-#endif
-
/*
* 32-bit SYSENTER entry.
*
@@ -398,17 +783,30 @@ ENTRY(xen_sysenter_target)
* ebp user stack
* 0(%ebp) arg6
*/
-ENTRY(entry_SYSENTER_32)
- movl TSS_sysenter_sp0(%esp), %esp
+SYM_FUNC_START(entry_SYSENTER_32)
+ /*
+ * On entry-stack with all userspace-regs live - save and
+ * restore eflags and %eax to use it as scratch-reg for the cr3
+ * switch.
+ */
+ pushfl
+ pushl %eax
+ BUG_IF_WRONG_CR3 no_user_check=1
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
+ popl %eax
+ popfl
+
+ /* Stack empty again, switch to task stack */
+ movl TSS_entry2task_stack(%esp), %esp
+
.Lsysenter_past_esp:
pushl $__USER_DS /* pt_regs->ss */
- pushl %ebp /* pt_regs->sp (stashed in bp) */
+ pushl $0 /* pt_regs->sp (placeholder) */
pushfl /* pt_regs->flags (except IF = 0) */
- orl $X86_EFLAGS_IF, (%esp) /* Fix IF */
pushl $__USER_CS /* pt_regs->cs */
pushl $0 /* pt_regs->ip = 0 (placeholder) */
pushl %eax /* pt_regs->orig_ax */
- SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */
+ SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest, stack already switched */
/*
* SYSENTER doesn't filter flags, so we need to clear NT, AC
@@ -433,39 +831,58 @@ ENTRY(entry_SYSENTER_32)
jnz .Lsysenter_fix_flags
.Lsysenter_flags_fixed:
+ movl %esp, %eax
+ call do_SYSENTER_32
+ testb %al, %al
+ jz .Lsyscall_32_done
+
+ STACKLEAK_ERASE
+
+ /* Opportunistic SYSEXIT */
+
/*
- * User mode is traced as though IRQs are on, and SYSENTER
- * turned them off.
+ * Setup entry stack - we keep the pointer in %eax and do the
+ * switch after almost all user-state is restored.
*/
- TRACE_IRQS_OFF
- movl %esp, %eax
- call do_fast_syscall_32
- /* XEN PV guests always use IRET path */
- ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
- "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
+ /* Load entry stack pointer and allocate frame for eflags/eax */
+ movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %eax
+ subl $(2*4), %eax
-/* Opportunistic SYSEXIT */
- TRACE_IRQS_ON /* User mode traces as IRQs on. */
+ /* Copy eflags and eax to entry stack */
+ movl PT_EFLAGS(%esp), %edi
+ movl PT_EAX(%esp), %esi
+ movl %edi, (%eax)
+ movl %esi, 4(%eax)
+
+ /* Restore user registers and segments */
movl PT_EIP(%esp), %edx /* pt_regs->ip */
movl PT_OLDESP(%esp), %ecx /* pt_regs->sp */
1: mov PT_FS(%esp), %fs
- PTGS_TO_GS
+
popl %ebx /* pt_regs->bx */
addl $2*4, %esp /* skip pt_regs->cx and pt_regs->dx */
popl %esi /* pt_regs->si */
popl %edi /* pt_regs->di */
popl %ebp /* pt_regs->bp */
- popl %eax /* pt_regs->ax */
+
+ /* Switch to entry stack */
+ movl %eax, %esp
+
+ /* Now ready to switch the cr3 */
+ SWITCH_TO_USER_CR3 scratch_reg=%eax
+ /* Clobbers ZF */
+ CLEAR_CPU_BUFFERS
/*
* Restore all flags except IF. (We restore IF separately because
* STI gives a one-instruction window in which we won't be interrupted,
* whereas POPF does not.)
*/
- addl $PT_EFLAGS-PT_DS, %esp /* point esp at pt_regs->flags */
- btr $X86_EFLAGS_IF_BIT, (%esp)
+ btrl $X86_EFLAGS_IF_BIT, (%esp)
+ BUG_IF_WRONG_CR3 no_user_check=1
popfl
+ popl %eax
/*
* Return back to the vDSO, which will pop ecx and edx.
@@ -474,19 +891,16 @@ ENTRY(entry_SYSENTER_32)
sti
sysexit
-.pushsection .fixup, "ax"
-2: movl $0, PT_FS(%esp)
- jmp 1b
-.popsection
+2: movl $0, PT_FS(%esp)
+ jmp 1b
_ASM_EXTABLE(1b, 2b)
- PTGS_TO_GS_EX
.Lsysenter_fix_flags:
pushl $X86_EFLAGS_FIXED
popfl
jmp .Lsysenter_flags_fixed
-GLOBAL(__end_SYSENTER_singlestep_region)
-ENDPROC(entry_SYSENTER_32)
+SYM_ENTRY(__end_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
+SYM_FUNC_END(entry_SYSENTER_32)
/*
* 32-bit legacy system call entry.
@@ -516,85 +930,57 @@ ENDPROC(entry_SYSENTER_32)
* edi arg5
* ebp arg6
*/
-ENTRY(entry_INT80_32)
+SYM_FUNC_START(entry_INT80_32)
ASM_CLAC
pushl %eax /* pt_regs->orig_ax */
- SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */
- /*
- * User mode is traced as though IRQs are on, and the interrupt gate
- * turned them off.
- */
- TRACE_IRQS_OFF
+ SAVE_ALL pt_regs_ax=$-ENOSYS switch_stacks=1 /* save rest */
movl %esp, %eax
call do_int80_syscall_32
.Lsyscall_32_done:
+ STACKLEAK_ERASE
-restore_all:
- TRACE_IRQS_IRET
-.Lrestore_all_notrace:
-#ifdef CONFIG_X86_ESPFIX32
- ALTERNATIVE "jmp .Lrestore_nocheck", "", X86_BUG_ESPFIX
+restore_all_switch_stack:
+ SWITCH_TO_ENTRY_STACK
+ CHECK_AND_APPLY_ESPFIX
- movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
+ /* Switch back to user CR3 */
+ SWITCH_TO_USER_CR3 scratch_reg=%eax
+
+ BUG_IF_WRONG_CR3
+
+ /* Restore user state */
+ RESTORE_REGS pop=4 # skip orig_eax/error_code
+ CLEAR_CPU_BUFFERS
+.Lirq_return:
/*
- * Warning: PT_OLDSS(%esp) contains the wrong/random values if we
- * are returning to the kernel.
- * See comments in process.c:copy_thread() for details.
+ * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
+ * when returning from IPI handler and when returning from
+ * scheduler to user-space.
*/
- movb PT_OLDSS(%esp), %ah
- movb PT_CS(%esp), %al
- andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
- cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
- je .Lldt_ss # returning to user-space with LDT SS
-#endif
-.Lrestore_nocheck:
- RESTORE_REGS 4 # skip orig_eax/error_code
-.Lirq_return:
- INTERRUPT_RETURN
+ iret
-.section .fixup, "ax"
-ENTRY(iret_exc )
+.Lasm_iret_error:
pushl $0 # no error code
- pushl $do_iret_error
- jmp common_exception
-.previous
- _ASM_EXTABLE(.Lirq_return, iret_exc)
+ pushl $iret_error
-#ifdef CONFIG_X86_ESPFIX32
-.Lldt_ss:
-/*
- * Setup and switch to ESPFIX stack
- *
- * We're returning to userspace with a 16 bit stack. The CPU will not
- * restore the high word of ESP for us on executing iret... This is an
- * "official" bug of all the x86-compatible CPUs, which we can work
- * around to make dosemu and wine happy. We do this by preloading the
- * high word of ESP with the high word of the userspace ESP while
- * compensating for the offset by changing to the ESPFIX segment with
- * a base address that matches for the difference.
- */
-#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
- mov %esp, %edx /* load kernel esp */
- mov PT_OLDESP(%esp), %eax /* load userspace esp */
- mov %dx, %ax /* eax: new kernel esp */
- sub %eax, %edx /* offset (low word is 0) */
- shr $16, %edx
- mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
- mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
- pushl $__ESPFIX_SS
- pushl %eax /* new kernel esp */
+#ifdef CONFIG_DEBUG_ENTRY
/*
- * Disable interrupts, but do not irqtrace this section: we
- * will soon execute iret and the tracer was already set to
- * the irqstate after the IRET:
+ * The stack-frame here is the one that iret faulted on, so its a
+ * return-to-user frame. We are on kernel-cr3 because we come here from
+ * the fixup code. This confuses the CR3 checker, so switch to user-cr3
+ * as the checker expects it.
*/
- DISABLE_INTERRUPTS(CLBR_ANY)
- lss (%esp), %esp /* switch to espfix segment */
- jmp .Lrestore_nocheck
+ pushl %eax
+ SWITCH_TO_USER_CR3 scratch_reg=%eax
+ popl %eax
#endif
-ENDPROC(entry_INT80_32)
+
+ jmp handle_exception
+
+ _ASM_EXTABLE(.Lirq_return, .Lasm_iret_error)
+SYM_FUNC_END(entry_INT80_32)
.macro FIXUP_ESPFIX_STACK
/*
@@ -603,377 +989,124 @@ ENDPROC(entry_INT80_32)
* We can't call C functions using the ESPFIX stack. This code reads
* the high word of the segment base from the GDT and swiches to the
* normal stack and adjusts ESP with the matching offset.
+ *
+ * We might be on user CR3 here, so percpu data is not mapped and we can't
+ * access the GDT through the percpu segment. Instead, use SGDT to find
+ * the cpu_entry_area alias of the GDT.
*/
#ifdef CONFIG_X86_ESPFIX32
/* fixup the stack */
- mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
- mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
+ pushl %ecx
+ subl $2*4, %esp
+ sgdt (%esp)
+ movl 2(%esp), %ecx /* GDT address */
+ /*
+ * Careful: ECX is a linear pointer, so we need to force base
+ * zero. %cs is the only known-linear segment we have right now.
+ */
+ mov %cs:GDT_ESPFIX_OFFSET + 4(%ecx), %al /* bits 16..23 */
+ mov %cs:GDT_ESPFIX_OFFSET + 7(%ecx), %ah /* bits 24..31 */
shl $16, %eax
+ addl $2*4, %esp
+ popl %ecx
addl %esp, %eax /* the adjusted stack pointer */
pushl $__KERNEL_DS
pushl %eax
lss (%esp), %esp /* switch to the normal stack segment */
#endif
.endm
+
.macro UNWIND_ESPFIX_STACK
+ /* It's safe to clobber %eax, all other regs need to be preserved */
#ifdef CONFIG_X86_ESPFIX32
movl %ss, %eax
/* see if on espfix stack */
cmpw $__ESPFIX_SS, %ax
- jne 27f
- movl $__KERNEL_DS, %eax
- movl %eax, %ds
- movl %eax, %es
+ jne .Lno_fixup_\@
/* switch to normal stack */
FIXUP_ESPFIX_STACK
-27:
+.Lno_fixup_\@:
#endif
.endm
-/*
- * Build the entry stubs with some assembler magic.
- * We pack 1 stub into every 8-byte block.
- */
- .align 8
-ENTRY(irq_entries_start)
- vector=FIRST_EXTERNAL_VECTOR
- .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
- pushl $(~vector+0x80) /* Note: always in signed byte range */
- vector=vector+1
- jmp common_interrupt
- .align 8
- .endr
-END(irq_entries_start)
-
-/*
- * the CPU automatically disables interrupts when executing an IRQ vector,
- * so IRQ-flags tracing has to follow that:
- */
- .p2align CONFIG_X86_L1_CACHE_SHIFT
-common_interrupt:
- ASM_CLAC
- addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */
- SAVE_ALL
+SYM_CODE_START_LOCAL_NOALIGN(handle_exception)
+ /* the function address is in %gs's slot on the stack */
+ SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1
ENCODE_FRAME_POINTER
- TRACE_IRQS_OFF
- movl %esp, %eax
- call do_IRQ
- jmp ret_from_intr
-ENDPROC(common_interrupt)
-
-#define BUILD_INTERRUPT3(name, nr, fn) \
-ENTRY(name) \
- ASM_CLAC; \
- pushl $~(nr); \
- SAVE_ALL; \
- ENCODE_FRAME_POINTER; \
- TRACE_IRQS_OFF \
- movl %esp, %eax; \
- call fn; \
- jmp ret_from_intr; \
-ENDPROC(name)
-
-
-#ifdef CONFIG_TRACING
-# define TRACE_BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name)
-#else
-# define TRACE_BUILD_INTERRUPT(name, nr)
-#endif
-#define BUILD_INTERRUPT(name, nr) \
- BUILD_INTERRUPT3(name, nr, smp_##name); \
- TRACE_BUILD_INTERRUPT(name, nr)
+ movl PT_GS(%esp), %edi # get the function address
-/* The include is where all of the SMP etc. interrupts come from */
-#include <asm/entry_arch.h>
+ /* fixup orig %eax */
+ movl PT_ORIG_EAX(%esp), %edx # get the error code
+ movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
-ENTRY(coprocessor_error)
- ASM_CLAC
- pushl $0
- pushl $do_coprocessor_error
- jmp common_exception
-END(coprocessor_error)
+ movl %esp, %eax # pt_regs pointer
+ CALL_NOSPEC edi
-ENTRY(simd_coprocessor_error)
- ASM_CLAC
- pushl $0
-#ifdef CONFIG_X86_INVD_BUG
- /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
- ALTERNATIVE "pushl $do_general_protection", \
- "pushl $do_simd_coprocessor_error", \
- X86_FEATURE_XMM
+handle_exception_return:
+#ifdef CONFIG_VM86
+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
+ movb PT_CS(%esp), %al
+ andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
#else
- pushl $do_simd_coprocessor_error
-#endif
- jmp common_exception
-END(simd_coprocessor_error)
-
-ENTRY(device_not_available)
- ASM_CLAC
- pushl $-1 # mark this as an int
- pushl $do_device_not_available
- jmp common_exception
-END(device_not_available)
-
-#ifdef CONFIG_PARAVIRT
-ENTRY(native_iret)
- iret
- _ASM_EXTABLE(native_iret, iret_exc)
-END(native_iret)
-#endif
-
-ENTRY(overflow)
- ASM_CLAC
- pushl $0
- pushl $do_overflow
- jmp common_exception
-END(overflow)
-
-ENTRY(bounds)
- ASM_CLAC
- pushl $0
- pushl $do_bounds
- jmp common_exception
-END(bounds)
-
-ENTRY(invalid_op)
- ASM_CLAC
- pushl $0
- pushl $do_invalid_op
- jmp common_exception
-END(invalid_op)
-
-ENTRY(coprocessor_segment_overrun)
- ASM_CLAC
- pushl $0
- pushl $do_coprocessor_segment_overrun
- jmp common_exception
-END(coprocessor_segment_overrun)
-
-ENTRY(invalid_TSS)
- ASM_CLAC
- pushl $do_invalid_TSS
- jmp common_exception
-END(invalid_TSS)
-
-ENTRY(segment_not_present)
- ASM_CLAC
- pushl $do_segment_not_present
- jmp common_exception
-END(segment_not_present)
-
-ENTRY(stack_segment)
- ASM_CLAC
- pushl $do_stack_segment
- jmp common_exception
-END(stack_segment)
-
-ENTRY(alignment_check)
- ASM_CLAC
- pushl $do_alignment_check
- jmp common_exception
-END(alignment_check)
-
-ENTRY(divide_error)
- ASM_CLAC
- pushl $0 # no error code
- pushl $do_divide_error
- jmp common_exception
-END(divide_error)
-
-#ifdef CONFIG_X86_MCE
-ENTRY(machine_check)
- ASM_CLAC
- pushl $0
- pushl machine_check_vector
- jmp common_exception
-END(machine_check)
-#endif
-
-ENTRY(spurious_interrupt_bug)
- ASM_CLAC
- pushl $0
- pushl $do_spurious_interrupt_bug
- jmp common_exception
-END(spurious_interrupt_bug)
-
-#ifdef CONFIG_XEN
-ENTRY(xen_hypervisor_callback)
- pushl $-1 /* orig_ax = -1 => not a system call */
- SAVE_ALL
- ENCODE_FRAME_POINTER
- TRACE_IRQS_OFF
-
/*
- * Check to see if we got the event in the critical
- * region in xen_iret_direct, after we've reenabled
- * events and checked for pending events. This simulates
- * iret instruction's behaviour where it delivers a
- * pending interrupt when enabling interrupts:
+ * We can be coming here from child spawned by kernel_thread().
*/
- movl PT_EIP(%esp), %eax
- cmpl $xen_iret_start_crit, %eax
- jb 1f
- cmpl $xen_iret_end_crit, %eax
- jae 1f
-
- jmp xen_iret_crit_fixup
-
-ENTRY(xen_do_upcall)
-1: mov %esp, %eax
- call xen_evtchn_do_upcall
-#ifndef CONFIG_PREEMPT
- call xen_maybe_preempt_hcall
-#endif
- jmp ret_from_intr
-ENDPROC(xen_hypervisor_callback)
-
-/*
- * Hypervisor uses this for application faults while it executes.
- * We get here for two reasons:
- * 1. Fault while reloading DS, ES, FS or GS
- * 2. Fault while executing IRET
- * Category 1 we fix up by reattempting the load, and zeroing the segment
- * register if the load fails.
- * Category 2 we fix up by jumping to do_iret_error. We cannot use the
- * normal Linux return path in this case because if we use the IRET hypercall
- * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
- * We distinguish between categories by maintaining a status value in EAX.
- */
-ENTRY(xen_failsafe_callback)
- pushl %eax
- movl $1, %eax
-1: mov 4(%esp), %ds
-2: mov 8(%esp), %es
-3: mov 12(%esp), %fs
-4: mov 16(%esp), %gs
- /* EAX == 0 => Category 1 (Bad segment)
- EAX != 0 => Category 2 (Bad IRET) */
- testl %eax, %eax
- popl %eax
- lea 16(%esp), %esp
- jz 5f
- jmp iret_exc
-5: pushl $-1 /* orig_ax = -1 => not a system call */
- SAVE_ALL
- ENCODE_FRAME_POINTER
- jmp ret_from_exception
-
-.section .fixup, "ax"
-6: xorl %eax, %eax
- movl %eax, 4(%esp)
- jmp 1b
-7: xorl %eax, %eax
- movl %eax, 8(%esp)
- jmp 2b
-8: xorl %eax, %eax
- movl %eax, 12(%esp)
- jmp 3b
-9: xorl %eax, %eax
- movl %eax, 16(%esp)
- jmp 4b
-.previous
- _ASM_EXTABLE(1b, 6b)
- _ASM_EXTABLE(2b, 7b)
- _ASM_EXTABLE(3b, 8b)
- _ASM_EXTABLE(4b, 9b)
-ENDPROC(xen_failsafe_callback)
-
-BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
- xen_evtchn_do_upcall)
-
-#endif /* CONFIG_XEN */
-
-#if IS_ENABLED(CONFIG_HYPERV)
-
-BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
- hyperv_vector_handler)
-
-#endif /* CONFIG_HYPERV */
-
-#ifdef CONFIG_TRACING
-ENTRY(trace_page_fault)
- ASM_CLAC
- pushl $trace_do_page_fault
- jmp common_exception
-END(trace_page_fault)
+ movl PT_CS(%esp), %eax
+ andl $SEGMENT_RPL_MASK, %eax
#endif
+ cmpl $USER_RPL, %eax # returning to v8086 or userspace ?
+ jnb ret_to_user
-ENTRY(page_fault)
- ASM_CLAC
- pushl $do_page_fault
- ALIGN
- jmp common_exception
-END(page_fault)
+ PARANOID_EXIT_TO_KERNEL_MODE
+ BUG_IF_WRONG_CR3
+ RESTORE_REGS 4
+ jmp .Lirq_return
-common_exception:
- /* the function address is in %gs's slot on the stack */
- pushl %fs
- pushl %es
- pushl %ds
- pushl %eax
- pushl %ebp
- pushl %edi
- pushl %esi
- pushl %edx
- pushl %ecx
- pushl %ebx
- ENCODE_FRAME_POINTER
- cld
- movl $(__KERNEL_PERCPU), %ecx
- movl %ecx, %fs
- UNWIND_ESPFIX_STACK
- GS_TO_REG %ecx
- movl PT_GS(%esp), %edi # get the function address
- movl PT_ORIG_EAX(%esp), %edx # get the error code
- movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
- REG_TO_PTGS %ecx
- SET_KERNEL_GS %ecx
- movl $(__USER_DS), %ecx
- movl %ecx, %ds
- movl %ecx, %es
- TRACE_IRQS_OFF
- movl %esp, %eax # pt_regs pointer
- call *%edi
- jmp ret_from_exception
-END(common_exception)
+ret_to_user:
+ movl %esp, %eax
+ jmp restore_all_switch_stack
+SYM_CODE_END(handle_exception)
-ENTRY(debug)
+SYM_CODE_START(asm_exc_double_fault)
+1:
/*
- * #DB can happen at the first instruction of
- * entry_SYSENTER_32 or in Xen's SYSENTER prologue. If this
- * happens, then we will be running on a very small stack. We
- * need to detect this condition and switch to the thread
- * stack before calling any C code at all.
+ * This is a task gate handler, not an interrupt gate handler.
+ * The error code is on the stack, but the stack is otherwise
+ * empty. Interrupts are off. Our state is sane with the following
+ * exceptions:
+ *
+ * - CR0.TS is set. "TS" literally means "task switched".
+ * - EFLAGS.NT is set because we're a "nested task".
+ * - The doublefault TSS has back_link set and has been marked busy.
+ * - TR points to the doublefault TSS and the normal TSS is busy.
+ * - CR3 is the normal kernel PGD. This would be delightful, except
+ * that the CPU didn't bother to save the old CR3 anywhere. This
+ * would make it very awkward to return back to the context we came
+ * from.
*
- * If you edit this code, keep in mind that NMIs can happen in here.
+ * The rest of EFLAGS is sanitized for us, so we don't need to
+ * worry about AC or DF.
+ *
+ * Don't even bother popping the error code. It's always zero,
+ * and ignoring it makes us a bit more robust against buggy
+ * hypervisor task gate implementations.
+ *
+ * We will manually undo the task switch instead of doing a
+ * task-switching IRET.
*/
- ASM_CLAC
- pushl $-1 # mark this as an int
- SAVE_ALL
- ENCODE_FRAME_POINTER
- xorl %edx, %edx # error code 0
- movl %esp, %eax # pt_regs pointer
- /* Are we currently on the SYSENTER stack? */
- PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
- subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
- cmpl $SIZEOF_SYSENTER_stack, %ecx
- jb .Ldebug_from_sysenter_stack
+ clts /* clear CR0.TS */
+ pushl $X86_EFLAGS_FIXED
+ popfl /* clear EFLAGS.NT */
- TRACE_IRQS_OFF
- call do_debug
- jmp ret_from_exception
+ call doublefault_shim
-.Ldebug_from_sysenter_stack:
- /* We're on the SYSENTER stack. Switch off. */
- movl %esp, %ebx
- movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
- TRACE_IRQS_OFF
- call do_debug
- movl %ebx, %esp
- jmp ret_from_exception
-END(debug)
+ /* We don't support returning, so we have no IRET here. */
+1:
+ hlt
+ jmp 1b
+SYM_CODE_END(asm_exc_double_fault)
/*
* NMI is doubly nasty. It can happen on the first instruction of
@@ -982,9 +1115,14 @@ END(debug)
* switched stacks. We handle both conditions by simply checking whether we
* interrupted kernel code running on the SYSENTER stack.
*/
-ENTRY(nmi)
+SYM_CODE_START(asm_exc_nmi)
ASM_CLAC
+
#ifdef CONFIG_X86_ESPFIX32
+ /*
+ * ESPFIX_SS is only ever set on the return to user path
+ * after we've switched to the entry stack.
+ */
pushl %eax
movl %ss, %eax
cmpw $__ESPFIX_SS, %ax
@@ -993,20 +1131,21 @@ ENTRY(nmi)
#endif
pushl %eax # pt_regs->orig_ax
- SAVE_ALL
+ SAVE_ALL_NMI cr3_reg=%edi
ENCODE_FRAME_POINTER
xorl %edx, %edx # zero error code
movl %esp, %eax # pt_regs pointer
/* Are we currently on the SYSENTER stack? */
- PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
- subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
- cmpl $SIZEOF_SYSENTER_stack, %ecx
+ movl PER_CPU_VAR(cpu_entry_area), %ecx
+ addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
+ subl %eax, %ecx /* ecx = (end of entry_stack) - esp */
+ cmpl $SIZEOF_entry_stack, %ecx
jb .Lnmi_from_sysenter_stack
/* Not on SYSENTER stack. */
- call do_nmi
- jmp .Lrestore_all_notrace
+ call exc_nmi
+ jmp .Lnmi_return
.Lnmi_from_sysenter_stack:
/*
@@ -1015,66 +1154,73 @@ ENTRY(nmi)
*/
movl %esp, %ebx
movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
- call do_nmi
+ call exc_nmi
movl %ebx, %esp
- jmp .Lrestore_all_notrace
+
+.Lnmi_return:
+#ifdef CONFIG_X86_ESPFIX32
+ testl $CS_FROM_ESPFIX, PT_CS(%esp)
+ jnz .Lnmi_from_espfix
+#endif
+
+ CHECK_AND_APPLY_ESPFIX
+ RESTORE_ALL_NMI cr3_reg=%edi pop=4
+ CLEAR_CPU_BUFFERS
+ jmp .Lirq_return
#ifdef CONFIG_X86_ESPFIX32
.Lnmi_espfix_stack:
/*
- * create the pointer to lss back
+ * Create the pointer to LSS back
*/
pushl %ss
pushl %esp
addl $4, (%esp)
- /* copy the iret frame of 12 bytes */
- .rept 3
- pushl 16(%esp)
- .endr
- pushl %eax
- SAVE_ALL
- ENCODE_FRAME_POINTER
- FIXUP_ESPFIX_STACK # %eax == %esp
- xorl %edx, %edx # zero error code
- call do_nmi
- RESTORE_REGS
- lss 12+4(%esp), %esp # back to espfix stack
- jmp .Lirq_return
-#endif
-END(nmi)
-ENTRY(int3)
- ASM_CLAC
- pushl $-1 # mark this as an int
- SAVE_ALL
+ /* Copy the (short) IRET frame */
+ pushl 4*4(%esp) # flags
+ pushl 4*4(%esp) # cs
+ pushl 4*4(%esp) # ip
+
+ pushl %eax # orig_ax
+
+ SAVE_ALL_NMI cr3_reg=%edi unwind_espfix=1
ENCODE_FRAME_POINTER
- TRACE_IRQS_OFF
+
+ /* clear CS_FROM_KERNEL, set CS_FROM_ESPFIX */
+ xorl $(CS_FROM_ESPFIX | CS_FROM_KERNEL), PT_CS(%esp)
+
xorl %edx, %edx # zero error code
movl %esp, %eax # pt_regs pointer
- call do_int3
- jmp ret_from_exception
-END(int3)
+ jmp .Lnmi_from_sysenter_stack
-ENTRY(general_protection)
- pushl $do_general_protection
- jmp common_exception
-END(general_protection)
-
-#ifdef CONFIG_KVM_GUEST
-ENTRY(async_page_fault)
- ASM_CLAC
- pushl $do_async_page_fault
- jmp common_exception
-END(async_page_fault)
+.Lnmi_from_espfix:
+ RESTORE_ALL_NMI cr3_reg=%edi
+ /*
+ * Because we cleared CS_FROM_KERNEL, IRET_FRAME 'forgot' to
+ * fix up the gap and long frame:
+ *
+ * 3 - original frame (exception)
+ * 2 - ESPFIX block (above)
+ * 6 - gap (FIXUP_FRAME)
+ * 5 - long frame (FIXUP_FRAME)
+ * 1 - orig_ax
+ */
+ lss (1+5+6)*4(%esp), %esp # back to espfix stack
+ CLEAR_CPU_BUFFERS
+ jmp .Lirq_return
#endif
+SYM_CODE_END(asm_exc_nmi)
-ENTRY(rewind_stack_do_exit)
+.pushsection .text, "ax"
+SYM_CODE_START(rewind_stack_and_make_dead)
/* Prevent any naive code from trying to unwind to our caller. */
xorl %ebp, %ebp
movl PER_CPU_VAR(cpu_current_top_of_stack), %esi
leal -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp
- call do_exit
+ call make_task_dead
1: jmp 1b
-END(rewind_stack_do_exit)
+SYM_CODE_END(rewind_stack_and_make_dead)
+.popsection