summaryrefslogtreecommitdiff
path: root/arch/x86/entry
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/entry')
-rw-r--r--arch/x86/entry/Makefile15
-rw-r--r--arch/x86/entry/calling.h135
-rw-r--r--arch/x86/entry/common.c320
-rw-r--r--arch/x86/entry/entry.S67
-rw-r--r--arch/x86/entry/entry_32.S17
-rw-r--r--arch/x86/entry/entry_64.S247
-rw-r--r--arch/x86/entry/entry_64_compat.S116
-rw-r--r--arch/x86/entry/entry_64_fred.S151
-rw-r--r--arch/x86/entry/entry_fred.c296
-rw-r--r--arch/x86/entry/syscall_32.c358
-rw-r--r--arch/x86/entry/syscall_64.c133
-rw-r--r--arch/x86/entry/syscall_x32.c18
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl31
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl29
-rw-r--r--arch/x86/entry/thunk.S (renamed from arch/x86/entry/thunk_64.S)35
-rw-r--r--arch/x86/entry/thunk_32.S36
-rw-r--r--arch/x86/entry/vdso/Makefile104
-rwxr-xr-xarch/x86/entry/vdso/checkundef.sh10
-rw-r--r--arch/x86/entry/vdso/extable.h2
-rw-r--r--arch/x86/entry/vdso/vclock_gettime.c10
-rw-r--r--arch/x86/entry/vdso/vdso-layout.lds.S22
-rw-r--r--arch/x86/entry/vdso/vdso.lds.S2
-rw-r--r--arch/x86/entry/vdso/vdso2c.c21
-rw-r--r--arch/x86/entry/vdso/vdso2c.h20
-rw-r--r--arch/x86/entry/vdso/vdso32-setup.c17
-rw-r--r--arch/x86/entry/vdso/vgetrandom-chacha.S178
-rw-r--r--arch/x86/entry/vdso/vgetrandom.c15
-rw-r--r--arch/x86/entry/vdso/vma.c255
-rw-r--r--arch/x86/entry/vdso/vsgx.S1
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c49
30 files changed, 1640 insertions, 1070 deletions
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index ca2fe186994b..72cae8e0ce85 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -7,17 +7,20 @@ KASAN_SANITIZE := n
UBSAN_SANITIZE := n
KCOV_INSTRUMENT := n
-CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE)
-CFLAGS_common.o += -fno-stack-protector
+CFLAGS_syscall_32.o += -fno-stack-protector
+CFLAGS_syscall_64.o += -fno-stack-protector
obj-y := entry.o entry_$(BITS).o syscall_$(BITS).o
-obj-y += common.o
obj-y += vdso/
obj-y += vsyscall/
-obj-$(CONFIG_PREEMPTION) += thunk_$(BITS).o
-obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o
-obj-$(CONFIG_X86_X32_ABI) += syscall_x32.o
+obj-$(CONFIG_PREEMPTION) += thunk.o
+CFLAGS_entry_fred.o += -fno-stack-protector
+CFLAGS_REMOVE_entry_fred.o += -pg $(CC_FLAGS_FTRACE)
+obj-$(CONFIG_X86_FRED) += entry_64_fred.o entry_fred.o
+obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index f6907627172b..77e2d920a640 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -65,11 +65,13 @@ For 32-bit we have the following conventions - kernel is built with
* for assembly code:
*/
-.macro PUSH_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0
+.macro PUSH_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 unwind_hint=1
.if \save_ret
pushq %rsi /* pt_regs->si */
movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */
movq %rdi, 8(%rsp) /* pt_regs->di (overwriting original return address) */
+ /* We just clobbered the return address - use the IRET frame for unwinding: */
+ UNWIND_HINT_IRET_REGS offset=3*8
.else
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
@@ -87,14 +89,17 @@ For 32-bit we have the following conventions - kernel is built with
pushq %r13 /* pt_regs->r13 */
pushq %r14 /* pt_regs->r14 */
pushq %r15 /* pt_regs->r15 */
+
+ .if \unwind_hint
UNWIND_HINT_REGS
+ .endif
.if \save_ret
pushq %rsi /* return address on top of stack */
.endif
.endm
-.macro CLEAR_REGS
+.macro CLEAR_REGS clear_callee=1
/*
* Sanitize registers of values that a speculation attack might
* otherwise want to exploit. The lower registers are likely clobbered
@@ -108,18 +113,19 @@ For 32-bit we have the following conventions - kernel is built with
xorl %r9d, %r9d /* nospec r9 */
xorl %r10d, %r10d /* nospec r10 */
xorl %r11d, %r11d /* nospec r11 */
+ .if \clear_callee
xorl %ebx, %ebx /* nospec rbx */
xorl %ebp, %ebp /* nospec rbp */
xorl %r12d, %r12d /* nospec r12 */
xorl %r13d, %r13d /* nospec r13 */
xorl %r14d, %r14d /* nospec r14 */
xorl %r15d, %r15d /* nospec r15 */
-
+ .endif
.endm
-.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0
- PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret
- CLEAR_REGS
+.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 clear_callee=1 unwind_hint=1
+ PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret unwind_hint=\unwind_hint
+ CLEAR_REGS clear_callee=\clear_callee
.endm
.macro POP_REGS pop_rdi=1
@@ -142,10 +148,10 @@ For 32-bit we have the following conventions - kernel is built with
.endif
.endm
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
/*
- * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
+ * MITIGATION_PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
* halves:
*/
#define PTI_USER_PGTABLE_BIT PAGE_SHIFT
@@ -160,7 +166,7 @@ For 32-bit we have the following conventions - kernel is built with
.macro ADJUST_KERNEL_CR3 reg:req
ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
- /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
+ /* Clear PCID and "MITIGATION_PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
andq $(~PTI_USER_PGTABLE_AND_PCID_MASK), \reg
.endm
@@ -173,10 +179,9 @@ For 32-bit we have the following conventions - kernel is built with
.endm
#define THIS_CPU_user_pcid_flush_mask \
- PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
+ PER_CPU_VAR(cpu_tlbstate + TLB_STATE_user_pcid_flush_mask)
-.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
- ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+.macro SWITCH_TO_USER_CR3 scratch_reg:req scratch_reg2:req
mov %cr3, \scratch_reg
ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
@@ -206,13 +211,20 @@ For 32-bit we have the following conventions - kernel is built with
/* Flip the PGD to the user version */
orq $(PTI_USER_PGTABLE_MASK), \scratch_reg
mov \scratch_reg, %cr3
+.endm
+
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ SWITCH_TO_USER_CR3 \scratch_reg \scratch_reg2
.Lend_\@:
.endm
.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
pushq %rax
- SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
+ SWITCH_TO_USER_CR3 scratch_reg=\scratch_reg scratch_reg2=%rax
popq %rax
+.Lend_\@:
.endm
.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
@@ -233,17 +245,19 @@ For 32-bit we have the following conventions - kernel is built with
.Ldone_\@:
.endm
-.macro RESTORE_CR3 scratch_reg:req save_reg:req
+/* Restore CR3 from a kernel context. May restore a user CR3 value. */
+.macro PARANOID_RESTORE_CR3 scratch_reg:req save_reg:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
- ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
-
/*
- * KERNEL pages can always resume with NOFLUSH as we do
- * explicit flushes.
+ * If CR3 contained the kernel page tables at the paranoid exception
+ * entry, then there is nothing to restore as CR3 is not modified while
+ * handling the exception.
*/
bt $PTI_USER_PGTABLE_BIT, \save_reg
- jnc .Lnoflush_\@
+ jnc .Lend_\@
+
+ ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
/*
* Check if there's a pending flush for the user ASID we're
@@ -251,25 +265,17 @@ For 32-bit we have the following conventions - kernel is built with
*/
movq \save_reg, \scratch_reg
andq $(0x7FF), \scratch_reg
- bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
- jnc .Lnoflush_\@
-
btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
- jmp .Lwrcr3_\@
+ jc .Lwrcr3_\@
-.Lnoflush_\@:
SET_NOFLUSH_BIT \save_reg
.Lwrcr3_\@:
- /*
- * The CR3 write could be avoided when not changing its value,
- * but would require a CR3 read *and* a scratch register.
- */
movq \save_reg, %cr3
.Lend_\@:
.endm
-#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
+#else /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION=n: */
.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
.endm
@@ -279,7 +285,7 @@ For 32-bit we have the following conventions - kernel is built with
.endm
.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
.endm
-.macro RESTORE_CR3 scratch_reg:req save_reg:req
+.macro PARANOID_RESTORE_CR3 scratch_reg:req save_reg:req
.endm
#endif
@@ -297,7 +303,7 @@ For 32-bit we have the following conventions - kernel is built with
* Assumes x86_spec_ctrl_{base,current} to have SPEC_CTRL_IBRS set.
*/
.macro IBRS_ENTER save_reg
-#ifdef CONFIG_CPU_IBRS_ENTRY
+#ifdef CONFIG_MITIGATION_IBRS_ENTRY
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS
movl $MSR_IA32_SPEC_CTRL, %ecx
@@ -326,7 +332,7 @@ For 32-bit we have the following conventions - kernel is built with
* regs. Must be called after the last RET.
*/
.macro IBRS_EXIT save_reg
-#ifdef CONFIG_CPU_IBRS_ENTRY
+#ifdef CONFIG_MITIGATION_IBRS_ENTRY
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS
movl $MSR_IA32_SPEC_CTRL, %ecx
@@ -362,7 +368,7 @@ For 32-bit we have the following conventions - kernel is built with
.endm
.macro STACKLEAK_ERASE_NOCLOBBER
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#ifdef CONFIG_KSTACK_ERASE
PUSH_AND_CLEAR_REGS
call stackleak_erase
POP_REGS
@@ -381,7 +387,7 @@ For 32-bit we have the following conventions - kernel is built with
#endif /* !CONFIG_X86_64 */
.macro STACKLEAK_ERASE
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#ifdef CONFIG_KSTACK_ERASE
call stackleak_erase
#endif
.endm
@@ -420,3 +426,64 @@ For 32-bit we have the following conventions - kernel is built with
.endm
#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_X86_64
+
+/* rdi: arg1 ... normal C conventions. rax is saved/restored. */
+.macro THUNK name, func
+SYM_FUNC_START(\name)
+ ANNOTATE_NOENDBR
+ pushq %rbp
+ movq %rsp, %rbp
+
+ pushq %rdi
+ pushq %rsi
+ pushq %rdx
+ pushq %rcx
+ pushq %rax
+ pushq %r8
+ pushq %r9
+ pushq %r10
+ pushq %r11
+
+ call \func
+
+ popq %r11
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rax
+ popq %rcx
+ popq %rdx
+ popq %rsi
+ popq %rdi
+ popq %rbp
+ RET
+SYM_FUNC_END(\name)
+ _ASM_NOKPROBE(\name)
+.endm
+
+#else /* CONFIG_X86_32 */
+
+/* put return address in eax (arg1) */
+.macro THUNK name, func, put_ret_addr_in_eax=0
+SYM_CODE_START_NOALIGN(\name)
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+
+ .if \put_ret_addr_in_eax
+ /* Place EIP in the arg1 */
+ movl 3*4(%esp), %eax
+ .endif
+
+ call \func
+ popl %edx
+ popl %ecx
+ popl %eax
+ RET
+ _ASM_NOKPROBE(\name)
+SYM_CODE_END(\name)
+ .endm
+
+#endif
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
deleted file mode 100644
index 6c2826417b33..000000000000
--- a/arch/x86/entry/common.c
+++ /dev/null
@@ -1,320 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * common.c - C code for kernel entry and exit
- * Copyright (c) 2015 Andrew Lutomirski
- *
- * Based on asm and ptrace code by many authors. The code here originated
- * in ptrace.c and signal.c.
- */
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/sched/task_stack.h>
-#include <linux/entry-common.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/errno.h>
-#include <linux/ptrace.h>
-#include <linux/export.h>
-#include <linux/nospec.h>
-#include <linux/syscalls.h>
-#include <linux/uaccess.h>
-
-#ifdef CONFIG_XEN_PV
-#include <xen/xen-ops.h>
-#include <xen/events.h>
-#endif
-
-#include <asm/desc.h>
-#include <asm/traps.h>
-#include <asm/vdso.h>
-#include <asm/cpufeature.h>
-#include <asm/fpu/api.h>
-#include <asm/nospec-branch.h>
-#include <asm/io_bitmap.h>
-#include <asm/syscall.h>
-#include <asm/irq_stack.h>
-
-#ifdef CONFIG_X86_64
-
-static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
-{
- /*
- * Convert negative numbers to very high and thus out of range
- * numbers for comparisons.
- */
- unsigned int unr = nr;
-
- if (likely(unr < NR_syscalls)) {
- unr = array_index_nospec(unr, NR_syscalls);
- regs->ax = sys_call_table[unr](regs);
- return true;
- }
- return false;
-}
-
-static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
-{
- /*
- * Adjust the starting offset of the table, and convert numbers
- * < __X32_SYSCALL_BIT to very high and thus out of range
- * numbers for comparisons.
- */
- unsigned int xnr = nr - __X32_SYSCALL_BIT;
-
- if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
- xnr = array_index_nospec(xnr, X32_NR_syscalls);
- regs->ax = x32_sys_call_table[xnr](regs);
- return true;
- }
- return false;
-}
-
-__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
-{
- add_random_kstack_offset();
- nr = syscall_enter_from_user_mode(regs, nr);
-
- instrumentation_begin();
-
- if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
- /* Invalid system call, but still a system call. */
- regs->ax = __x64_sys_ni_syscall(regs);
- }
-
- instrumentation_end();
- syscall_exit_to_user_mode(regs);
-}
-#endif
-
-#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
-static __always_inline int syscall_32_enter(struct pt_regs *regs)
-{
- if (IS_ENABLED(CONFIG_IA32_EMULATION))
- current_thread_info()->status |= TS_COMPAT;
-
- return (int)regs->orig_ax;
-}
-
-/*
- * Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL.
- */
-static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
-{
- /*
- * Convert negative numbers to very high and thus out of range
- * numbers for comparisons.
- */
- unsigned int unr = nr;
-
- if (likely(unr < IA32_NR_syscalls)) {
- unr = array_index_nospec(unr, IA32_NR_syscalls);
- regs->ax = ia32_sys_call_table[unr](regs);
- } else if (nr != -1) {
- regs->ax = __ia32_sys_ni_syscall(regs);
- }
-}
-
-/* Handles int $0x80 */
-__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
-{
- int nr = syscall_32_enter(regs);
-
- add_random_kstack_offset();
- /*
- * Subtlety here: if ptrace pokes something larger than 2^31-1 into
- * orig_ax, the int return value truncates it. This matches
- * the semantics of syscall_get_nr().
- */
- nr = syscall_enter_from_user_mode(regs, nr);
- instrumentation_begin();
-
- do_syscall_32_irqs_on(regs, nr);
-
- instrumentation_end();
- syscall_exit_to_user_mode(regs);
-}
-
-static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
-{
- int nr = syscall_32_enter(regs);
- int res;
-
- add_random_kstack_offset();
- /*
- * This cannot use syscall_enter_from_user_mode() as it has to
- * fetch EBP before invoking any of the syscall entry work
- * functions.
- */
- syscall_enter_from_user_mode_prepare(regs);
-
- instrumentation_begin();
- /* Fetch EBP from where the vDSO stashed it. */
- if (IS_ENABLED(CONFIG_X86_64)) {
- /*
- * Micro-optimization: the pointer we're following is
- * explicitly 32 bits, so it can't be out of range.
- */
- res = __get_user(*(u32 *)&regs->bp,
- (u32 __user __force *)(unsigned long)(u32)regs->sp);
- } else {
- res = get_user(*(u32 *)&regs->bp,
- (u32 __user __force *)(unsigned long)(u32)regs->sp);
- }
-
- if (res) {
- /* User code screwed up. */
- regs->ax = -EFAULT;
-
- local_irq_disable();
- instrumentation_end();
- irqentry_exit_to_user_mode(regs);
- return false;
- }
-
- nr = syscall_enter_from_user_mode_work(regs, nr);
-
- /* Now this is just like a normal syscall. */
- do_syscall_32_irqs_on(regs, nr);
-
- instrumentation_end();
- syscall_exit_to_user_mode(regs);
- return true;
-}
-
-/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
-__visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
-{
- /*
- * Called using the internal vDSO SYSENTER/SYSCALL32 calling
- * convention. Adjust regs so it looks like we entered using int80.
- */
- unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
- vdso_image_32.sym_int80_landing_pad;
-
- /*
- * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
- * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
- * Fix it up.
- */
- regs->ip = landing_pad;
-
- /* Invoke the syscall. If it failed, keep it simple: use IRET. */
- if (!__do_fast_syscall_32(regs))
- return 0;
-
-#ifdef CONFIG_X86_64
- /*
- * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
- * SYSRETL is available on all 64-bit CPUs, so we don't need to
- * bother with SYSEXIT.
- *
- * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
- * because the ECX fixup above will ensure that this is essentially
- * never the case.
- */
- return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
- regs->ip == landing_pad &&
- (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
-#else
- /*
- * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
- *
- * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
- * because the ECX fixup above will ensure that this is essentially
- * never the case.
- *
- * We don't allow syscalls at all from VM86 mode, but we still
- * need to check VM, because we might be returning from sys_vm86.
- */
- return static_cpu_has(X86_FEATURE_SEP) &&
- regs->cs == __USER_CS && regs->ss == __USER_DS &&
- regs->ip == landing_pad &&
- (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
-#endif
-}
-
-/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
-__visible noinstr long do_SYSENTER_32(struct pt_regs *regs)
-{
- /* SYSENTER loses RSP, but the vDSO saved it in RBP. */
- regs->sp = regs->bp;
-
- /* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */
- regs->flags |= X86_EFLAGS_IF;
-
- return do_fast_syscall_32(regs);
-}
-#endif
-
-SYSCALL_DEFINE0(ni_syscall)
-{
- return -ENOSYS;
-}
-
-#ifdef CONFIG_XEN_PV
-#ifndef CONFIG_PREEMPTION
-/*
- * Some hypercalls issued by the toolstack can take many 10s of
- * seconds. Allow tasks running hypercalls via the privcmd driver to
- * be voluntarily preempted even if full kernel preemption is
- * disabled.
- *
- * Such preemptible hypercalls are bracketed by
- * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
- * calls.
- */
-DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
-EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
-
-/*
- * In case of scheduling the flag must be cleared and restored after
- * returning from schedule as the task might move to a different CPU.
- */
-static __always_inline bool get_and_clear_inhcall(void)
-{
- bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
-
- __this_cpu_write(xen_in_preemptible_hcall, false);
- return inhcall;
-}
-
-static __always_inline void restore_inhcall(bool inhcall)
-{
- __this_cpu_write(xen_in_preemptible_hcall, inhcall);
-}
-#else
-static __always_inline bool get_and_clear_inhcall(void) { return false; }
-static __always_inline void restore_inhcall(bool inhcall) { }
-#endif
-
-static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
-{
- struct pt_regs *old_regs = set_irq_regs(regs);
-
- inc_irq_stat(irq_hv_callback_count);
-
- xen_hvm_evtchn_do_upcall();
-
- set_irq_regs(old_regs);
-}
-
-__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
-{
- irqentry_state_t state = irqentry_enter(regs);
- bool inhcall;
-
- instrumentation_begin();
- run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);
-
- inhcall = get_and_clear_inhcall();
- if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
- irqentry_exit_cond_resched();
- instrumentation_end();
- restore_inhcall(inhcall);
- } else {
- instrumentation_end();
- irqentry_exit(regs, state);
- }
-}
-#endif /* CONFIG_XEN_PV */
diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S
index bfb7bcb362bc..6ba2b3adcef0 100644
--- a/arch/x86/entry/entry.S
+++ b/arch/x86/entry/entry.S
@@ -3,20 +3,75 @@
* Common place for both 32- and 64-bit entry routines.
*/
+#include <linux/export.h>
+#include <linux/kvm_types.h>
#include <linux/linkage.h>
-#include <asm/export.h>
+#include <linux/objtool.h>
#include <asm/msr-index.h>
+#include <asm/unwind_hints.h>
+#include <asm/segment.h>
+#include <asm/cache.h>
+#include <asm/cpufeatures.h>
+#include <asm/nospec-branch.h>
+
+#include "calling.h"
.pushsection .noinstr.text, "ax"
-SYM_FUNC_START(entry_ibpb)
+/* Clobbers AX, CX, DX */
+SYM_FUNC_START(write_ibpb)
+ ANNOTATE_NOENDBR
movl $MSR_IA32_PRED_CMD, %ecx
- movl $PRED_CMD_IBPB, %eax
+ movl _ASM_RIP(x86_pred_cmd), %eax
xorl %edx, %edx
wrmsr
+
+ /* Make sure IBPB clears return stack preductions too. */
+ FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_BUG_IBPB_NO_RET
+ RET
+SYM_FUNC_END(write_ibpb)
+EXPORT_SYMBOL_FOR_KVM(write_ibpb);
+
+SYM_FUNC_START(__WARN_trap)
+ ANNOTATE_NOENDBR
+ ANNOTATE_REACHABLE
+ ud1 (%edx), %_ASM_ARG1
RET
-SYM_FUNC_END(entry_ibpb)
-/* For KVM */
-EXPORT_SYMBOL_GPL(entry_ibpb);
+SYM_FUNC_END(__WARN_trap)
+EXPORT_SYMBOL(__WARN_trap)
+
+.popsection
+
+/*
+ * Define the VERW operand that is disguised as entry code so that
+ * it can be referenced with KPTI enabled. This ensures VERW can be
+ * used late in exit-to-user path after page tables are switched.
+ */
+.pushsection .entry.text, "ax"
+
+.align L1_CACHE_BYTES, 0xcc
+SYM_CODE_START_NOALIGN(x86_verw_sel)
+ UNWIND_HINT_UNDEFINED
+ ANNOTATE_NOENDBR
+ .word __KERNEL_DS
+.align L1_CACHE_BYTES, 0xcc
+SYM_CODE_END(x86_verw_sel);
+EXPORT_SYMBOL_FOR_KVM(x86_verw_sel);
.popsection
+
+THUNK warn_thunk_thunk, __warn_thunk
+
+/*
+ * Clang's implementation of TLS stack cookies requires the variable in
+ * question to be a TLS variable. If the variable happens to be defined as an
+ * ordinary variable with external linkage in the same compilation unit (which
+ * amounts to the whole of vmlinux with LTO enabled), Clang will drop the
+ * segment register prefix from the references, resulting in broken code. Work
+ * around this by avoiding the symbol used in -mstack-protector-guard-symbol=
+ * entirely in the C code, and use an alias emitted by the linker script
+ * instead.
+ */
+#if defined(CONFIG_STACKPROTECTOR) && defined(CONFIG_SMP)
+EXPORT_SYMBOL(__ref_stack_chk_guard);
+#endif
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 6e6af42e044a..92c0b4a94e0a 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -305,7 +305,7 @@
.macro CHECK_AND_APPLY_ESPFIX
#ifdef CONFIG_X86_ESPFIX32
#define GDT_ESPFIX_OFFSET (GDT_ENTRY_ESPFIX_SS * 8)
-#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + GDT_ESPFIX_OFFSET
+#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page + GDT_ESPFIX_OFFSET)
ALTERNATIVE "jmp .Lend_\@", "", X86_BUG_ESPFIX
@@ -649,10 +649,6 @@ SYM_CODE_START_LOCAL(asm_\cfunc)
SYM_CODE_END(asm_\cfunc)
.endm
-.macro idtentry_sysvec vector cfunc
- idtentry \vector asm_\cfunc \cfunc has_error_code=0
-.endm
-
/*
* Include the defines which emit the idt entries which are shared
* shared between 32 and 64 bit and emit the __irqentry_text_* markers
@@ -837,7 +833,7 @@ SYM_FUNC_START(entry_SYSENTER_32)
movl %esp, %eax
call do_SYSENTER_32
- testl %eax, %eax
+ testb %al, %al
jz .Lsyscall_32_done
STACKLEAK_ERASE
@@ -875,6 +871,8 @@ SYM_FUNC_START(entry_SYSENTER_32)
/* Now ready to switch the cr3 */
SWITCH_TO_USER_CR3 scratch_reg=%eax
+ /* Clobbers ZF */
+ CLEAR_CPU_BUFFERS
/*
* Restore all flags except IF. (We restore IF separately because
@@ -954,6 +952,7 @@ restore_all_switch_stack:
/* Restore user state */
RESTORE_REGS pop=4 # skip orig_eax/error_code
+ CLEAR_CPU_BUFFERS
.Lirq_return:
/*
* ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
@@ -1154,7 +1153,7 @@ SYM_CODE_START(asm_exc_nmi)
* is using the thread stack right now, so it's safe for us to use it.
*/
movl %esp, %ebx
- movl PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %esp
+ movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
call exc_nmi
movl %ebx, %esp
@@ -1166,6 +1165,7 @@ SYM_CODE_START(asm_exc_nmi)
CHECK_AND_APPLY_ESPFIX
RESTORE_ALL_NMI cr3_reg=%edi pop=4
+ CLEAR_CPU_BUFFERS
jmp .Lirq_return
#ifdef CONFIG_X86_ESPFIX32
@@ -1207,6 +1207,7 @@ SYM_CODE_START(asm_exc_nmi)
* 1 - orig_ax
*/
lss (1+5+6)*4(%esp), %esp # back to espfix stack
+ CLEAR_CPU_BUFFERS
jmp .Lirq_return
#endif
SYM_CODE_END(asm_exc_nmi)
@@ -1216,7 +1217,7 @@ SYM_CODE_START(rewind_stack_and_make_dead)
/* Prevent any naive code from trying to unwind to our caller. */
xorl %ebp, %ebp
- movl PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %esi
+ movl PER_CPU_VAR(cpu_current_top_of_stack), %esi
leal -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp
call make_task_dead
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 43606de22511..f9983a1907bf 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -18,6 +18,8 @@
* - SYM_FUNC_START/END:Define functions in the symbol table.
* - idtentry: Define exception entry points.
*/
+#include <linux/export.h>
+#include <linux/kvm_types.h>
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/cache.h>
@@ -34,7 +36,6 @@
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/pgtable_types.h>
-#include <asm/export.h>
#include <asm/frame.h>
#include <asm/trapnr.h>
#include <asm/nospec-branch.h>
@@ -92,7 +93,7 @@ SYM_CODE_START(entry_SYSCALL_64)
/* tss.sp2 is scratch space. */
movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
- movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
@@ -116,6 +117,7 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
/* clobbers %rax, make sure it is after saving the syscall nr */
IBRS_ENTER
UNTRAIN_RET
+ CLEAR_BRANCH_HISTORY
call do_syscall_64 /* returns with IRQs disabled */
@@ -126,70 +128,8 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
* In the Xen PV case we must use iret anyway.
*/
- ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \
- X86_FEATURE_XENPV
-
- movq RCX(%rsp), %rcx
- movq RIP(%rsp), %r11
-
- cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */
- jne swapgs_restore_regs_and_return_to_usermode
-
- /*
- * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
- * in kernel space. This essentially lets the user take over
- * the kernel, since userspace controls RSP.
- *
- * If width of "canonical tail" ever becomes variable, this will need
- * to be updated to remain correct on both old and new CPUs.
- *
- * Change top bits to match most significant bit (47th or 56th bit
- * depending on paging mode) in the address.
- */
-#ifdef CONFIG_X86_5LEVEL
- ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
- "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
-#else
- shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
- sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
-#endif
-
- /* If this changed %rcx, it was not canonical */
- cmpq %rcx, %r11
- jne swapgs_restore_regs_and_return_to_usermode
-
- cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */
- jne swapgs_restore_regs_and_return_to_usermode
-
- movq R11(%rsp), %r11
- cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */
- jne swapgs_restore_regs_and_return_to_usermode
-
- /*
- * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
- * restore RF properly. If the slowpath sets it for whatever reason, we
- * need to restore it correctly.
- *
- * SYSRET can restore TF, but unlike IRET, restoring TF results in a
- * trap from userspace immediately after SYSRET. This would cause an
- * infinite loop whenever #DB happens with register state that satisfies
- * the opportunistic SYSRET conditions. For example, single-stepping
- * this user code:
- *
- * movq $stuck_here, %rcx
- * pushfq
- * popq %r11
- * stuck_here:
- *
- * would never get past 'stuck_here'.
- */
- testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
- jnz swapgs_restore_regs_and_return_to_usermode
-
- /* nothing to check for RSP */
-
- cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */
- jne swapgs_restore_regs_and_return_to_usermode
+ ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
+ "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
/*
* We win! This label is here just for ease of understanding
@@ -223,6 +163,7 @@ syscall_return_via_sysret:
SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
swapgs
+ CLEAR_CPU_BUFFERS
sysretq
SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
@@ -235,6 +176,7 @@ SYM_CODE_END(entry_SYSCALL_64)
*/
.pushsection .text, "ax"
SYM_FUNC_START(__switch_to_asm)
+ ANNOTATE_NOENDBR
/*
* Save callee-saved registers
* This must match the order in inactive_task_frame
@@ -252,7 +194,7 @@ SYM_FUNC_START(__switch_to_asm)
#ifdef CONFIG_STACKPROTECTOR
movq TASK_stack_canary(%rsi), %rbx
- movq %rbx, PER_CPU_VAR(fixed_percpu_data) + FIXED_stack_canary
+ movq %rbx, PER_CPU_VAR(__stack_chk_guard)
#endif
/*
@@ -309,7 +251,13 @@ SYM_CODE_START(ret_from_fork_asm)
* and unwind should work normally.
*/
UNWIND_HINT_REGS
+
+#ifdef CONFIG_X86_FRED
+ ALTERNATIVE "jmp swapgs_restore_regs_and_return_to_usermode", \
+ "jmp asm_fred_exit_user", X86_FEATURE_FRED
+#else
jmp swapgs_restore_regs_and_return_to_usermode
+#endif
SYM_CODE_END(ret_from_fork_asm)
.popsection
@@ -362,10 +310,9 @@ SYM_CODE_END(xen_error_entry)
movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
.endif
- call \cfunc
-
/* For some configurations \cfunc ends up being a noreturn. */
- REACHABLE
+ ANNOTATE_REACHABLE
+ call \cfunc
jmp error_return
.endm
@@ -432,14 +379,6 @@ SYM_CODE_END(\asmsym)
idtentry \vector asm_\cfunc \cfunc has_error_code=1
.endm
-/*
- * System vectors which invoke their handlers directly and are not
- * going through the regular common device interrupt handling code.
- */
-.macro idtentry_sysvec vector cfunc
- idtentry \vector asm_\cfunc \cfunc has_error_code=0
-.endm
-
/**
* idtentry_mce_db - Macro to generate entry stubs for #MC and #DB
* @vector: Vector number
@@ -591,10 +530,10 @@ SYM_CODE_START(\asmsym)
movq %rsp, %rdi /* pt_regs pointer into first argument */
movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
- call \cfunc
/* For some configurations \cfunc ends up being a noreturn. */
- REACHABLE
+ ANNOTATE_REACHABLE
+ call \cfunc
jmp paranoid_exit
@@ -621,17 +560,28 @@ __irqentry_text_end:
SYM_CODE_START_LOCAL(common_interrupt_return)
SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
IBRS_EXIT
-#ifdef CONFIG_DEBUG_ENTRY
- /* Assert that pt_regs indicates user mode. */
- testb $3, CS(%rsp)
- jnz 1f
- ud2
-1:
-#endif
#ifdef CONFIG_XEN_PV
ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
#endif
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+ ALTERNATIVE "", "jmp .Lpti_restore_regs_and_return_to_usermode", X86_FEATURE_PTI
+#endif
+
+ STACKLEAK_ERASE
+ POP_REGS
+ add $8, %rsp /* orig_ax */
+ UNWIND_HINT_IRET_REGS
+
+.Lswapgs_and_iret:
+ swapgs
+ CLEAR_CPU_BUFFERS
+ /* Assert that the IRET frame indicates user mode. */
+ testb $3, 8(%rsp)
+ jnz .Lnative_iret
+ ud2
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+.Lpti_restore_regs_and_return_to_usermode:
POP_REGS pop_rdi=0
/*
@@ -658,13 +608,14 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
*/
STACKLEAK_ERASE_NOCLOBBER
- SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+ push %rax
+ SWITCH_TO_USER_CR3 scratch_reg=%rdi scratch_reg2=%rax
+ pop %rax
/* Restore RDI. */
popq %rdi
- swapgs
- jmp .Lnative_iret
-
+ jmp .Lswapgs_and_iret
+#endif
SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
#ifdef CONFIG_DEBUG_ENTRY
@@ -774,6 +725,8 @@ native_irq_return_ldt:
*/
popq %rax /* Restore user RAX */
+ CLEAR_CPU_BUFFERS
+
/*
* RSP now points to an ordinary IRET frame, except that the page
* is read-only and RSP[31:16] are preloaded with the userspace
@@ -791,6 +744,7 @@ _ASM_NOKPROBE(common_interrupt_return)
* Is in entry.text as it shouldn't be instrumented.
*/
SYM_FUNC_START(asm_load_gs_index)
+ ANNOTATE_NOENDBR
FRAME_BEGIN
swapgs
.Lgs_change:
@@ -1019,14 +973,14 @@ SYM_CODE_START_LOCAL(paranoid_exit)
IBRS_EXIT save_reg=%r15
/*
- * The order of operations is important. RESTORE_CR3 requires
+ * The order of operations is important. PARANOID_RESTORE_CR3 requires
* kernel GSBASE.
*
* NB to anyone to try to optimize this code: this code does
* not execute at all for exceptions from user mode. Those
* exceptions go through error_return instead.
*/
- RESTORE_CR3 scratch_reg=%rax save_reg=%r14
+ PARANOID_RESTORE_CR3 scratch_reg=%rax save_reg=%r14
/* Handle the three GSBASE cases */
ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE
@@ -1147,7 +1101,7 @@ SYM_CODE_END(error_return)
*
* Registers:
* %r14: Used to save/restore the CR3 of the interrupted context
- * when PAGE_TABLE_ISOLATION is in use. Do not clobber.
+ * when MITIGATION_PAGE_TABLE_ISOLATION is in use. Do not clobber.
*/
SYM_CODE_START(asm_exc_nmi)
UNWIND_HINT_IRET_ENTRY
@@ -1163,8 +1117,8 @@ SYM_CODE_START(asm_exc_nmi)
* anyway.
*
* To handle this case we do the following:
- * Check the a special location on the stack that contains
- * a variable that is set when NMIs are executing.
+ * Check a special location on the stack that contains a
+ * variable that is set when NMIs are executing.
* The interrupted task's stack is also checked to see if it
* is an NMI stack.
* If the variable is not set and the stack is not the NMI
@@ -1215,7 +1169,7 @@ SYM_CODE_START(asm_exc_nmi)
FENCE_SWAPGS_USER_ENTRY
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
movq %rsp, %rdx
- movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
UNWIND_HINT_IRET_REGS base=%rdx offset=8
pushq 5*8(%rdx) /* pt_regs->ss */
pushq 4*8(%rdx) /* pt_regs->rsp */
@@ -1237,7 +1191,6 @@ SYM_CODE_START(asm_exc_nmi)
*/
movq %rsp, %rdi
- movq $-1, %rsi
call exc_nmi
/*
@@ -1295,8 +1248,8 @@ SYM_CODE_START(asm_exc_nmi)
* end_repeat_nmi, then we are a nested NMI. We must not
* modify the "iret" frame because it's being written by
* the outer NMI. That's okay; the outer NMI handler is
- * about to about to call exc_nmi() anyway, so we can just
- * resume the outer NMI.
+ * about to call exc_nmi() anyway, so we can just resume
+ * the outer NMI.
*/
movq $repeat_nmi, %rdx
@@ -1451,14 +1404,12 @@ end_repeat_nmi:
UNWIND_HINT_REGS
movq %rsp, %rdi
- movq $-1, %rsi
call exc_nmi
/* Always restore stashed SPEC_CTRL value (see paranoid_entry) */
IBRS_EXIT save_reg=%r15
- /* Always restore stashed CR3 value (see paranoid_entry) */
- RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
+ PARANOID_RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
/*
* The above invocation of paranoid_entry stored the GSBASE
@@ -1503,6 +1454,12 @@ nmi_restore:
movq $0, 5*8(%rsp) /* clear "NMI executing" */
/*
+ * Skip CLEAR_CPU_BUFFERS here, since it only helps in rare cases like
+ * NMI in kernel after user state is restored. For an unprivileged user
+ * these conditions are hard to meet.
+ */
+
+ /*
* iretq reads the "iret" frame and exits the NMI stack in a
* single instruction. We are returning to kernel mode, so this
* cannot result in a fault. Similarly, we don't need to worry
@@ -1511,18 +1468,17 @@ nmi_restore:
iretq
SYM_CODE_END(asm_exc_nmi)
-#ifndef CONFIG_IA32_EMULATION
/*
* This handles SYSCALL from 32-bit code. There is no way to program
* MSRs to fully disable 32-bit SYSCALL.
*/
-SYM_CODE_START(ignore_sysret)
+SYM_CODE_START(entry_SYSCALL32_ignore)
UNWIND_HINT_END_OF_STACK
ENDBR
mov $-ENOSYS, %eax
+ CLEAR_CPU_BUFFERS
sysretl
-SYM_CODE_END(ignore_sysret)
-#endif
+SYM_CODE_END(entry_SYSCALL32_ignore)
.pushsection .text, "ax"
__FUNC_ALIGN
@@ -1531,10 +1487,85 @@ SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead)
/* Prevent any naive code from trying to unwind to our caller. */
xorl %ebp, %ebp
- movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rax
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rax
leaq -PTREGS_SIZE(%rax), %rsp
UNWIND_HINT_REGS
call make_task_dead
SYM_CODE_END(rewind_stack_and_make_dead)
.popsection
+
+/*
+ * This sequence executes branches in order to remove user branch information
+ * from the branch history tracker in the Branch Predictor, therefore removing
+ * user influence on subsequent BTB lookups.
+ *
+ * It should be used on parts prior to Alder Lake. Newer parts should use the
+ * BHI_DIS_S hardware control instead. If a pre-Alder Lake part is being
+ * virtualized on newer hardware the VMM should protect against BHI attacks by
+ * setting BHI_DIS_S for the guests.
+ *
+ * CALLs/RETs are necessary to prevent Loop Stream Detector(LSD) from engaging
+ * and not clearing the branch history. The call tree looks like:
+ *
+ * call 1
+ * call 2
+ * call 2
+ * call 2
+ * call 2
+ * call 2
+ * ret
+ * ret
+ * ret
+ * ret
+ * ret
+ * ret
+ *
+ * This means that the stack is non-constant and ORC can't unwind it with %rsp
+ * alone. Therefore we unconditionally set up the frame pointer, which allows
+ * ORC to unwind properly.
+ *
+ * The alignment is for performance and not for safety, and may be safely
+ * refactored in the future if needed. The .skips are for safety, to ensure
+ * that all RETs are in the second half of a cacheline to mitigate Indirect
+ * Target Selection, rather than taking the slowpath via its_return_thunk.
+ */
+SYM_FUNC_START(clear_bhb_loop)
+ ANNOTATE_NOENDBR
+ push %rbp
+ mov %rsp, %rbp
+ movl $5, %ecx
+ ANNOTATE_INTRA_FUNCTION_CALL
+ call 1f
+ jmp 5f
+ .align 64, 0xcc
+ /*
+ * Shift instructions so that the RET is in the upper half of the
+ * cacheline and don't take the slowpath to its_return_thunk.
+ */
+ .skip 32 - (.Lret1 - 1f), 0xcc
+ ANNOTATE_INTRA_FUNCTION_CALL
+1: call 2f
+.Lret1: RET
+ .align 64, 0xcc
+ /*
+ * As above shift instructions for RET at .Lret2 as well.
+ *
+ * This should be ideally be: .skip 32 - (.Lret2 - 2f), 0xcc
+ * but some Clang versions (e.g. 18) don't like this.
+ */
+ .skip 32 - 18, 0xcc
+2: movl $5, %eax
+3: jmp 4f
+ nop
+4: sub $1, %eax
+ jnz 3b
+ sub $1, %ecx
+ jnz 1b
+.Lret2: RET
+5: lfence
+ pop %rbp
+ RET
+SYM_FUNC_END(clear_bhb_loop)
+EXPORT_SYMBOL_FOR_KVM(clear_bhb_loop)
+STACK_FRAME_NON_STANDARD(clear_bhb_loop)
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 70150298f8bd..a45e1125fc6c 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -7,7 +7,6 @@
#include <asm/asm-offsets.h>
#include <asm/current.h>
#include <asm/errno.h>
-#include <asm/ia32_unistd.h>
#include <asm/thread_info.h>
#include <asm/segment.h>
#include <asm/irqflags.h>
@@ -58,7 +57,7 @@ SYM_CODE_START(entry_SYSENTER_compat)
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
popq %rax
- movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/* Construct struct pt_regs on stack */
pushq $__USER_DS /* pt_regs->ss */
@@ -90,9 +89,6 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL)
cld
- IBRS_ENTER
- UNTRAIN_RET
-
/*
* SYSENTER doesn't filter flags, so we need to clear NT and AC
* ourselves. To save a few cycles, we can check whether
@@ -116,11 +112,18 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL)
jnz .Lsysenter_fix_flags
.Lsysenter_flags_fixed:
+ /*
+ * CPU bugs mitigations mechanisms can call other functions. They
+ * should be invoked after making sure TF is cleared because
+ * single-step is ignored only for instructions inside the
+ * entry_SYSENTER_compat function.
+ */
+ IBRS_ENTER
+ UNTRAIN_RET
+ CLEAR_BRANCH_HISTORY
+
movq %rsp, %rdi
call do_SYSENTER_32
- /* XEN PV guests always use IRET path */
- ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
- "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
jmp sysret32_from_system_call
.Lsysenter_fix_flags:
@@ -190,7 +193,7 @@ SYM_CODE_START(entry_SYSCALL_compat)
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
/* Switch to the kernel stack */
- movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
@@ -209,16 +212,19 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL)
IBRS_ENTER
UNTRAIN_RET
+ CLEAR_BRANCH_HISTORY
movq %rsp, %rdi
call do_fast_syscall_32
+
+sysret32_from_system_call:
/* XEN PV guests always use IRET path */
- ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
+ ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
"jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
- /* Opportunistic SYSRET */
-sysret32_from_system_call:
/*
+ * Opportunistic SYSRET
+ *
* We are not going to return to userspace from the trampoline
* stack. So let's erase the thread stack right now.
*/
@@ -271,6 +277,7 @@ SYM_INNER_LABEL(entry_SYSRETL_compat_unsafe_stack, SYM_L_GLOBAL)
xorl %r9d, %r9d
xorl %r10d, %r10d
swapgs
+ CLEAR_CPU_BUFFERS
sysretl
SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
@@ -278,78 +285,15 @@ SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL)
SYM_CODE_END(entry_SYSCALL_compat)
/*
- * 32-bit legacy system call entry.
- *
- * 32-bit x86 Linux system calls traditionally used the INT $0x80
- * instruction. INT $0x80 lands here.
- *
- * This entry point can be used by 32-bit and 64-bit programs to perform
- * 32-bit system calls. Instances of INT $0x80 can be found inline in
- * various programs and libraries. It is also used by the vDSO's
- * __kernel_vsyscall fallback for hardware that doesn't support a faster
- * entry method. Restarted 32-bit system calls also fall back to INT
- * $0x80 regardless of what instruction was originally used to do the
- * system call.
- *
- * This is considered a slow path. It is not used by most libc
- * implementations on modern hardware except during process startup.
- *
- * Arguments:
- * eax system call number
- * ebx arg1
- * ecx arg2
- * edx arg3
- * esi arg4
- * edi arg5
- * ebp arg6
+ * int 0x80 is used by 32 bit mode as a system call entry. Normally idt entries
+ * point to C routines, however since this is a system call interface the branch
+ * history needs to be scrubbed to protect against BHI attacks, and that
+ * scrubbing needs to take place in assembly code prior to entering any C
+ * routines.
*/
-SYM_CODE_START(entry_INT80_compat)
- UNWIND_HINT_ENTRY
- ENDBR
- /*
- * Interrupts are off on entry.
- */
- ASM_CLAC /* Do this early to minimize exposure */
- ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV
-
- /*
- * User tracing code (ptrace or signal handlers) might assume that
- * the saved RAX contains a 32-bit number when we're invoking a 32-bit
- * syscall. Just in case the high bits are nonzero, zero-extend
- * the syscall number. (This could almost certainly be deleted
- * with no ill effects.)
- */
- movl %eax, %eax
-
- /* switch to thread stack expects orig_ax and rdi to be pushed */
- pushq %rax /* pt_regs->orig_ax */
-
- /* Need to switch before accessing the thread stack. */
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
-
- /* In the Xen PV case we already run on the thread stack. */
- ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV
-
- movq %rsp, %rax
- movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
-
- pushq 5*8(%rax) /* regs->ss */
- pushq 4*8(%rax) /* regs->rsp */
- pushq 3*8(%rax) /* regs->eflags */
- pushq 2*8(%rax) /* regs->cs */
- pushq 1*8(%rax) /* regs->ip */
- pushq 0*8(%rax) /* regs->orig_ax */
-.Lint80_keep_stack:
-
- PUSH_AND_CLEAR_REGS rax=$-ENOSYS
- UNWIND_HINT_REGS
-
- cld
-
- IBRS_ENTER
- UNTRAIN_RET
-
- movq %rsp, %rdi
- call do_int80_syscall_32
- jmp swapgs_restore_regs_and_return_to_usermode
-SYM_CODE_END(entry_INT80_compat)
+SYM_CODE_START(int80_emulation)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ CLEAR_BRANCH_HISTORY
+ jmp do_int80_emulation
+SYM_CODE_END(int80_emulation)
diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S
new file mode 100644
index 000000000000..894f7f16eb80
--- /dev/null
+++ b/arch/x86/entry/entry_64_fred.S
@@ -0,0 +1,151 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * The actual FRED entry points.
+ */
+
+#include <linux/export.h>
+#include <linux/kvm_types.h>
+
+#include <asm/asm.h>
+#include <asm/fred.h>
+#include <asm/segment.h>
+
+#include "calling.h"
+
+ .code64
+ .section .noinstr.text, "ax"
+
+.macro FRED_ENTER
+ UNWIND_HINT_END_OF_STACK
+ ANNOTATE_NOENDBR
+ PUSH_AND_CLEAR_REGS
+ movq %rsp, %rdi /* %rdi -> pt_regs */
+.endm
+
+.macro FRED_EXIT
+ UNWIND_HINT_REGS
+ POP_REGS
+.endm
+
+/*
+ * The new RIP value that FRED event delivery establishes is
+ * IA32_FRED_CONFIG & ~FFFH for events that occur in ring 3.
+ * Thus the FRED ring 3 entry point must be 4K page aligned.
+ */
+ .align 4096
+
+SYM_CODE_START_NOALIGN(asm_fred_entrypoint_user)
+ FRED_ENTER
+ call fred_entry_from_user
+SYM_INNER_LABEL(asm_fred_exit_user, SYM_L_GLOBAL)
+ FRED_EXIT
+1: ERETU
+
+ _ASM_EXTABLE_TYPE(1b, asm_fred_entrypoint_user, EX_TYPE_ERETU)
+SYM_CODE_END(asm_fred_entrypoint_user)
+
+/*
+ * The new RIP value that FRED event delivery establishes is
+ * (IA32_FRED_CONFIG & ~FFFH) + 256 for events that occur in
+ * ring 0, i.e., asm_fred_entrypoint_user + 256.
+ */
+ .org asm_fred_entrypoint_user + 256, 0xcc
+SYM_CODE_START_NOALIGN(asm_fred_entrypoint_kernel)
+ FRED_ENTER
+ call fred_entry_from_kernel
+ FRED_EXIT
+ ERETS
+SYM_CODE_END(asm_fred_entrypoint_kernel)
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+SYM_FUNC_START(asm_fred_entry_from_kvm)
+ ANNOTATE_NOENDBR
+ push %rbp
+ mov %rsp, %rbp
+
+ UNWIND_HINT_SAVE
+
+ /*
+ * Both IRQ and NMI from VMX can be handled on current task stack
+ * because there is no need to protect from reentrancy and the call
+ * stack leading to this helper is effectively constant and shallow
+ * (relatively speaking). Do the same when FRED is active, i.e., no
+ * need to check current stack level for a stack switch.
+ *
+ * Emulate the FRED-defined redzone and stack alignment.
+ */
+ sub $(FRED_CONFIG_REDZONE_AMOUNT << 6), %rsp
+ and $FRED_STACK_FRAME_RSP_MASK, %rsp
+
+ /*
+ * Start to push a FRED stack frame, which is always 64 bytes:
+ *
+ * +--------+-----------------+
+ * | Bytes | Usage |
+ * +--------+-----------------+
+ * | 63:56 | Reserved |
+ * | 55:48 | Event Data |
+ * | 47:40 | SS + Event Info |
+ * | 39:32 | RSP |
+ * | 31:24 | RFLAGS |
+ * | 23:16 | CS + Aux Info |
+ * | 15:8 | RIP |
+ * | 7:0 | Error Code |
+ * +--------+-----------------+
+ */
+ push $0 /* Reserved, must be 0 */
+ push $0 /* Event data, 0 for IRQ/NMI */
+ push %rdi /* fred_ss handed in by the caller */
+ push %rbp
+ pushf
+ push $__KERNEL_CS
+
+ /*
+ * Unlike the IDT event delivery, FRED _always_ pushes an error code
+ * after pushing the return RIP, thus the CALL instruction CANNOT be
+ * used here to push the return RIP, otherwise there is no chance to
+ * push an error code before invoking the IRQ/NMI handler.
+ *
+ * Use LEA to get the return RIP and push it, then push an error code.
+ */
+ lea 1f(%rip), %rax
+ push %rax /* Return RIP */
+ push $0 /* Error code, 0 for IRQ/NMI */
+
+ PUSH_AND_CLEAR_REGS clear_callee=0 unwind_hint=0
+
+ movq %rsp, %rdi /* %rdi -> pt_regs */
+ /*
+ * At this point: {rdi, rsi, rdx, rcx, r8, r9}, {r10, r11}, {rax, rdx}
+ * are clobbered, which corresponds to: arguments, extra caller-saved
+ * and return. All registers a C function is allowed to clobber.
+ *
+ * Notably, the callee-saved registers: {rbx, r12, r13, r14, r15}
+ * are untouched, with the exception of rbp, which carries the stack
+ * frame and will be restored before exit.
+ *
+ * Further calling another C function will not alter this state.
+ */
+ call __fred_entry_from_kvm /* Call the C entry point */
+
+ /*
+ * When FRED, use ERETS to potentially clear NMIs, otherwise simply
+ * restore the stack pointer.
+ */
+ ALTERNATIVE "nop; nop; mov %rbp, %rsp", \
+ __stringify(add $C_PTREGS_SIZE, %rsp; ERETS), \
+ X86_FEATURE_FRED
+
+1: /*
+ * Objtool doesn't understand ERETS, and the cfi register state is
+ * different from initial_func_cfi due to PUSH_REGS. Tell it the state
+ * is similar to where UNWIND_HINT_SAVE is.
+ */
+ UNWIND_HINT_RESTORE
+
+ pop %rbp
+ RET
+
+SYM_FUNC_END(asm_fred_entry_from_kvm)
+EXPORT_SYMBOL_FOR_KVM(asm_fred_entry_from_kvm);
+#endif
diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c
new file mode 100644
index 000000000000..94e626cc6a07
--- /dev/null
+++ b/arch/x86/entry/entry_fred.c
@@ -0,0 +1,296 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * The FRED specific kernel/user entry functions which are invoked from
+ * assembly code and dispatch to the associated handlers.
+ */
+#include <linux/kernel.h>
+#include <linux/kdebug.h>
+#include <linux/nospec.h>
+
+#include <asm/desc.h>
+#include <asm/fred.h>
+#include <asm/idtentry.h>
+#include <asm/syscall.h>
+#include <asm/trapnr.h>
+#include <asm/traps.h>
+
+/* FRED EVENT_TYPE_OTHER vector numbers */
+#define FRED_SYSCALL 1
+#define FRED_SYSENTER 2
+
+static noinstr void fred_bad_type(struct pt_regs *regs, unsigned long error_code)
+{
+ irqentry_state_t irq_state = irqentry_nmi_enter(regs);
+
+ instrumentation_begin();
+
+ /* Panic on events from a high stack level */
+ if (regs->fred_cs.sl > 0) {
+ pr_emerg("PANIC: invalid or fatal FRED event; event type %u "
+ "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n",
+ regs->fred_ss.type, regs->fred_ss.vector, error_code,
+ fred_event_data(regs), regs->cs, regs->ip);
+ die("invalid or fatal FRED event", regs, error_code);
+ panic("invalid or fatal FRED event");
+ } else {
+ unsigned long flags = oops_begin();
+ int sig = SIGKILL;
+
+ pr_alert("BUG: invalid or fatal FRED event; event type %u "
+ "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n",
+ regs->fred_ss.type, regs->fred_ss.vector, error_code,
+ fred_event_data(regs), regs->cs, regs->ip);
+
+ if (__die("Invalid or fatal FRED event", regs, error_code))
+ sig = 0;
+
+ oops_end(flags, regs, sig);
+ }
+
+ instrumentation_end();
+ irqentry_nmi_exit(regs, irq_state);
+}
+
+static noinstr void fred_intx(struct pt_regs *regs)
+{
+ switch (regs->fred_ss.vector) {
+ /* Opcode 0xcd, 0x3, NOT INT3 (opcode 0xcc) */
+ case X86_TRAP_BP:
+ return exc_int3(regs);
+
+ /* Opcode 0xcd, 0x4, NOT INTO (opcode 0xce) */
+ case X86_TRAP_OF:
+ return exc_overflow(regs);
+
+#ifdef CONFIG_IA32_EMULATION
+ /* INT80 */
+ case IA32_SYSCALL_VECTOR:
+ if (ia32_enabled())
+ return fred_int80_emulation(regs);
+ fallthrough;
+#endif
+
+ default:
+ return exc_general_protection(regs, 0);
+ }
+}
+
+static __always_inline void fred_other(struct pt_regs *regs)
+{
+ /* The compiler can fold these conditions into a single test */
+ if (likely(regs->fred_ss.vector == FRED_SYSCALL && regs->fred_ss.l)) {
+ regs->orig_ax = regs->ax;
+ regs->ax = -ENOSYS;
+ do_syscall_64(regs, regs->orig_ax);
+ return;
+ } else if (ia32_enabled() &&
+ likely(regs->fred_ss.vector == FRED_SYSENTER && !regs->fred_ss.l)) {
+ regs->orig_ax = regs->ax;
+ regs->ax = -ENOSYS;
+ do_fast_syscall_32(regs);
+ return;
+ } else {
+ exc_invalid_op(regs);
+ return;
+ }
+}
+
+#define SYSVEC(_vector, _function) [_vector - FIRST_SYSTEM_VECTOR] = fred_sysvec_##_function
+
+static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = {
+ SYSVEC(ERROR_APIC_VECTOR, error_interrupt),
+ SYSVEC(SPURIOUS_APIC_VECTOR, spurious_apic_interrupt),
+ SYSVEC(LOCAL_TIMER_VECTOR, apic_timer_interrupt),
+ SYSVEC(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi),
+
+ SYSVEC(RESCHEDULE_VECTOR, reschedule_ipi),
+ SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, call_function_single),
+ SYSVEC(CALL_FUNCTION_VECTOR, call_function),
+ SYSVEC(REBOOT_VECTOR, reboot),
+
+ SYSVEC(THRESHOLD_APIC_VECTOR, threshold),
+ SYSVEC(DEFERRED_ERROR_VECTOR, deferred_error),
+ SYSVEC(THERMAL_APIC_VECTOR, thermal),
+
+ SYSVEC(IRQ_WORK_VECTOR, irq_work),
+
+ SYSVEC(POSTED_INTR_VECTOR, kvm_posted_intr_ipi),
+ SYSVEC(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi),
+ SYSVEC(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi),
+
+ SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, posted_msi_notification),
+};
+
+static bool fred_setup_done __initdata;
+
+void __init fred_install_sysvec(unsigned int sysvec, idtentry_t handler)
+{
+ if (WARN_ON_ONCE(sysvec < FIRST_SYSTEM_VECTOR))
+ return;
+
+ if (WARN_ON_ONCE(fred_setup_done))
+ return;
+
+ if (!WARN_ON_ONCE(sysvec_table[sysvec - FIRST_SYSTEM_VECTOR]))
+ sysvec_table[sysvec - FIRST_SYSTEM_VECTOR] = handler;
+}
+
+static noinstr void fred_handle_spurious_interrupt(struct pt_regs *regs)
+{
+ spurious_interrupt(regs, regs->fred_ss.vector);
+}
+
+void __init fred_complete_exception_setup(void)
+{
+ unsigned int vector;
+
+ for (vector = 0; vector < FIRST_EXTERNAL_VECTOR; vector++)
+ set_bit(vector, system_vectors);
+
+ for (vector = 0; vector < NR_SYSTEM_VECTORS; vector++) {
+ if (sysvec_table[vector])
+ set_bit(vector + FIRST_SYSTEM_VECTOR, system_vectors);
+ else
+ sysvec_table[vector] = fred_handle_spurious_interrupt;
+ }
+ fred_setup_done = true;
+}
+
+static noinstr void fred_extint(struct pt_regs *regs)
+{
+ unsigned int vector = regs->fred_ss.vector;
+ unsigned int index = array_index_nospec(vector - FIRST_SYSTEM_VECTOR,
+ NR_SYSTEM_VECTORS);
+
+ if (WARN_ON_ONCE(vector < FIRST_EXTERNAL_VECTOR))
+ return;
+
+ if (likely(vector >= FIRST_SYSTEM_VECTOR)) {
+ irqentry_state_t state = irqentry_enter(regs);
+
+ instrumentation_begin();
+ sysvec_table[index](regs);
+ instrumentation_end();
+ irqentry_exit(regs, state);
+ } else {
+ common_interrupt(regs, vector);
+ }
+}
+
+static noinstr void fred_hwexc(struct pt_regs *regs, unsigned long error_code)
+{
+ /* Optimize for #PF. That's the only exception which matters performance wise */
+ if (likely(regs->fred_ss.vector == X86_TRAP_PF))
+ return exc_page_fault(regs, error_code);
+
+ switch (regs->fred_ss.vector) {
+ case X86_TRAP_DE: return exc_divide_error(regs);
+ case X86_TRAP_DB: return fred_exc_debug(regs);
+ case X86_TRAP_BR: return exc_bounds(regs);
+ case X86_TRAP_UD: return exc_invalid_op(regs);
+ case X86_TRAP_NM: return exc_device_not_available(regs);
+ case X86_TRAP_DF: return exc_double_fault(regs, error_code);
+ case X86_TRAP_TS: return exc_invalid_tss(regs, error_code);
+ case X86_TRAP_NP: return exc_segment_not_present(regs, error_code);
+ case X86_TRAP_SS: return exc_stack_segment(regs, error_code);
+ case X86_TRAP_GP: return exc_general_protection(regs, error_code);
+ case X86_TRAP_MF: return exc_coprocessor_error(regs);
+ case X86_TRAP_AC: return exc_alignment_check(regs, error_code);
+ case X86_TRAP_XF: return exc_simd_coprocessor_error(regs);
+
+#ifdef CONFIG_X86_MCE
+ case X86_TRAP_MC: return fred_exc_machine_check(regs);
+#endif
+#ifdef CONFIG_INTEL_TDX_GUEST
+ case X86_TRAP_VE: return exc_virtualization_exception(regs);
+#endif
+#ifdef CONFIG_X86_CET
+ case X86_TRAP_CP: return exc_control_protection(regs, error_code);
+#endif
+ default: return fred_bad_type(regs, error_code);
+ }
+
+}
+
+static noinstr void fred_swexc(struct pt_regs *regs, unsigned long error_code)
+{
+ switch (regs->fred_ss.vector) {
+ case X86_TRAP_BP: return exc_int3(regs);
+ case X86_TRAP_OF: return exc_overflow(regs);
+ default: return fred_bad_type(regs, error_code);
+ }
+}
+
+__visible noinstr void fred_entry_from_user(struct pt_regs *regs)
+{
+ unsigned long error_code = regs->orig_ax;
+
+ /* Invalidate orig_ax so that syscall_get_nr() works correctly */
+ regs->orig_ax = -1;
+
+ switch (regs->fred_ss.type) {
+ case EVENT_TYPE_EXTINT:
+ return fred_extint(regs);
+ case EVENT_TYPE_NMI:
+ if (likely(regs->fred_ss.vector == X86_TRAP_NMI))
+ return fred_exc_nmi(regs);
+ break;
+ case EVENT_TYPE_HWEXC:
+ return fred_hwexc(regs, error_code);
+ case EVENT_TYPE_SWINT:
+ return fred_intx(regs);
+ case EVENT_TYPE_PRIV_SWEXC:
+ if (likely(regs->fred_ss.vector == X86_TRAP_DB))
+ return fred_exc_debug(regs);
+ break;
+ case EVENT_TYPE_SWEXC:
+ return fred_swexc(regs, error_code);
+ case EVENT_TYPE_OTHER:
+ return fred_other(regs);
+ default: break;
+ }
+
+ return fred_bad_type(regs, error_code);
+}
+
+__visible noinstr void fred_entry_from_kernel(struct pt_regs *regs)
+{
+ unsigned long error_code = regs->orig_ax;
+
+ /* Invalidate orig_ax so that syscall_get_nr() works correctly */
+ regs->orig_ax = -1;
+
+ switch (regs->fred_ss.type) {
+ case EVENT_TYPE_EXTINT:
+ return fred_extint(regs);
+ case EVENT_TYPE_NMI:
+ if (likely(regs->fred_ss.vector == X86_TRAP_NMI))
+ return fred_exc_nmi(regs);
+ break;
+ case EVENT_TYPE_HWEXC:
+ return fred_hwexc(regs, error_code);
+ case EVENT_TYPE_PRIV_SWEXC:
+ if (likely(regs->fred_ss.vector == X86_TRAP_DB))
+ return fred_exc_debug(regs);
+ break;
+ case EVENT_TYPE_SWEXC:
+ return fred_swexc(regs, error_code);
+ default: break;
+ }
+
+ return fred_bad_type(regs, error_code);
+}
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+__visible noinstr void __fred_entry_from_kvm(struct pt_regs *regs)
+{
+ switch (regs->fred_ss.type) {
+ case EVENT_TYPE_EXTINT:
+ return fred_extint(regs);
+ case EVENT_TYPE_NMI:
+ return fred_exc_nmi(regs);
+ default:
+ WARN_ON_ONCE(1);
+ }
+}
+#endif
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
index 8cfc9bc73e7f..a67a644d0cfe 100644
--- a/arch/x86/entry/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -1,10 +1,16 @@
-// SPDX-License-Identifier: GPL-2.0
-/* System call table for i386. */
+// SPDX-License-Identifier: GPL-2.0-only
+/* 32-bit system call dispatch */
#include <linux/linkage.h>
#include <linux/sys.h>
#include <linux/cache.h>
#include <linux/syscalls.h>
+#include <linux/entry-common.h>
+#include <linux/nospec.h>
+#include <linux/uaccess.h>
+#include <asm/apic.h>
+#include <asm/traps.h>
+#include <asm/cpufeature.h>
#include <asm/syscall.h>
#ifdef CONFIG_IA32_EMULATION
@@ -14,12 +20,352 @@
#endif
#define __SYSCALL(nr, sym) extern long __ia32_##sym(const struct pt_regs *);
-
+#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __ia32_##sym(const struct pt_regs *);
#include <asm/syscalls_32.h>
-#undef __SYSCALL
+#undef __SYSCALL
-#define __SYSCALL(nr, sym) __ia32_##sym,
+#undef __SYSCALL_NORETURN
+#define __SYSCALL_NORETURN __SYSCALL
-__visible const sys_call_ptr_t ia32_sys_call_table[] = {
+/*
+ * The sys_call_table[] is no longer used for system calls, but
+ * kernel/trace/trace_syscalls.c still wants to know the system
+ * call address.
+ */
+#ifdef CONFIG_X86_32
+#define __SYSCALL(nr, sym) __ia32_##sym,
+const sys_call_ptr_t sys_call_table[] = {
#include <asm/syscalls_32.h>
};
+#undef __SYSCALL
+#endif
+
+#define __SYSCALL(nr, sym) case nr: return __ia32_##sym(regs);
+long ia32_sys_call(const struct pt_regs *regs, unsigned int nr)
+{
+ switch (nr) {
+ #include <asm/syscalls_32.h>
+ default: return __ia32_sys_ni_syscall(regs);
+ }
+}
+
+static __always_inline int syscall_32_enter(struct pt_regs *regs)
+{
+ if (IS_ENABLED(CONFIG_IA32_EMULATION))
+ current_thread_info()->status |= TS_COMPAT;
+
+ return (int)regs->orig_ax;
+}
+
+#ifdef CONFIG_IA32_EMULATION
+bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED);
+
+static int __init ia32_emulation_override_cmdline(char *arg)
+{
+ return kstrtobool(arg, &__ia32_enabled);
+}
+early_param("ia32_emulation", ia32_emulation_override_cmdline);
+#endif
+
+/*
+ * Invoke a 32-bit syscall. Called with IRQs on in CT_STATE_KERNEL.
+ */
+static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
+{
+ /*
+ * Convert negative numbers to very high and thus out of range
+ * numbers for comparisons.
+ */
+ unsigned int unr = nr;
+
+ if (likely(unr < IA32_NR_syscalls)) {
+ unr = array_index_nospec(unr, IA32_NR_syscalls);
+ regs->ax = ia32_sys_call(regs, unr);
+ } else if (nr != -1) {
+ regs->ax = __ia32_sys_ni_syscall(regs);
+ }
+}
+
+#ifdef CONFIG_IA32_EMULATION
+static __always_inline bool int80_is_external(void)
+{
+ const unsigned int offs = (0x80 / 32) * 0x10;
+ const u32 bit = BIT(0x80 % 32);
+
+ /* The local APIC on XENPV guests is fake */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /*
+ * If vector 0x80 is set in the APIC ISR then this is an external
+ * interrupt. Either from broken hardware or injected by a VMM.
+ *
+ * Note: In guest mode this is only valid for secure guests where
+ * the secure module fully controls the vAPIC exposed to the guest.
+ */
+ return apic_read(APIC_ISR + offs) & bit;
+}
+
+/**
+ * do_int80_emulation - 32-bit legacy syscall C entry from asm
+ * @regs: syscall arguments in struct pt_args on the stack.
+ *
+ * This entry point can be used by 32-bit and 64-bit programs to perform
+ * 32-bit system calls. Instances of INT $0x80 can be found inline in
+ * various programs and libraries. It is also used by the vDSO's
+ * __kernel_vsyscall fallback for hardware that doesn't support a faster
+ * entry method. Restarted 32-bit system calls also fall back to INT
+ * $0x80 regardless of what instruction was originally used to do the
+ * system call.
+ *
+ * This is considered a slow path. It is not used by most libc
+ * implementations on modern hardware except during process startup.
+ *
+ * The arguments for the INT $0x80 based syscall are on stack in the
+ * pt_regs structure:
+ * eax: system call number
+ * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6
+ */
+__visible noinstr void do_int80_emulation(struct pt_regs *regs)
+{
+ int nr;
+
+ /* Kernel does not use INT $0x80! */
+ if (unlikely(!user_mode(regs))) {
+ irqentry_enter(regs);
+ instrumentation_begin();
+ panic("Unexpected external interrupt 0x80\n");
+ }
+
+ /*
+ * Establish kernel context for instrumentation, including for
+ * int80_is_external() below which calls into the APIC driver.
+ * Identical for soft and external interrupts.
+ */
+ enter_from_user_mode(regs);
+
+ instrumentation_begin();
+ add_random_kstack_offset();
+
+ /* Validate that this is a soft interrupt to the extent possible */
+ if (unlikely(int80_is_external()))
+ panic("Unexpected external interrupt 0x80\n");
+
+ /*
+ * The low level idtentry code pushed -1 into regs::orig_ax
+ * and regs::ax contains the syscall number.
+ *
+ * User tracing code (ptrace or signal handlers) might assume
+ * that the regs::orig_ax contains a 32-bit number on invoking
+ * a 32-bit syscall.
+ *
+ * Establish the syscall convention by saving the 32bit truncated
+ * syscall number in regs::orig_ax and by invalidating regs::ax.
+ */
+ regs->orig_ax = regs->ax & GENMASK(31, 0);
+ regs->ax = -ENOSYS;
+
+ nr = syscall_32_enter(regs);
+
+ local_irq_enable();
+ nr = syscall_enter_from_user_mode_work(regs, nr);
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+}
+
+#ifdef CONFIG_X86_FRED
+/*
+ * A FRED-specific INT80 handler is warranted for the follwing reasons:
+ *
+ * 1) As INT instructions and hardware interrupts are separate event
+ * types, FRED does not preclude the use of vector 0x80 for external
+ * interrupts. As a result, the FRED setup code does not reserve
+ * vector 0x80 and calling int80_is_external() is not merely
+ * suboptimal but actively incorrect: it could cause a system call
+ * to be incorrectly ignored.
+ *
+ * 2) It is called only for handling vector 0x80 of event type
+ * EVENT_TYPE_SWINT and will never be called to handle any external
+ * interrupt (event type EVENT_TYPE_EXTINT).
+ *
+ * 3) FRED has separate entry flows depending on if the event came from
+ * user space or kernel space, and because the kernel does not use
+ * INT insns, the FRED kernel entry handler fred_entry_from_kernel()
+ * falls through to fred_bad_type() if the event type is
+ * EVENT_TYPE_SWINT, i.e., INT insns. So if the kernel is handling
+ * an INT insn, it can only be from a user level.
+ *
+ * 4) int80_emulation() does a CLEAR_BRANCH_HISTORY. While FRED will
+ * likely take a different approach if it is ever needed: it
+ * probably belongs in either fred_intx()/ fred_other() or
+ * asm_fred_entrypoint_user(), depending on if this ought to be done
+ * for all entries from userspace or only system
+ * calls.
+ *
+ * 5) INT $0x80 is the fast path for 32-bit system calls under FRED.
+ */
+DEFINE_FREDENTRY_RAW(int80_emulation)
+{
+ int nr;
+
+ enter_from_user_mode(regs);
+
+ instrumentation_begin();
+ add_random_kstack_offset();
+
+ /*
+ * FRED pushed 0 into regs::orig_ax and regs::ax contains the
+ * syscall number.
+ *
+ * User tracing code (ptrace or signal handlers) might assume
+ * that the regs::orig_ax contains a 32-bit number on invoking
+ * a 32-bit syscall.
+ *
+ * Establish the syscall convention by saving the 32bit truncated
+ * syscall number in regs::orig_ax and by invalidating regs::ax.
+ */
+ regs->orig_ax = regs->ax & GENMASK(31, 0);
+ regs->ax = -ENOSYS;
+
+ nr = syscall_32_enter(regs);
+
+ local_irq_enable();
+ nr = syscall_enter_from_user_mode_work(regs, nr);
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+}
+#endif /* CONFIG_X86_FRED */
+
+#else /* CONFIG_IA32_EMULATION */
+
+/* Handles int $0x80 on a 32bit kernel */
+__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
+{
+ int nr = syscall_32_enter(regs);
+
+ add_random_kstack_offset();
+ /*
+ * Subtlety here: if ptrace pokes something larger than 2^31-1 into
+ * orig_ax, the int return value truncates it. This matches
+ * the semantics of syscall_get_nr().
+ */
+ nr = syscall_enter_from_user_mode(regs, nr);
+ instrumentation_begin();
+
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+}
+#endif /* !CONFIG_IA32_EMULATION */
+
+static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
+{
+ int nr = syscall_32_enter(regs);
+ int res;
+
+ add_random_kstack_offset();
+ /*
+ * This cannot use syscall_enter_from_user_mode() as it has to
+ * fetch EBP before invoking any of the syscall entry work
+ * functions.
+ */
+ enter_from_user_mode(regs);
+
+ instrumentation_begin();
+ local_irq_enable();
+ /* Fetch EBP from where the vDSO stashed it. */
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ /*
+ * Micro-optimization: the pointer we're following is
+ * explicitly 32 bits, so it can't be out of range.
+ */
+ res = __get_user(*(u32 *)&regs->bp,
+ (u32 __user __force *)(unsigned long)(u32)regs->sp);
+ } else {
+ res = get_user(*(u32 *)&regs->bp,
+ (u32 __user __force *)(unsigned long)(u32)regs->sp);
+ }
+
+ if (res) {
+ /* User code screwed up. */
+ regs->ax = -EFAULT;
+
+ local_irq_disable();
+ instrumentation_end();
+ irqentry_exit_to_user_mode(regs);
+ return false;
+ }
+
+ nr = syscall_enter_from_user_mode_work(regs, nr);
+
+ /* Now this is just like a normal syscall. */
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+ return true;
+}
+
+/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
+__visible noinstr bool do_fast_syscall_32(struct pt_regs *regs)
+{
+ /*
+ * Called using the internal vDSO SYSENTER/SYSCALL32 calling
+ * convention. Adjust regs so it looks like we entered using int80.
+ */
+ unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
+ vdso_image_32.sym_int80_landing_pad;
+
+ /*
+ * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
+ * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
+ * Fix it up.
+ */
+ regs->ip = landing_pad;
+
+ /* Invoke the syscall. If it failed, keep it simple: use IRET. */
+ if (!__do_fast_syscall_32(regs))
+ return false;
+
+ /*
+ * Check that the register state is valid for using SYSRETL/SYSEXIT
+ * to exit to userspace. Otherwise use the slower but fully capable
+ * IRET exit path.
+ */
+
+ /* XEN PV guests always use the IRET path */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /* EIP must point to the VDSO landing pad */
+ if (unlikely(regs->ip != landing_pad))
+ return false;
+
+ /* CS and SS must match the values set in MSR_STAR */
+ if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS))
+ return false;
+
+ /* If the TF, RF, or VM flags are set, use IRET */
+ if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)))
+ return false;
+
+ /* Use SYSRETL/SYSEXIT to exit to userspace */
+ return true;
+}
+
+/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
+__visible noinstr bool do_SYSENTER_32(struct pt_regs *regs)
+{
+ /* SYSENTER loses RSP, but the vDSO saved it in RBP. */
+ regs->sp = regs->bp;
+
+ /* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */
+ regs->flags |= X86_EFLAGS_IF;
+
+ return do_fast_syscall_32(regs);
+}
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index be120eec1fc9..b6e68ea98b83 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -1,18 +1,141 @@
-// SPDX-License-Identifier: GPL-2.0
-/* System call table for x86-64. */
+// SPDX-License-Identifier: GPL-2.0-only
+/* 64-bit system call dispatch */
#include <linux/linkage.h>
#include <linux/sys.h>
#include <linux/cache.h>
#include <linux/syscalls.h>
+#include <linux/entry-common.h>
+#include <linux/nospec.h>
#include <asm/syscall.h>
#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
+#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *);
#include <asm/syscalls_64.h>
-#undef __SYSCALL
+#ifdef CONFIG_X86_X32_ABI
+#include <asm/syscalls_x32.h>
+#endif
+#undef __SYSCALL
-#define __SYSCALL(nr, sym) __x64_##sym,
+#undef __SYSCALL_NORETURN
+#define __SYSCALL_NORETURN __SYSCALL
-asmlinkage const sys_call_ptr_t sys_call_table[] = {
+/*
+ * The sys_call_table[] is no longer used for system calls, but
+ * kernel/trace/trace_syscalls.c still wants to know the system
+ * call address.
+ */
+#define __SYSCALL(nr, sym) __x64_##sym,
+const sys_call_ptr_t sys_call_table[] = {
#include <asm/syscalls_64.h>
};
+#undef __SYSCALL
+
+#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
+long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
+{
+ switch (nr) {
+ #include <asm/syscalls_64.h>
+ default: return __x64_sys_ni_syscall(regs);
+ }
+}
+
+#ifdef CONFIG_X86_X32_ABI
+long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
+{
+ switch (nr) {
+ #include <asm/syscalls_x32.h>
+ default: return __x64_sys_ni_syscall(regs);
+ }
+}
+#endif
+
+static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
+{
+ /*
+ * Convert negative numbers to very high and thus out of range
+ * numbers for comparisons.
+ */
+ unsigned int unr = nr;
+
+ if (likely(unr < NR_syscalls)) {
+ unr = array_index_nospec(unr, NR_syscalls);
+ regs->ax = x64_sys_call(regs, unr);
+ return true;
+ }
+ return false;
+}
+
+static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
+{
+ /*
+ * Adjust the starting offset of the table, and convert numbers
+ * < __X32_SYSCALL_BIT to very high and thus out of range
+ * numbers for comparisons.
+ */
+ unsigned int xnr = nr - __X32_SYSCALL_BIT;
+
+ if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
+ xnr = array_index_nospec(xnr, X32_NR_syscalls);
+ regs->ax = x32_sys_call(regs, xnr);
+ return true;
+ }
+ return false;
+}
+
+/* Returns true to return using SYSRET, or false to use IRET */
+__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
+{
+ add_random_kstack_offset();
+ nr = syscall_enter_from_user_mode(regs, nr);
+
+ instrumentation_begin();
+
+ if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
+ /* Invalid system call, but still a system call. */
+ regs->ax = __x64_sys_ni_syscall(regs);
+ }
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+
+ /*
+ * Check that the register state is valid for using SYSRET to exit
+ * to userspace. Otherwise use the slower but fully capable IRET
+ * exit path.
+ */
+
+ /* XEN PV guests always use the IRET path */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /* SYSRET requires RCX == RIP and R11 == EFLAGS */
+ if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
+ return false;
+
+ /* CS and SS must match the values set in MSR_STAR */
+ if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
+ return false;
+
+ /*
+ * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+ * in kernel space. This essentially lets the user take over
+ * the kernel, since userspace controls RSP.
+ *
+ * TASK_SIZE_MAX covers all user-accessible addresses other than
+ * the deprecated vsyscall page.
+ */
+ if (unlikely(regs->ip >= TASK_SIZE_MAX))
+ return false;
+
+ /*
+ * SYSRET cannot restore RF. It can restore TF, but unlike IRET,
+ * restoring TF results in a trap from userspace immediately after
+ * SYSRET.
+ */
+ if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
+ return false;
+
+ /* Use SYSRET to exit to userspace */
+ return true;
+}
diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c
deleted file mode 100644
index bdd0e03a1265..000000000000
--- a/arch/x86/entry/syscall_x32.c
+++ /dev/null
@@ -1,18 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* System call table for x32 ABI. */
-
-#include <linux/linkage.h>
-#include <linux/sys.h>
-#include <linux/cache.h>
-#include <linux/syscalls.h>
-#include <asm/syscall.h>
-
-#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
-#include <asm/syscalls_x32.h>
-#undef __SYSCALL
-
-#define __SYSCALL(nr, sym) __x64_##sym,
-
-asmlinkage const sys_call_ptr_t x32_sys_call_table[] = {
-#include <asm/syscalls_x32.h>
-};
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 2d0b1bd866ea..e979a3eac7a3 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -1,8 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
#
# 32-bit system call numbers and entry vectors
#
# The format is:
-# <number> <abi> <name> <entry point> <compat entry point>
+# <number> <abi> <name> <entry point> [<compat entry point> [noreturn]]
#
# The __ia32_sys and __ia32_compat_sys stubs are created on-the-fly for
# sys_*() system calls and compat_sys_*() compat system calls if
@@ -12,7 +13,7 @@
# The abi is always "i386" for this file.
#
0 i386 restart_syscall sys_restart_syscall
-1 i386 exit sys_exit
+1 i386 exit sys_exit - noreturn
2 i386 fork sys_fork
3 i386 read sys_read
4 i386 write sys_write
@@ -263,8 +264,8 @@
249 i386 io_cancel sys_io_cancel
250 i386 fadvise64 sys_ia32_fadvise64
# 251 is available for reuse (was briefly sys_set_zone_reclaim)
-252 i386 exit_group sys_exit_group
-253 i386 lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie
+252 i386 exit_group sys_exit_group - noreturn
+253 i386 lookup_dcookie
254 i386 epoll_create sys_epoll_create
255 i386 epoll_ctl sys_epoll_ctl
256 i386 epoll_wait sys_epoll_wait
@@ -395,7 +396,7 @@
381 i386 pkey_alloc sys_pkey_alloc
382 i386 pkey_free sys_pkey_free
383 i386 statx sys_statx
-384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl
+384 i386 arch_prctl sys_arch_prctl
385 i386 io_pgetevents sys_io_pgetevents_time32 compat_sys_io_pgetevents
386 i386 rseq sys_rseq
393 i386 semget sys_semget
@@ -420,7 +421,7 @@
412 i386 utimensat_time64 sys_utimensat
413 i386 pselect6_time64 sys_pselect6 compat_sys_pselect6_time64
414 i386 ppoll_time64 sys_ppoll compat_sys_ppoll_time64
-416 i386 io_pgetevents_time64 sys_io_pgetevents
+416 i386 io_pgetevents_time64 sys_io_pgetevents compat_sys_io_pgetevents_time64
417 i386 recvmmsg_time64 sys_recvmmsg compat_sys_recvmmsg_time64
418 i386 mq_timedsend_time64 sys_mq_timedsend
419 i386 mq_timedreceive_time64 sys_mq_timedreceive
@@ -457,3 +458,21 @@
450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node
451 i386 cachestat sys_cachestat
452 i386 fchmodat2 sys_fchmodat2
+453 i386 map_shadow_stack sys_map_shadow_stack
+454 i386 futex_wake sys_futex_wake
+455 i386 futex_wait sys_futex_wait
+456 i386 futex_requeue sys_futex_requeue
+457 i386 statmount sys_statmount
+458 i386 listmount sys_listmount
+459 i386 lsm_get_self_attr sys_lsm_get_self_attr
+460 i386 lsm_set_self_attr sys_lsm_set_self_attr
+461 i386 lsm_list_modules sys_lsm_list_modules
+462 i386 mseal sys_mseal
+463 i386 setxattrat sys_setxattrat
+464 i386 getxattrat sys_getxattrat
+465 i386 listxattrat sys_listxattrat
+466 i386 removexattrat sys_removexattrat
+467 i386 open_tree_attr sys_open_tree_attr
+468 i386 file_getattr sys_file_getattr
+469 i386 file_setattr sys_file_setattr
+470 i386 listns sys_listns
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 814768249eae..8a4ac4841be6 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -1,8 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
#
# 64-bit system call numbers and entry vectors
#
# The format is:
-# <number> <abi> <name> <entry point>
+# <number> <abi> <name> <entry point> [<compat entry point> [noreturn]]
#
# The __x64_sys_*() stubs are created on-the-fly for sys_*() system calls
#
@@ -68,7 +69,7 @@
57 common fork sys_fork
58 common vfork sys_vfork
59 64 execve sys_execve
-60 common exit sys_exit
+60 common exit sys_exit - noreturn
61 common wait4 sys_wait4
62 common kill sys_kill
63 common uname sys_newuname
@@ -220,7 +221,7 @@
209 64 io_submit sys_io_submit
210 common io_cancel sys_io_cancel
211 64 get_thread_area
-212 common lookup_dcookie sys_lookup_dcookie
+212 common lookup_dcookie
213 common epoll_create sys_epoll_create
214 64 epoll_ctl_old
215 64 epoll_wait_old
@@ -239,7 +240,7 @@
228 common clock_gettime sys_clock_gettime
229 common clock_getres sys_clock_getres
230 common clock_nanosleep sys_clock_nanosleep
-231 common exit_group sys_exit_group
+231 common exit_group sys_exit_group - noreturn
232 common epoll_wait sys_epoll_wait
233 common epoll_ctl sys_epoll_ctl
234 common tgkill sys_tgkill
@@ -343,6 +344,8 @@
332 common statx sys_statx
333 common io_pgetevents sys_io_pgetevents
334 common rseq sys_rseq
+335 common uretprobe sys_uretprobe
+336 common uprobe sys_uprobe
# don't use numbers 387 through 423, add new calls after the last
# 'common' entry
424 common pidfd_send_signal sys_pidfd_send_signal
@@ -374,6 +377,24 @@
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2
+453 common map_shadow_stack sys_map_shadow_stack
+454 common futex_wake sys_futex_wake
+455 common futex_wait sys_futex_wait
+456 common futex_requeue sys_futex_requeue
+457 common statmount sys_statmount
+458 common listmount sys_listmount
+459 common lsm_get_self_attr sys_lsm_get_self_attr
+460 common lsm_set_self_attr sys_lsm_set_self_attr
+461 common lsm_list_modules sys_lsm_list_modules
+462 common mseal sys_mseal
+463 common setxattrat sys_setxattrat
+464 common getxattrat sys_getxattrat
+465 common listxattrat sys_listxattrat
+466 common removexattrat sys_removexattrat
+467 common open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr
+470 common listns sys_listns
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk.S
index 27b5da2111ac..119ebdc3d362 100644
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk.S
@@ -4,43 +4,10 @@
* disturbance of register allocation in some inline assembly constructs.
* Copyright 2001,2002 by Andi Kleen, SuSE Labs.
*/
+#include <linux/export.h>
#include <linux/linkage.h>
#include "calling.h"
#include <asm/asm.h>
-#include <asm/export.h>
-
- /* rdi: arg1 ... normal C conventions. rax is saved/restored. */
- .macro THUNK name, func
-SYM_FUNC_START(\name)
- pushq %rbp
- movq %rsp, %rbp
-
- pushq %rdi
- pushq %rsi
- pushq %rdx
- pushq %rcx
- pushq %rax
- pushq %r8
- pushq %r9
- pushq %r10
- pushq %r11
-
- call \func
-
- popq %r11
- popq %r10
- popq %r9
- popq %r8
- popq %rax
- popq %rcx
- popq %rdx
- popq %rsi
- popq %rdi
- popq %rbp
- RET
-SYM_FUNC_END(\name)
- _ASM_NOKPROBE(\name)
- .endm
THUNK preempt_schedule_thunk, preempt_schedule
THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace
diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S
deleted file mode 100644
index ff6e7003da97..000000000000
--- a/arch/x86/entry/thunk_32.S
+++ /dev/null
@@ -1,36 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash)
- * Copyright 2008 by Steven Rostedt, Red Hat, Inc
- * (inspired by Andi Kleen's thunk_64.S)
- */
- #include <linux/linkage.h>
- #include <asm/asm.h>
- #include <asm/export.h>
-
- /* put return address in eax (arg1) */
- .macro THUNK name, func, put_ret_addr_in_eax=0
-SYM_CODE_START_NOALIGN(\name)
- pushl %eax
- pushl %ecx
- pushl %edx
-
- .if \put_ret_addr_in_eax
- /* Place EIP in the arg1 */
- movl 3*4(%esp), %eax
- .endif
-
- call \func
- popl %edx
- popl %ecx
- popl %eax
- RET
- _ASM_NOKPROBE(\name)
-SYM_CODE_END(\name)
- .endm
-
- THUNK preempt_schedule_thunk, preempt_schedule
- THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace
- EXPORT_SYMBOL(preempt_schedule_thunk)
- EXPORT_SYMBOL(preempt_schedule_notrace_thunk)
-
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 6a1821bd7d5e..f247f5f5cb44 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -3,66 +3,36 @@
# Building vDSO images for x86.
#
-# Include the generic Makefile to check the built vdso.
-include $(srctree)/lib/vdso/Makefile
+# Include the generic Makefile to check the built vDSO:
+include $(srctree)/lib/vdso/Makefile.include
-# Sanitizer runtimes are unavailable and cannot be linked here.
-KASAN_SANITIZE := n
-KMSAN_SANITIZE_vclock_gettime.o := n
-KMSAN_SANITIZE_vgetcpu.o := n
-
-UBSAN_SANITIZE := n
-KCSAN_SANITIZE := n
-OBJECT_FILES_NON_STANDARD := y
-
-# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
-KCOV_INSTRUMENT := n
-
-VDSO64-$(CONFIG_X86_64) := y
-VDSOX32-$(CONFIG_X86_X32_ABI) := y
-VDSO32-$(CONFIG_X86_32) := y
-VDSO32-$(CONFIG_IA32_EMULATION) := y
-
-# files to link into the vdso
-vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
+# Files to link into the vDSO:
+vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vgetrandom.o vgetrandom-chacha.o
vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
vobjs32-y += vdso32/vclock_gettime.o vdso32/vgetcpu.o
vobjs-$(CONFIG_X86_SGX) += vsgx.o
-# files to link into kernel
-obj-y += vma.o extable.o
-KASAN_SANITIZE_vma.o := y
-UBSAN_SANITIZE_vma.o := y
-KCSAN_SANITIZE_vma.o := y
-OBJECT_FILES_NON_STANDARD_vma.o := n
-OBJECT_FILES_NON_STANDARD_extable.o := n
+# Files to link into the kernel:
+obj-y += vma.o extable.o
-# vDSO images to build
-vdso_img-$(VDSO64-y) += 64
-vdso_img-$(VDSOX32-y) += x32
-vdso_img-$(VDSO32-y) += 32
+# vDSO images to build:
+obj-$(CONFIG_X86_64) += vdso-image-64.o
+obj-$(CONFIG_X86_X32_ABI) += vdso-image-x32.o
+obj-$(CONFIG_COMPAT_32) += vdso-image-32.o vdso32-setup.o
-obj-$(VDSO32-y) += vdso32-setup.o
-
-vobjs := $(foreach F,$(vobjs-y),$(obj)/$F)
-vobjs32 := $(foreach F,$(vobjs32-y),$(obj)/$F)
+vobjs := $(addprefix $(obj)/, $(vobjs-y))
+vobjs32 := $(addprefix $(obj)/, $(vobjs32-y))
$(obj)/vdso.o: $(obj)/vdso.so
targets += vdso.lds $(vobjs-y)
targets += vdso32/vdso32.lds $(vobjs32-y)
-# Build the vDSO image C files and link them in.
-vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o)
-vdso_img_cfiles := $(vdso_img-y:%=vdso-image-%.c)
-vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg)
-obj-y += $(vdso_img_objs)
-targets += $(vdso_img_cfiles)
-targets += $(vdso_img_sodbg) $(vdso_img-y:%=vdso%.so)
+targets += $(foreach x, 64 x32 32, vdso-image-$(x).c vdso$(x).so vdso$(x).so.dbg)
CPPFLAGS_vdso.lds += -P -C
-VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -soname linux-vdso.so.1 --no-undefined \
+VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -soname linux-vdso.so.1 \
-z max-page-size=4096
$(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE
@@ -86,13 +56,13 @@ CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
-fno-omit-frame-pointer -foptimize-sibling-calls \
-DDISABLE_BRANCH_PROFILING -DBUILD_VDSO
-ifdef CONFIG_RETPOLINE
+ifdef CONFIG_MITIGATION_RETPOLINE
ifneq ($(RETPOLINE_VDSO_CFLAGS),)
CFL += $(RETPOLINE_VDSO_CFLAGS)
endif
endif
-$(vobjs): KBUILD_CFLAGS := $(filter-out $(PADDING_CFLAGS) $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
+$(vobjs): KBUILD_CFLAGS := $(filter-out $(PADDING_CFLAGS) $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(KSTACK_ERASE_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
$(vobjs): KBUILD_AFLAGS += -DBUILD_VDSO
#
@@ -103,6 +73,7 @@ CFLAGS_REMOVE_vdso32/vclock_gettime.o = -pg
CFLAGS_REMOVE_vgetcpu.o = -pg
CFLAGS_REMOVE_vdso32/vgetcpu.o = -pg
CFLAGS_REMOVE_vsgx.o = -pg
+CFLAGS_REMOVE_vgetrandom.o = -pg
#
# X32 processes use x32 vDSO to access 64bit kernel data.
@@ -122,7 +93,7 @@ VDSO_LDFLAGS_vdsox32.lds = -m elf32_x86_64 -soname linux-vdso.so.1 \
vobjx32s-y := $(vobjs-y:.o=-x32.o)
# same thing, but in the output directory
-vobjx32s := $(foreach F,$(vobjx32s-y),$(obj)/$F)
+vobjx32s := $(addprefix $(obj)/, $(vobjx32s-y))
# Convert 64bit object file to x32 for x32 vDSO.
quiet_cmd_x32 = X32 $@
@@ -152,6 +123,7 @@ KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(RANDSTRUCT_CFLAGS),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(KSTACK_ERASE_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS_32))
@@ -162,8 +134,9 @@ KBUILD_CFLAGS_32 += -fno-stack-protector
KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
KBUILD_CFLAGS_32 += -fno-omit-frame-pointer
KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING
+KBUILD_CFLAGS_32 += -DBUILD_VDSO
-ifdef CONFIG_RETPOLINE
+ifdef CONFIG_MITIGATION_RETPOLINE
ifneq ($(RETPOLINE_VDSO_CFLAGS),)
KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS)
endif
@@ -180,41 +153,10 @@ $(obj)/vdso32.so.dbg: $(obj)/vdso32/vdso32.lds $(vobjs32) FORCE
quiet_cmd_vdso = VDSO $@
cmd_vdso = $(LD) -o $@ \
$(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
- -T $(filter %.lds,$^) $(filter %.o,$^) && \
- sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
+ -T $(filter %.lds,$^) $(filter %.o,$^)
-VDSO_LDFLAGS = -shared --hash-style=both --build-id=sha1 \
+VDSO_LDFLAGS = -shared --hash-style=both --build-id=sha1 --no-undefined \
$(call ld-option, --eh-frame-hdr) -Bsymbolic -z noexecstack
-GCOV_PROFILE := n
quiet_cmd_vdso_and_check = VDSO $@
cmd_vdso_and_check = $(cmd_vdso); $(cmd_vdso_check)
-
-#
-# Install the unstripped copies of vdso*.so. If our toolchain supports
-# build-id, install .build-id links as well.
-#
-quiet_cmd_vdso_install = INSTALL $(@:install_%=%)
-define cmd_vdso_install
- cp $< "$(MODLIB)/vdso/$(@:install_%=%)"; \
- if readelf -n $< |grep -q 'Build ID'; then \
- buildid=`readelf -n $< |grep 'Build ID' |sed -e 's/^.*Build ID: \(.*\)$$/\1/'`; \
- first=`echo $$buildid | cut -b-2`; \
- last=`echo $$buildid | cut -b3-`; \
- mkdir -p "$(MODLIB)/vdso/.build-id/$$first"; \
- ln -sf "../../$(@:install_%=%)" "$(MODLIB)/vdso/.build-id/$$first/$$last.debug"; \
- fi
-endef
-
-vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%)
-
-$(MODLIB)/vdso: FORCE
- @mkdir -p $(MODLIB)/vdso
-
-$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso
- $(call cmd,vdso_install)
-
-PHONY += vdso_install $(vdso_img_insttargets)
-vdso_install: $(vdso_img_insttargets)
-
-clean-files := vdso32.so vdso32.so.dbg vdso64* vdso-image-*.c vdsox32.so*
diff --git a/arch/x86/entry/vdso/checkundef.sh b/arch/x86/entry/vdso/checkundef.sh
deleted file mode 100755
index 7ee90a9b549d..000000000000
--- a/arch/x86/entry/vdso/checkundef.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/sh
-nm="$1"
-file="$2"
-$nm "$file" | grep '^ *U' > /dev/null 2>&1
-if [ $? -eq 1 ]; then
- exit 0
-else
- echo "$file: undefined symbols found" >&2
- exit 1
-fi
diff --git a/arch/x86/entry/vdso/extable.h b/arch/x86/entry/vdso/extable.h
index b56f6b012941..baba612b832c 100644
--- a/arch/x86/entry/vdso/extable.h
+++ b/arch/x86/entry/vdso/extable.h
@@ -7,7 +7,7 @@
* vDSO uses a dedicated handler the addresses are relative to the overall
* exception table, not each individual entry.
*/
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#define _ASM_VDSO_EXTABLE_HANDLE(from, to) \
ASM_VDSO_EXTABLE_HANDLE from to
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 7d70935b6758..0debc194bd78 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -11,12 +11,10 @@
#include <linux/time.h>
#include <linux/kernel.h>
#include <linux/types.h>
+#include <vdso/gettime.h>
#include "../../../../lib/vdso/gettimeofday.c"
-extern int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz);
-extern __kernel_old_time_t __vdso_time(__kernel_old_time_t *t);
-
int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz)
{
return __cvdso_gettimeofday(tv, tz);
@@ -35,9 +33,6 @@ __kernel_old_time_t time(__kernel_old_time_t *t) __attribute__((weak, alias("__v
#if defined(CONFIG_X86_64) && !defined(BUILD_VDSO32_64)
/* both 64-bit and x32 use these */
-extern int __vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts);
-extern int __vdso_clock_getres(clockid_t clock, struct __kernel_timespec *res);
-
int __vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts)
{
return __cvdso_clock_gettime(clock, ts);
@@ -56,9 +51,6 @@ int clock_getres(clockid_t, struct __kernel_timespec *)
#else
/* i386 only */
-extern int __vdso_clock_gettime(clockid_t clock, struct old_timespec32 *ts);
-extern int __vdso_clock_getres(clockid_t clock, struct old_timespec32 *res);
-
int __vdso_clock_gettime(clockid_t clock, struct old_timespec32 *ts)
{
return __cvdso_clock_gettime32(clock, ts);
diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S
index bafa73f09e92..ec1ac191a057 100644
--- a/arch/x86/entry/vdso/vdso-layout.lds.S
+++ b/arch/x86/entry/vdso/vdso-layout.lds.S
@@ -1,5 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/vdso.h>
+#include <asm/vdso/vsyscall.h>
+#include <vdso/datapage.h>
/*
* Linker script for vDSO. This is an ELF shared object prelinked to
@@ -16,23 +18,11 @@ SECTIONS
* segment.
*/
- vvar_start = . - 4 * PAGE_SIZE;
- vvar_page = vvar_start;
+ VDSO_VVAR_SYMS
- /* Place all vvars at the offsets in asm/vvar.h. */
-#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset;
-#include <asm/vvar.h>
-#undef EMIT_VVAR
-
- pvclock_page = vvar_start + PAGE_SIZE;
- hvclock_page = vvar_start + 2 * PAGE_SIZE;
- timens_page = vvar_start + 3 * PAGE_SIZE;
-
-#undef _ASM_X86_VVAR_H
- /* Place all vvars in timens too at the offsets in asm/vvar.h. */
-#define EMIT_VVAR(name, offset) timens_ ## name = timens_page + offset;
-#include <asm/vvar.h>
-#undef EMIT_VVAR
+ vclock_pages = VDSO_VCLOCK_PAGES_START(vdso_u_data);
+ pvclock_page = vclock_pages + VDSO_PAGE_PVCLOCK_OFFSET * PAGE_SIZE;
+ hvclock_page = vclock_pages + VDSO_PAGE_HVCLOCK_OFFSET * PAGE_SIZE;
. = SIZEOF_HEADERS;
diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
index e8c60ae7a7c8..0bab5f4af6d1 100644
--- a/arch/x86/entry/vdso/vdso.lds.S
+++ b/arch/x86/entry/vdso/vdso.lds.S
@@ -30,6 +30,8 @@ VERSION {
#ifdef CONFIG_X86_SGX
__vdso_sgx_enter_enclave;
#endif
+ getrandom;
+ __vdso_getrandom;
local: *;
};
}
diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
index 90d15f2a7205..f84e8f8fa5fe 100644
--- a/arch/x86/entry/vdso/vdso2c.c
+++ b/arch/x86/entry/vdso/vdso2c.c
@@ -69,33 +69,12 @@
const char *outfilename;
-/* Symbols that we need in vdso2c. */
-enum {
- sym_vvar_start,
- sym_vvar_page,
- sym_pvclock_page,
- sym_hvclock_page,
- sym_timens_page,
-};
-
-const int special_pages[] = {
- sym_vvar_page,
- sym_pvclock_page,
- sym_hvclock_page,
- sym_timens_page,
-};
-
struct vdso_sym {
const char *name;
bool export;
};
struct vdso_sym required_syms[] = {
- [sym_vvar_start] = {"vvar_start", true},
- [sym_vvar_page] = {"vvar_page", true},
- [sym_pvclock_page] = {"pvclock_page", true},
- [sym_hvclock_page] = {"hvclock_page", true},
- [sym_timens_page] = {"timens_page", true},
{"VDSO32_NOTE_MASK", true},
{"__kernel_vsyscall", true},
{"__kernel_sigreturn", true},
diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h
index 67b3e37576a6..78ed1c1f28b9 100644
--- a/arch/x86/entry/vdso/vdso2c.h
+++ b/arch/x86/entry/vdso/vdso2c.h
@@ -150,26 +150,6 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
}
}
- /* Validate mapping addresses. */
- for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) {
- INT_BITS symval = syms[special_pages[i]];
-
- if (!symval)
- continue; /* The mapping isn't used; ignore it. */
-
- if (symval % 4096)
- fail("%s must be a multiple of 4096\n",
- required_syms[i].name);
- if (symval + 4096 < syms[sym_vvar_start])
- fail("%s underruns vvar_start\n",
- required_syms[i].name);
- if (symval + 4096 > 0)
- fail("%s is on the wrong side of the vdso text\n",
- required_syms[i].name);
- }
- if (syms[sym_vvar_start] % 4096)
- fail("vvar_begin must be a multiple of 4096\n");
-
if (!image_name) {
fwrite(stripped_addr, stripped_len, 1, outfile);
return;
diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c
index f3b3cacbcbb0..8894013eea1d 100644
--- a/arch/x86/entry/vdso/vdso32-setup.c
+++ b/arch/x86/entry/vdso/vdso32-setup.c
@@ -51,15 +51,17 @@ __setup("vdso32=", vdso32_setup);
__setup_param("vdso=", vdso_setup, vdso32_setup, 0);
#endif
-#ifdef CONFIG_X86_64
#ifdef CONFIG_SYSCTL
-/* Register vsyscall32 into the ABI table */
#include <linux/sysctl.h>
-static struct ctl_table abi_table2[] = {
+static const struct ctl_table vdso_table[] = {
{
+#ifdef CONFIG_X86_64
.procname = "vsyscall32",
+#else
+ .procname = "vdso_enabled",
+#endif
.data = &vdso32_enabled,
.maxlen = sizeof(int),
.mode = 0644,
@@ -67,15 +69,18 @@ static struct ctl_table abi_table2[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
- {}
};
static __init int ia32_binfmt_init(void)
{
- register_sysctl("abi", abi_table2);
+#ifdef CONFIG_X86_64
+ /* Register vsyscall32 into the ABI table */
+ register_sysctl("abi", vdso_table);
+#else
+ register_sysctl_init("vm", vdso_table);
+#endif
return 0;
}
__initcall(ia32_binfmt_init);
#endif /* CONFIG_SYSCTL */
-#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/entry/vdso/vgetrandom-chacha.S b/arch/x86/entry/vdso/vgetrandom-chacha.S
new file mode 100644
index 000000000000..bcba5639b8ee
--- /dev/null
+++ b/arch/x86/entry/vdso/vgetrandom-chacha.S
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+.section .rodata, "a"
+.align 16
+CONSTANTS: .octa 0x6b20657479622d323320646e61707865
+.text
+
+/*
+ * Very basic SSE2 implementation of ChaCha20. Produces a given positive number
+ * of blocks of output with a nonce of 0, taking an input key and 8-byte
+ * counter. Importantly does not spill to the stack. Its arguments are:
+ *
+ * rdi: output bytes
+ * rsi: 32-byte key input
+ * rdx: 8-byte counter input/output
+ * rcx: number of 64-byte blocks to write to output
+ */
+SYM_FUNC_START(__arch_chacha20_blocks_nostack)
+
+.set output, %rdi
+.set key, %rsi
+.set counter, %rdx
+.set nblocks, %rcx
+.set i, %al
+/* xmm registers are *not* callee-save. */
+.set temp, %xmm0
+.set state0, %xmm1
+.set state1, %xmm2
+.set state2, %xmm3
+.set state3, %xmm4
+.set copy0, %xmm5
+.set copy1, %xmm6
+.set copy2, %xmm7
+.set copy3, %xmm8
+.set one, %xmm9
+
+ /* copy0 = "expand 32-byte k" */
+ movaps CONSTANTS(%rip),copy0
+ /* copy1,copy2 = key */
+ movups 0x00(key),copy1
+ movups 0x10(key),copy2
+ /* copy3 = counter || zero nonce */
+ movq 0x00(counter),copy3
+ /* one = 1 || 0 */
+ movq $1,%rax
+ movq %rax,one
+
+.Lblock:
+ /* state0,state1,state2,state3 = copy0,copy1,copy2,copy3 */
+ movdqa copy0,state0
+ movdqa copy1,state1
+ movdqa copy2,state2
+ movdqa copy3,state3
+
+ movb $10,i
+.Lpermute:
+ /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
+ paddd state1,state0
+ pxor state0,state3
+ movdqa state3,temp
+ pslld $16,temp
+ psrld $16,state3
+ por temp,state3
+
+ /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
+ paddd state3,state2
+ pxor state2,state1
+ movdqa state1,temp
+ pslld $12,temp
+ psrld $20,state1
+ por temp,state1
+
+ /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
+ paddd state1,state0
+ pxor state0,state3
+ movdqa state3,temp
+ pslld $8,temp
+ psrld $24,state3
+ por temp,state3
+
+ /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
+ paddd state3,state2
+ pxor state2,state1
+ movdqa state1,temp
+ pslld $7,temp
+ psrld $25,state1
+ por temp,state1
+
+ /* state1[0,1,2,3] = state1[1,2,3,0] */
+ pshufd $0x39,state1,state1
+ /* state2[0,1,2,3] = state2[2,3,0,1] */
+ pshufd $0x4e,state2,state2
+ /* state3[0,1,2,3] = state3[3,0,1,2] */
+ pshufd $0x93,state3,state3
+
+ /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
+ paddd state1,state0
+ pxor state0,state3
+ movdqa state3,temp
+ pslld $16,temp
+ psrld $16,state3
+ por temp,state3
+
+ /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
+ paddd state3,state2
+ pxor state2,state1
+ movdqa state1,temp
+ pslld $12,temp
+ psrld $20,state1
+ por temp,state1
+
+ /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
+ paddd state1,state0
+ pxor state0,state3
+ movdqa state3,temp
+ pslld $8,temp
+ psrld $24,state3
+ por temp,state3
+
+ /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
+ paddd state3,state2
+ pxor state2,state1
+ movdqa state1,temp
+ pslld $7,temp
+ psrld $25,state1
+ por temp,state1
+
+ /* state1[0,1,2,3] = state1[3,0,1,2] */
+ pshufd $0x93,state1,state1
+ /* state2[0,1,2,3] = state2[2,3,0,1] */
+ pshufd $0x4e,state2,state2
+ /* state3[0,1,2,3] = state3[1,2,3,0] */
+ pshufd $0x39,state3,state3
+
+ decb i
+ jnz .Lpermute
+
+ /* output0 = state0 + copy0 */
+ paddd copy0,state0
+ movups state0,0x00(output)
+ /* output1 = state1 + copy1 */
+ paddd copy1,state1
+ movups state1,0x10(output)
+ /* output2 = state2 + copy2 */
+ paddd copy2,state2
+ movups state2,0x20(output)
+ /* output3 = state3 + copy3 */
+ paddd copy3,state3
+ movups state3,0x30(output)
+
+ /* ++copy3.counter */
+ paddq one,copy3
+
+ /* output += 64, --nblocks */
+ addq $64,output
+ decq nblocks
+ jnz .Lblock
+
+ /* counter = copy3.counter */
+ movq copy3,0x00(counter)
+
+ /* Zero out the potentially sensitive regs, in case nothing uses these again. */
+ pxor state0,state0
+ pxor state1,state1
+ pxor state2,state2
+ pxor state3,state3
+ pxor copy1,copy1
+ pxor copy2,copy2
+ pxor temp,temp
+
+ ret
+SYM_FUNC_END(__arch_chacha20_blocks_nostack)
diff --git a/arch/x86/entry/vdso/vgetrandom.c b/arch/x86/entry/vdso/vgetrandom.c
new file mode 100644
index 000000000000..430862b8977c
--- /dev/null
+++ b/arch/x86/entry/vdso/vgetrandom.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+#include <linux/types.h>
+
+#include "../../../../lib/vdso/getrandom.c"
+
+ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len)
+{
+ return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len);
+}
+
+ssize_t getrandom(void *, size_t, unsigned int, void *, size_t)
+ __attribute__((weak, alias("__vdso_getrandom")));
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 7645730dc228..afe105b2f907 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -14,29 +14,20 @@
#include <linux/elf.h>
#include <linux/cpu.h>
#include <linux/ptrace.h>
-#include <linux/time_namespace.h>
+#include <linux/vdso_datastore.h>
#include <asm/pvclock.h>
#include <asm/vgtod.h>
#include <asm/proto.h>
#include <asm/vdso.h>
-#include <asm/vvar.h>
#include <asm/tlb.h>
#include <asm/page.h>
#include <asm/desc.h>
#include <asm/cpufeature.h>
+#include <asm/vdso/vsyscall.h>
#include <clocksource/hyperv_timer.h>
-#undef _ASM_X86_VVAR_H
-#define EMIT_VVAR(name, offset) \
- const size_t name ## _offset = offset;
-#include <asm/vvar.h>
-
-struct vdso_data *arch_get_vdso_data(void *vvar_page)
-{
- return (struct vdso_data *)(vvar_page + _vdso_data_offset);
-}
-#undef EMIT_VVAR
+static_assert(VDSO_NR_PAGES + VDSO_NR_VCLOCK_PAGES == __VDSO_PAGES);
unsigned int vclocks_used __read_mostly;
@@ -56,7 +47,6 @@ int __init init_vdso_image(const struct vdso_image *image)
return 0;
}
-static const struct vm_special_mapping vvar_mapping;
struct linux_binprm;
static vm_fault_t vdso_fault(const struct vm_special_mapping *sm,
@@ -75,7 +65,6 @@ static vm_fault_t vdso_fault(const struct vm_special_mapping *sm,
static void vdso_fix_landing(const struct vdso_image *image,
struct vm_area_struct *new_vma)
{
-#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
if (in_ia32_syscall() && image == &vdso_image_32) {
struct pt_regs *regs = current_pt_regs();
unsigned long vdso_land = image->sym_int80_landing_pad;
@@ -86,7 +75,6 @@ static void vdso_fix_landing(const struct vdso_image *image,
if (regs->ip == old_land_addr)
regs->ip = new_vma->vm_start + vdso_land;
}
-#endif
}
static int vdso_mremap(const struct vm_special_mapping *sm,
@@ -100,106 +88,32 @@ static int vdso_mremap(const struct vm_special_mapping *sm,
return 0;
}
-#ifdef CONFIG_TIME_NS
-/*
- * The vvar page layout depends on whether a task belongs to the root or
- * non-root time namespace. Whenever a task changes its namespace, the VVAR
- * page tables are cleared and then they will re-faulted with a
- * corresponding layout.
- * See also the comment near timens_setup_vdso_data() for details.
- */
-int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
-{
- struct mm_struct *mm = task->mm;
- struct vm_area_struct *vma;
- VMA_ITERATOR(vmi, mm, 0);
-
- mmap_read_lock(mm);
- for_each_vma(vmi, vma) {
- if (vma_is_special_mapping(vma, &vvar_mapping))
- zap_vma_pages(vma);
- }
- mmap_read_unlock(mm);
-
- return 0;
-}
-#endif
-
-static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
- struct vm_area_struct *vma, struct vm_fault *vmf)
+static vm_fault_t vvar_vclock_fault(const struct vm_special_mapping *sm,
+ struct vm_area_struct *vma, struct vm_fault *vmf)
{
- const struct vdso_image *image = vma->vm_mm->context.vdso_image;
- unsigned long pfn;
- long sym_offset;
-
- if (!image)
- return VM_FAULT_SIGBUS;
-
- sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) +
- image->sym_vvar_start;
-
- /*
- * Sanity check: a symbol offset of zero means that the page
- * does not exist for this vdso image, not that the page is at
- * offset zero relative to the text mapping. This should be
- * impossible here, because sym_offset should only be zero for
- * the page past the end of the vvar mapping.
- */
- if (sym_offset == 0)
- return VM_FAULT_SIGBUS;
-
- if (sym_offset == image->sym_vvar_page) {
- struct page *timens_page = find_timens_vvar_page(vma);
-
- pfn = __pa_symbol(&__vvar_page) >> PAGE_SHIFT;
-
- /*
- * If a task belongs to a time namespace then a namespace
- * specific VVAR is mapped with the sym_vvar_page offset and
- * the real VVAR page is mapped with the sym_timens_page
- * offset.
- * See also the comment near timens_setup_vdso_data().
- */
- if (timens_page) {
- unsigned long addr;
- vm_fault_t err;
-
- /*
- * Optimization: inside time namespace pre-fault
- * VVAR page too. As on timens page there are only
- * offsets for clocks on VVAR, it'll be faulted
- * shortly by VDSO code.
- */
- addr = vmf->address + (image->sym_timens_page - sym_offset);
- err = vmf_insert_pfn(vma, addr, pfn);
- if (unlikely(err & VM_FAULT_ERROR))
- return err;
-
- pfn = page_to_pfn(timens_page);
- }
-
- return vmf_insert_pfn(vma, vmf->address, pfn);
- } else if (sym_offset == image->sym_pvclock_page) {
+ switch (vmf->pgoff) {
+#ifdef CONFIG_PARAVIRT_CLOCK
+ case VDSO_PAGE_PVCLOCK_OFFSET:
+ {
struct pvclock_vsyscall_time_info *pvti =
pvclock_get_pvti_cpu0_va();
- if (pvti && vclock_was_used(VDSO_CLOCKMODE_PVCLOCK)) {
+
+ if (pvti && vclock_was_used(VDSO_CLOCKMODE_PVCLOCK))
return vmf_insert_pfn_prot(vma, vmf->address,
__pa(pvti) >> PAGE_SHIFT,
pgprot_decrypted(vma->vm_page_prot));
- }
- } else if (sym_offset == image->sym_hvclock_page) {
- pfn = hv_get_tsc_pfn();
-
+ break;
+ }
+#endif /* CONFIG_PARAVIRT_CLOCK */
+#ifdef CONFIG_HYPERV_TIMER
+ case VDSO_PAGE_HVCLOCK_OFFSET:
+ {
+ unsigned long pfn = hv_get_tsc_pfn();
if (pfn && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK))
return vmf_insert_pfn(vma, vmf->address, pfn);
- } else if (sym_offset == image->sym_timens_page) {
- struct page *timens_page = find_timens_vvar_page(vma);
-
- if (!timens_page)
- return VM_FAULT_SIGBUS;
-
- pfn = __pa_symbol(&__vvar_page) >> PAGE_SHIFT;
- return vmf_insert_pfn(vma, vmf->address, pfn);
+ break;
+ }
+#endif /* CONFIG_HYPERV_TIMER */
}
return VM_FAULT_SIGBUS;
@@ -210,9 +124,9 @@ static const struct vm_special_mapping vdso_mapping = {
.fault = vdso_fault,
.mremap = vdso_mremap,
};
-static const struct vm_special_mapping vvar_mapping = {
- .name = "[vvar]",
- .fault = vvar_fault,
+static const struct vm_special_mapping vvar_vclock_mapping = {
+ .name = "[vvar_vclock]",
+ .fault = vvar_vclock_fault,
};
/*
@@ -231,13 +145,13 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr)
return -EINTR;
addr = get_unmapped_area(NULL, addr,
- image->size - image->sym_vvar_start, 0, 0);
+ image->size + __VDSO_PAGES * PAGE_SIZE, 0, 0);
if (IS_ERR_VALUE(addr)) {
ret = addr;
goto up_fail;
}
- text_start = addr - image->sym_vvar_start;
+ text_start = addr + __VDSO_PAGES * PAGE_SIZE;
/*
* MAYWRITE to allow gdb to COW and set breakpoints
@@ -246,7 +160,8 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr)
text_start,
image->size,
VM_READ|VM_EXEC|
- VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
+ VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
+ VM_SEALED_SYSMAP,
&vdso_mapping);
if (IS_ERR(vma)) {
@@ -254,79 +169,35 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr)
goto up_fail;
}
+ vma = vdso_install_vvar_mapping(mm, addr);
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ do_munmap(mm, text_start, image->size, NULL);
+ goto up_fail;
+ }
+
vma = _install_special_mapping(mm,
- addr,
- -image->sym_vvar_start,
+ VDSO_VCLOCK_PAGES_START(addr),
+ VDSO_NR_VCLOCK_PAGES * PAGE_SIZE,
VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP|
- VM_PFNMAP,
- &vvar_mapping);
+ VM_PFNMAP|VM_SEALED_SYSMAP,
+ &vvar_vclock_mapping);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
do_munmap(mm, text_start, image->size, NULL);
- } else {
- current->mm->context.vdso = (void __user *)text_start;
- current->mm->context.vdso_image = image;
+ do_munmap(mm, addr, image->size, NULL);
+ goto up_fail;
}
+ current->mm->context.vdso = (void __user *)text_start;
+ current->mm->context.vdso_image = image;
+
up_fail:
mmap_write_unlock(mm);
return ret;
}
-#ifdef CONFIG_X86_64
-/*
- * Put the vdso above the (randomized) stack with another randomized
- * offset. This way there is no hole in the middle of address space.
- * To save memory make sure it is still in the same PTE as the stack
- * top. This doesn't give that many random bits.
- *
- * Note that this algorithm is imperfect: the distribution of the vdso
- * start address within a PMD is biased toward the end.
- *
- * Only used for the 64-bit and x32 vdsos.
- */
-static unsigned long vdso_addr(unsigned long start, unsigned len)
-{
- unsigned long addr, end;
- unsigned offset;
-
- /*
- * Round up the start address. It can start out unaligned as a result
- * of stack start randomization.
- */
- start = PAGE_ALIGN(start);
-
- /* Round the lowest possible end address up to a PMD boundary. */
- end = (start + len + PMD_SIZE - 1) & PMD_MASK;
- if (end >= DEFAULT_MAP_WINDOW)
- end = DEFAULT_MAP_WINDOW;
- end -= len;
-
- if (end > start) {
- offset = get_random_u32_below(((end - start) >> PAGE_SHIFT) + 1);
- addr = start + (offset << PAGE_SHIFT);
- } else {
- addr = start;
- }
-
- /*
- * Forcibly align the final address in case we have a hardware
- * issue that requires alignment for performance reasons.
- */
- addr = align_vdso_addr(addr);
-
- return addr;
-}
-
-static int map_vdso_randomized(const struct vdso_image *image)
-{
- unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start);
-
- return map_vdso(image, addr);
-}
-#endif
-
int map_vdso_once(const struct vdso_image *image, unsigned long addr)
{
struct mm_struct *mm = current->mm;
@@ -343,7 +214,8 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr)
*/
for_each_vma(vmi, vma) {
if (vma_is_special_mapping(vma, &vdso_mapping) ||
- vma_is_special_mapping(vma, &vvar_mapping)) {
+ vma_is_special_mapping(vma, &vdso_vvar_mapping) ||
+ vma_is_special_mapping(vma, &vvar_vclock_mapping)) {
mmap_write_unlock(mm);
return -EEXIST;
}
@@ -353,7 +225,6 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr)
return map_vdso(image, addr);
}
-#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static int load_vdso32(void)
{
if (vdso32_enabled != 1) /* Other values all mean "disabled" */
@@ -361,45 +232,38 @@ static int load_vdso32(void)
return map_vdso(&vdso_image_32, 0);
}
-#endif
-#ifdef CONFIG_X86_64
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
- if (!vdso64_enabled)
- return 0;
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ if (!vdso64_enabled)
+ return 0;
+
+ return map_vdso(&vdso_image_64, 0);
+ }
- return map_vdso_randomized(&vdso_image_64);
+ return load_vdso32();
}
#ifdef CONFIG_COMPAT
int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
int uses_interp, bool x32)
{
-#ifdef CONFIG_X86_X32_ABI
- if (x32) {
+ if (IS_ENABLED(CONFIG_X86_X32_ABI) && x32) {
if (!vdso64_enabled)
return 0;
- return map_vdso_randomized(&vdso_image_x32);
+ return map_vdso(&vdso_image_x32, 0);
}
-#endif
-#ifdef CONFIG_IA32_EMULATION
- return load_vdso32();
-#else
+
+ if (IS_ENABLED(CONFIG_IA32_EMULATION))
+ return load_vdso32();
+
return 0;
-#endif
-}
-#endif
-#else
-int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
-{
- return load_vdso32();
}
#endif
bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs)
{
-#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
const struct vdso_image *image = current->mm->context.vdso_image;
unsigned long vdso = (unsigned long) current->mm->context.vdso;
@@ -408,7 +272,6 @@ bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs)
regs->ip == vdso + image->sym_vdso32_rt_sigreturn_landing_pad)
return true;
}
-#endif
return false;
}
diff --git a/arch/x86/entry/vdso/vsgx.S b/arch/x86/entry/vdso/vsgx.S
index d77d278ee9dd..37a3d4c02366 100644
--- a/arch/x86/entry/vdso/vsgx.S
+++ b/arch/x86/entry/vdso/vsgx.S
@@ -1,7 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/linkage.h>
-#include <asm/export.h>
#include <asm/errno.h>
#include <asm/enclu.h>
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index e0ca8120aea8..6e6c0a740837 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -76,7 +76,7 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
if (!show_unhandled_signals)
return;
- printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
+ printk_ratelimited("%s%s[%d] %s ip:%lx cs:%x sp:%lx ax:%lx si:%lx di:%lx\n",
level, current->comm, task_pid_nr(current),
message, regs->ip, regs->cs,
regs->sp, regs->ax, regs->si, regs->di);
@@ -98,11 +98,6 @@ static int addr_to_vsyscall_nr(unsigned long addr)
static bool write_ok_or_segv(unsigned long ptr, size_t size)
{
- /*
- * XXX: if access_ok, get_user, and put_user handled
- * sig_on_uaccess_err, this could go away.
- */
-
if (!access_ok((void __user *)ptr, size)) {
struct thread_struct *thread = &current->thread;
@@ -120,10 +115,8 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size)
bool emulate_vsyscall(unsigned long error_code,
struct pt_regs *regs, unsigned long address)
{
- struct task_struct *tsk;
unsigned long caller;
int vsyscall_nr, syscall_nr, tmp;
- int prev_sig_on_uaccess_err;
long ret;
unsigned long orig_dx;
@@ -131,7 +124,12 @@ bool emulate_vsyscall(unsigned long error_code,
if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER)
return false;
- if (!(error_code & X86_PF_INSTR)) {
+ /*
+ * Assume that faults at regs->ip are because of an
+ * instruction fetch. Return early and avoid
+ * emulation for faults during data accesses:
+ */
+ if (address != regs->ip) {
/* Failed vsyscall read */
if (vsyscall_mode == EMULATE)
return false;
@@ -144,12 +142,18 @@ bool emulate_vsyscall(unsigned long error_code,
}
/*
+ * X86_PF_INSTR is only set when NX is supported. When
+ * available, use it to double-check that the emulation code
+ * is only being used for instruction fetches:
+ */
+ if (cpu_feature_enabled(X86_FEATURE_NX))
+ WARN_ON_ONCE(!(error_code & X86_PF_INSTR));
+
+ /*
* No point in checking CS -- the only way to get here is a user mode
* trap to a high address, which means that we're in 64-bit user code.
*/
- WARN_ON_ONCE(address != regs->ip);
-
if (vsyscall_mode == NONE) {
warn_bad_vsyscall(KERN_INFO, regs,
"vsyscall attempted with vsyscall=none");
@@ -172,8 +176,6 @@ bool emulate_vsyscall(unsigned long error_code,
goto sigsegv;
}
- tsk = current;
-
/*
* Check for access_ok violations and find the syscall nr.
*
@@ -234,12 +236,8 @@ bool emulate_vsyscall(unsigned long error_code,
goto do_ret; /* skip requested */
/*
- * With a real vsyscall, page faults cause SIGSEGV. We want to
- * preserve that behavior to make writing exploits harder.
+ * With a real vsyscall, page faults cause SIGSEGV.
*/
- prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err;
- current->thread.sig_on_uaccess_err = 1;
-
ret = -EFAULT;
switch (vsyscall_nr) {
case 0:
@@ -262,23 +260,12 @@ bool emulate_vsyscall(unsigned long error_code,
break;
}
- current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err;
-
check_fault:
if (ret == -EFAULT) {
/* Bad news -- userspace fed a bad pointer to a vsyscall. */
warn_bad_vsyscall(KERN_INFO, regs,
"vsyscall fault (exploit attempt?)");
-
- /*
- * If we failed to generate a signal for any reason,
- * generate one here. (This should be impossible.)
- */
- if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
- !sigismember(&tsk->pending.signal, SIGSEGV)))
- goto sigsegv;
-
- return true; /* Don't emulate the ret. */
+ goto sigsegv;
}
regs->ax = ret;
@@ -365,9 +352,7 @@ void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
p4d = p4d_offset(pgd, VSYSCALL_ADDR);
-#if CONFIG_PGTABLE_LEVELS >= 5
set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER));
-#endif
pud = pud_offset(p4d, VSYSCALL_ADDR);
set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER));
pmd = pmd_offset(pud, VSYSCALL_ADDR);